From: Chunseok Lee Date: Mon, 14 Dec 2020 05:43:43 +0000 (+0900) Subject: Imported Upstream version 1.12.0 X-Git-Tag: upstream/1.12.0^0 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=62529acabbafce7730601ed01d5709d7bc0d378a;p=platform%2Fcore%2Fml%2Fnnfw.git Imported Upstream version 1.12.0 --- diff --git a/.clang-format b/.clang-format index 7dcf11c..5699ccf 100644 --- a/.clang-format +++ b/.clang-format @@ -23,16 +23,16 @@ BinPackParameters: true BraceWrapping: AfterClass: true AfterControlStatement: true - AfterEnum: false + AfterEnum: true AfterFunction: true - AfterNamespace: false + AfterNamespace: true AfterObjCDeclaration: false AfterStruct: true - AfterUnion: true + AfterUnion: false BeforeCatch: true BeforeElse: true IndentBraces: false -BreakBeforeBraces: Allman +BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false diff --git a/.clang-format.8 b/.clang-format.8 new file mode 100644 index 0000000..d2db976 --- /dev/null +++ b/.clang-format.8 @@ -0,0 +1,92 @@ +Language: Cpp +BasedOnStyle: Google +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignEscapedNewlinesLeft: true +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: false + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: false +SortUsingDeclarations: false +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 
+SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 2 +UseTab: Never diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays index b71a374..8544bbd 100644 --- a/compiler/bcq-tools/generate_bcq_output_arrays +++ b/compiler/bcq-tools/generate_bcq_output_arrays @@ -112,128 +112,22 @@ def print_bcqinfo_output_arrays_v1(flags): if infoname == "bcqinfo_dequant_weight": has_dequant_weight = True - # Ideal situation is that the user nodes of BCQ applicable constant nodes - # are BCQ applicable operations such as MatMul, GatherV2, etc. - # However, operations which do not change original values such as - # Ideneity or Transpose can exist between them. In view of TensorFlow Lite, - # real user nodes of BCQ applicable constant nodes must be found first. - # This work is done by BFS search with queue. - - prefix_node_dict = {} # key : prefix / value : list of candidates - matmul_node_prefix_dict = {} # key : Name of MatMul node / value : prefix - - queue_prefix = list(prefix_set) - queue_nodename = [queue_prefix[idx] + ":0" for idx in range(len(queue_prefix))] - - while len(queue_prefix) > 0: - prefix = queue_prefix.pop(0) - nodename = queue_nodename.pop(0) - if prefix not in prefix_node_dict.keys(): - prefix_node_dict[prefix] = [] - - # Usually, output name of op is like "outputname:0" - # -2 is for removing ":0" - for op in ops: - if op.type == "MatMul" and (op.inputs[0].name == nodename - or op.inputs[1].name == nodename): - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - matmul_node_prefix_dict[op.outputs[0].name[:-2]] = prefix - elif op.type == "Einsum" and (op.inputs[0].name == nodename - or op.inputs[1].name == nodename): - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - elif op.type == "GatherV2" and op.inputs[0].name == nodename: - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - elif len(op.outputs) == 1: - for i in range(len(op.inputs)): - if op.inputs[i].name == nodename: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - break - - # When TensorFlow model is converted to TensorFlow Lite model, - # more than one operation can be fused as one. - # For example, MatMul + BiasAdd + ReLU in TensorFlow can be fused as - # one FullyConnected in TensorFlow Lite. - # It means that even real user nodes of BCQ applicable constant nodes - # in TensorFlow are found, they may be real user nodes in TensorFlow Lite. - # Therefore additional candidates of real user nodes should be found either. - # Finding additional candidates is done by BFS search with queue. - - fuseop_prefix_dict = {} # key : Candidate operation / Value : prefix - - # These ops can be candidate. However other candidates may exists after these ops. - mark_type = ["Add", "AddV2", "BiasAdd", "Reshape", "Transpose"] - - # These ops can be candidate. And no more candidates will be found after these ops. - mark_and_stop_type = ["Relu", "Relu6", "Tanh"] - - # These ops cannot be candidates but other candidates may exists after these ops. - # NOTE : Some of following ops may be removed from the list but not sure for now. 
- pass_type = [ - "BatchToSpaceND", "Cast", "DepthToSpace", "ExpandDims", "ResizeBilinear", - "ResizeNearestNeighbor", "ScatterNd", "SpaceToBatchND", "SpaceToDepth", "Squeeze", - "Identity", "Pack", "Unpack", "Stack" - ] - - queue_prefix = list(matmul_node_prefix_dict.values()) - queue_nodename = [matmul + ":0" for matmul in matmul_node_prefix_dict.keys()] - - visited_nodes = set(queue_nodename) - while len(queue_prefix) > 0: - prefix = queue_prefix.pop(0) - nodename = queue_nodename.pop(0) - - # Usually, output name of op is like "outputname:0" - # -2 is for removing ":0" - for op in ops: - for i in range(len(op.inputs)): - if nodename == op.inputs[i].name: - if op.type in mark_type: - if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys(): - fuseop_prefix_dict[op.outputs[0].name[:-2]] = set() - fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix) - if op.outputs[0].name not in visited_nodes: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - visited_nodes.add(op.outputs[0].name) - elif op.type in mark_and_stop_type: - if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys(): - fuseop_prefix_dict[op.outputs[0].name[:-2]] = set() - fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix) - elif op.type in pass_type and op.outputs[0].name not in visited_nodes: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - visited_nodes.add(op.outputs[0].name) - # Write the name of metadata node with open(flags.metadata_path, 'w') as f_metadata: f_metadata.write("one_compiler/bcqinfo_one_metadata,") - # Write all pairs of candidate operations and related BCQ information nodes. + # Write all pairs of a constant node and related BCQ information nodes. with open(flags.output_arrays_path, 'w') as f_arrays: for prefix in prefix_set: - for fusable_op in prefix_node_dict[prefix]: - f_arrays.write("," + prefix + "/bcqinfo_do_w_x") - f_arrays.write("," + prefix + "/bcqinfo_alpha") - f_arrays.write("," + prefix + "/bcqinfo_packed_binary_code") - f_arrays.write("," + prefix + "/bcqinfo_number_of_clusters") - f_arrays.write("," + prefix + "/bcqinfo_size_of_clusters") - f_arrays.write("," + prefix + "/bcqinfo_qbits_of_clusters") - f_arrays.write("," + fusable_op) - if has_dequant_weight: - f_arrays.write("," + prefix + "/bcqinfo_dequant_weight") - for fuseop in fuseop_prefix_dict.keys(): - if len(fuseop_prefix_dict[fuseop]) == 1: - prefix = fuseop_prefix_dict[fuseop].pop() - f_arrays.write("," + prefix + "/bcqinfo_do_w_x") - f_arrays.write("," + prefix + "/bcqinfo_alpha") - f_arrays.write("," + prefix + "/bcqinfo_packed_binary_code") - f_arrays.write("," + prefix + "/bcqinfo_number_of_clusters") - f_arrays.write("," + prefix + "/bcqinfo_size_of_clusters") - f_arrays.write("," + prefix + "/bcqinfo_qbits_of_clusters") - f_arrays.write("," + fuseop) - if has_dequant_weight: - f_arrays.write("," + prefix + "/bcqinfo_dequant_weight") + f_arrays.write("," + prefix + "/bcqinfo_do_w_x") + f_arrays.write("," + prefix + "/bcqinfo_alpha") + f_arrays.write("," + prefix + "/bcqinfo_packed_binary_code") + f_arrays.write("," + prefix + "/bcqinfo_number_of_clusters") + f_arrays.write("," + prefix + "/bcqinfo_size_of_clusters") + f_arrays.write("," + prefix + "/bcqinfo_qbits_of_clusters") + f_arrays.write("," + prefix) + if has_dequant_weight: + f_arrays.write("," + prefix + "/bcqinfo_dequant_weight") def print_bcq_output_arrays(flags): diff --git a/compiler/bcq-tools/generate_bcq_output_arrays.py b/compiler/bcq-tools/generate_bcq_output_arrays.py index 0cc1318..5d9fbe6 
100644 --- a/compiler/bcq-tools/generate_bcq_output_arrays.py +++ b/compiler/bcq-tools/generate_bcq_output_arrays.py @@ -81,129 +81,23 @@ def get_bcqinfo_output_arrays_v1(input_path, output_arrays): if infoname == "bcqinfo_dequant_weight": has_dequant_weight = True - # Ideal situation is that the user nodes of BCQ applicable constant nodes - # are BCQ applicable operations such as MatMul, GatherV2, etc. - # However, operations which do not change original values such as - # Ideneity or Transpose can exist between them. In view of TensorFlow Lite, - # real user nodes of BCQ applicable constant nodes must be found first. - # This work is done by BFS search with queue. - - prefix_node_dict = {} # key : prefix / value : list of candidates - matmul_node_prefix_dict = {} # key : Name of MatMul node / value : prefix - - queue_prefix = list(prefix_set) - queue_nodename = [queue_prefix[idx] + ":0" for idx in range(len(queue_prefix))] - - while len(queue_prefix) > 0: - prefix = queue_prefix.pop(0) - nodename = queue_nodename.pop(0) - if prefix not in prefix_node_dict.keys(): - prefix_node_dict[prefix] = [] - - # Usually, output name of op is like "outputname:0" - # -2 is for removing ":0" - for op in ops: - if op.type == "MatMul" and (op.inputs[0].name == nodename - or op.inputs[1].name == nodename): - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - matmul_node_prefix_dict[op.outputs[0].name[:-2]] = prefix - elif op.type == "Einsum" and (op.inputs[0].name == nodename - or op.inputs[1].name == nodename): - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - elif op.type == "GatherV2" and op.inputs[0].name == nodename: - prefix_node_dict[prefix].append(op.outputs[0].name[:-2]) - elif len(op.outputs) == 1: - for i in range(len(op.inputs)): - if op.inputs[i].name == nodename: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - break - - # When TensorFlow model is converted to TensorFlow Lite model, - # more than one operation can be fused as one. - # For example, MatMul + BiasAdd + ReLU in TensorFlow can be fused as - # one FullyConnected in TensorFlow Lite. - # It means that even real user nodes of BCQ applicable constant nodes - # in TensorFlow are found, they may be real user nodes in TensorFlow Lite. - # Therefore additional candidates of real user nodes should be found either. - # Finding additional candidates is done by BFS search with queue. - - fuseop_prefix_dict = {} # key : Candidate operation / Value : prefix - - # These ops can be candidate. However other candidates may exists after these ops. - mark_type = ["Add", "AddV2", "BiasAdd", "Reshape", "Transpose"] - - # These ops can be candidate. And no more candidates will be found after these ops. - mark_and_stop_type = ["Relu", "Relu6", "Tanh"] - - # These ops cannot be candidates but other candidates may exists after these ops. - # NOTE : Some of following ops may be removed from the list but not sure for now. 
- pass_type = [ - "BatchToSpaceND", "Cast", "DepthToSpace", "ExpandDims", "ResizeBilinear", - "ResizeNearestNeighbor", "ScatterNd", "SpaceToBatchND", "SpaceToDepth", "Squeeze", - "Identity", "Pack", "Unpack", "Stack" - ] - - queue_prefix = list(matmul_node_prefix_dict.values()) - queue_nodename = [matmul + ":0" for matmul in matmul_node_prefix_dict.keys()] - - visited_nodes = set(queue_nodename) - while len(queue_prefix) > 0: - prefix = queue_prefix.pop(0) - nodename = queue_nodename.pop(0) - - # Usually, output name of op is like "outputname:0" - # -2 is for removing ":0" - for op in ops: - for i in range(len(op.inputs)): - if nodename == op.inputs[i].name: - if op.type in mark_type: - if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys(): - fuseop_prefix_dict[op.outputs[0].name[:-2]] = set() - fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix) - if op.outputs[0].name not in visited_nodes: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - visited_nodes.add(op.outputs[0].name) - elif op.type in mark_and_stop_type: - if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys(): - fuseop_prefix_dict[op.outputs[0].name[:-2]] = set() - fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix) - elif op.type in pass_type and op.outputs[0].name not in visited_nodes: - queue_prefix.append(prefix) - queue_nodename.append(op.outputs[0].name) - visited_nodes.add(op.outputs[0].name) - # the name of metadata node ret_output_arrays = ['one_compiler/bcqinfo_one_metadata'] # given node from user - ret_output_arrays.append(output_arrays) + ret_output_arrays += output_arrays.split(',') - # all pairs of candidate operations and related BCQ information nodes + # all pairs of a constant node and related BCQ information nodes. for prefix in prefix_set: - for fusable_op in prefix_node_dict[prefix]: - ret_output_arrays.append(prefix + '/bcqinfo_do_w_x') - ret_output_arrays.append(prefix + '/bcqinfo_alpha') - ret_output_arrays.append(prefix + '/bcqinfo_packed_binary_code') - ret_output_arrays.append(prefix + '/bcqinfo_number_of_clusters') - ret_output_arrays.append(prefix + '/bcqinfo_size_of_clusters') - ret_output_arrays.append(prefix + '/bcqinfo_qbits_of_clusters') - ret_output_arrays.append(fusable_op) - if has_dequant_weight: - ret_output_arrays.append(prefix + '/bcqinfo_dequant_weight') - for fuseop in fuseop_prefix_dict.keys(): - if len(fuseop_prefix_dict[fuseop]) == 1: - prefix = fuseop_prefix_dict[fuseop].pop() - ret_output_arrays.append(prefix + '/bcqinfo_do_w_x') - ret_output_arrays.append(prefix + '/bcqinfo_alpha') - ret_output_arrays.append(prefix + '/bcqinfo_packed_binary_code') - ret_output_arrays.append(prefix + '/bcqinfo_number_of_clusters') - ret_output_arrays.append(prefix + '/bcqinfo_size_of_clusters') - ret_output_arrays.append(prefix + '/bcqinfo_qbits_of_clusters') - ret_output_arrays.append(fuseop) - if has_dequant_weight: - ret_output_arrays.append(prefix + '/bcqinfo_dequant_weight') + ret_output_arrays.append(prefix + '/bcqinfo_do_w_x') + ret_output_arrays.append(prefix + '/bcqinfo_alpha') + ret_output_arrays.append(prefix + '/bcqinfo_packed_binary_code') + ret_output_arrays.append(prefix + '/bcqinfo_number_of_clusters') + ret_output_arrays.append(prefix + '/bcqinfo_size_of_clusters') + ret_output_arrays.append(prefix + '/bcqinfo_qbits_of_clusters') + ret_output_arrays.append(prefix) + if has_dequant_weight: + ret_output_arrays.append(prefix + '/bcqinfo_dequant_weight') return ret_output_arrays @@ -216,7 +110,7 @@ def get_bcq_output_arrays(input_path, 
output_arrays): if model_version == 1: return get_bcqinfo_output_arrays_v1(input_path, output_arrays) elif model_version == -1: - return None + return output_arrays.split(',') else: err_msg = "BCQ version of the model(v{}) ".format(model_version) err_msg += "is higher than " diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp index 20e3ea9..cde5de8 100644 --- a/compiler/circle2circle/src/Circle2Circle.cpp +++ b/compiler/circle2circle/src/Circle2Circle.cpp @@ -110,6 +110,18 @@ int entry(int argc, char **argv) .default_value(false) .help("This will fuse BatchNorm operators of pre-activations to Convolution operator"); + arser.add_argument("--remove_redundant_transpose") + .nargs(0) + .required(false) + .default_value(false) + .help("This will fuse or remove subsequent Transpose operators"); + + arser.add_argument("--replace_cw_mul_add_with_depthwise_conv") + .nargs(0) + .required(false) + .default_value(false) + .help("This will replace channel-wise mul/add with DepthwiseConv2D operator"); + arser.add_argument("--resolve_customop_add") .nargs(0) .required(false) @@ -128,6 +140,19 @@ int entry(int argc, char **argv) .default_value(false) .help("This will convert Custom(Matmul) to Matmul operator"); + arser.add_argument("--shuffle_weight_to_16x1float32") + .nargs(0) + .required(false) + .default_value(false) + .help("This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32. Note that " + "it only converts weights whose row is a multiple of 16"); + + arser.add_argument("--substitute_pack_to_reshape") + .nargs(0) + .required(false) + .default_value(false) + .help("This will convert single input Pack to Reshape"); + arser.add_argument("--mute_warnings") .nargs(0) .required(false) @@ -196,6 +221,8 @@ int entry(int argc, char **argv) options->enable(Algorithms::ResolveCustomOpAdd); options->enable(Algorithms::ResolveCustomOpBatchMatMul); options->enable(Algorithms::ResolveCustomOpMatMul); + options->enable(Algorithms::RemoveRedundantTranspose); + options->enable(Algorithms::SubstitutePackToReshape); } if (arser.get("--fold_dequantize")) options->enable(Algorithms::FoldDequantize); @@ -213,12 +240,20 @@ int entry(int argc, char **argv) options->enable(Algorithms::MakeBatchNormGammaPositive); if (arser.get("--fuse_preactivation_batchnorm")) options->enable(Algorithms::FusePreActivationBatchNorm); + if (arser.get("--remove_redundant_transpose")) + options->enable(Algorithms::RemoveRedundantTranspose); + if (arser.get("--replace_cw_mul_add_with_depthwise_conv")) + options->enable(Algorithms::ReplaceMulAddWithDepthwiseConv); if (arser.get("--resolve_customop_add")) options->enable(Algorithms::ResolveCustomOpAdd); if (arser.get("--resolve_customop_batchmatmul")) options->enable(Algorithms::ResolveCustomOpBatchMatMul); if (arser.get("--resolve_customop_matmul")) options->enable(Algorithms::ResolveCustomOpMatMul); + if (arser.get("--shuffle_weight_to_16x1float32")) + options->enable(Algorithms::ShuffleWeightTo16x1Float32); + if (arser.get("--substitute_pack_to_reshape")) + options->enable(Algorithms::SubstitutePackToReshape); if (arser.get("--mute_warnings")) settings->set(luci::UserSettings::Key::MuteWarnings, true); @@ -281,11 +316,14 @@ int entry(int argc, char **argv) luci::Importer importer; auto module = importer.importModule(circle_model); + // call luci optimizations for module + optimizer.optimize(module.get()); + for (size_t idx = 0; idx < module->size(); ++idx) { auto graph = module->graph(idx); - // call luci optimizations + 
// call luci optimizations for graph optimizer.optimize(graph); optimizer.sparsify(graph); diff --git a/compiler/circlechef/tests/CMakeLists.txt b/compiler/circlechef/tests/CMakeLists.txt index 4dc58ad..773ff54 100644 --- a/compiler/circlechef/tests/CMakeLists.txt +++ b/compiler/circlechef/tests/CMakeLists.txt @@ -26,6 +26,32 @@ foreach(RECIPE IN ITEMS ${RECIPES}) list(APPEND TESTFILES ${RECIPE_OUTPUT_FILE}) endforeach(RECIPE) +# Add local files +file(GLOB RECIPES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*/test.recipe") + +foreach(RECIPE IN ITEMS ${RECIPES}) + get_filename_component(RECIPE_PREFIX ${RECIPE} DIRECTORY) + + set(RECIPE_SOURCE_FILE "${RECIPE_PREFIX}.recipe") + set(RECIPE_OUTPUT_FILE "${RECIPE_PREFIX}.circle") + + # Copy .recipe + add_custom_command(OUTPUT ${RECIPE_SOURCE_FILE} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_CURRENT_SOURCE_DIR}/${RECIPE}" ${RECIPE_SOURCE_FILE} + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RECIPE}" + COMMENT "Generating ${RECIPE_SOURCE_FILE}") + + # Generate .circle + add_custom_command(OUTPUT ${RECIPE_OUTPUT_FILE} + COMMAND circlechef-file ${RECIPE_SOURCE_FILE} ${RECIPE_OUTPUT_FILE} + DEPENDS circlechef-file ${RECIPE_SOURCE_FILE} + COMMENT "Generating ${RECIPE_OUTPUT_FILE}") + + list(APPEND TESTS ${RECIPE_PREFIX}) + list(APPEND TESTFILES ${RECIPE_OUTPUT_FILE}) +endforeach(RECIPE) + #Test circlechef-reverse file(GLOB GEN_CIRCLEFILES RELATIVE ${CIRCLERECIPES_DIR} "${CIRCLERECIPES_DIR}/*/test.reverse") # Note: While in development, circlechef-reverse may not handle the operator. @@ -58,6 +84,31 @@ foreach(CIRCLEFILE IN ITEMS ${GEN_CIRCLEFILES}) list(APPEND TESTFILES ${RECIPE_GEN_OUTPUT_FILE2}) endforeach(CIRCLEFILE) +# Test local circlechef-reverse +file(GLOB GEN_CIRCLEFILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*/test.reverse") + +foreach(CIRCLEFILE IN ITEMS ${GEN_CIRCLEFILES}) + get_filename_component(CIRCLE_PREFIX ${CIRCLEFILE} DIRECTORY) + + set(RECIPE_OUTPUT_FILE "${CIRCLE_PREFIX}.circle") + set(RECIPE_GEN_OUTPUT_FILE "${CIRCLE_PREFIX}.gen.recipe") + set(RECIPE_GEN_OUTPUT_FILE2 "${CIRCLE_PREFIX}.gen.circle") + + # Generate .gen.recipe from generated .circle + add_custom_command(OUTPUT ${RECIPE_GEN_OUTPUT_FILE} + COMMAND circlechef-reverse ${RECIPE_OUTPUT_FILE} ${RECIPE_GEN_OUTPUT_FILE} + DEPENDS circlechef-reverse ${RECIPE_OUTPUT_FILE} + COMMENT "Generating ${RECIPE_GEN_OUTPUT_FILE}") + + add_custom_command(OUTPUT ${RECIPE_GEN_OUTPUT_FILE2} + COMMAND circlechef-file ${RECIPE_GEN_OUTPUT_FILE} ${RECIPE_GEN_OUTPUT_FILE2} + DEPENDS circlechef-file ${RECIPE_GEN_OUTPUT_FILE} + COMMENT "Generating ${RECIPE_GEN_OUTPUT_FILE2}") + + list(APPEND TESTS ${CIRCLE_PREFIX}.gen) + list(APPEND TESTFILES ${RECIPE_GEN_OUTPUT_FILE2}) +endforeach(CIRCLEFILE) + # Add a dummy target to create a target-level dependency. # TODO Find a way to create a dependency between circlechef_test and generated testfiles. 
add_custom_target(circlechef_testfiles ALL DEPENDS ${TESTFILES}) diff --git a/compiler/circlechef/tests/shape_signature/test.recipe b/compiler/circlechef/tests/shape_signature/test.recipe new file mode 100644 index 0000000..37968ab --- /dev/null +++ b/compiler/circlechef/tests/shape_signature/test.recipe @@ -0,0 +1,45 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 12 } + shape_signature { dim: -1 dim: 8 dim: 6 dim: 12 } +} +operand { + name: "gamma" + type: FLOAT32 + shape { dim: 12 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "beta" + type: FLOAT32 + shape { dim: 12 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 12 } + shape_signature { dim: -1 dim: 8 dim: 6 dim: 12 } +} +operation { + type: "InstanceNorm" + input: "ifm" + input: "gamma" + input: "beta" + output: "ofm" + instance_norm_options { + epsilon: 0.00001 + activation: NONE + } +} +input: "ifm" +output: "ofm" diff --git a/compiler/circlechef/tests/shape_signature/test.reverse b/compiler/circlechef/tests/shape_signature/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst index b2abfd5..34a4d2c 100644 --- a/compiler/common-artifacts/exclude.lst +++ b/compiler/common-artifacts/exclude.lst @@ -16,10 +16,6 @@ tcgenerate(AddN_000) tcgenerate(Add_001) # runtime doesn't support tcgenerate(Add_U8_000) tcgenerate(All_000) -tcgenerate(ArgMax_U8_000) -tcgenerate(ArgMax_U8_001) -tcgenerate(ArgMax_U8_002) -tcgenerate(ArgMax_U8_003) tcgenerate(ArgMin_000) tcgenerate(ArgMin_001) tcgenerate(ArgMin_002) @@ -35,58 +31,35 @@ tcgenerate(BatchToSpaceND_000) tcgenerate(Cast_000) tcgenerate(Cast_001) tcgenerate(Ceil_000) -tcgenerate(Concatenation_U8_000) tcgenerate(Conv2D_003) # runtime doesn't support dilation -tcgenerate(Conv2D_U8_000) -tcgenerate(Conv2D_U8_001) tcgenerate(Cos_000) tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation -tcgenerate(DepthwiseConv2D_U8_000) tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet tcgenerate(Dequantize_000) # runtime and luci-interpreter doesn't support Dequantize op yet -tcgenerate(Div_000) -tcgenerate(Equal_000) -tcgenerate(Exp_000) tcgenerate(ExpandDims_000) tcgenerate(ExpandDims_001) tcgenerate(ExpandDims_002) tcgenerate(ExpandDims_003) tcgenerate(Fill_000) tcgenerate(Fill_001) -tcgenerate(Floor_000) -tcgenerate(FloorDiv_000) -tcgenerate(FloorDiv_001) tcgenerate(FloorMod_000) tcgenerate(FloorMod_001) -tcgenerate(FullyConnected_002) tcgenerate(FullyConnected_U8_000) tcgenerate(Gather_000) tcgenerate(GatherNd_000) tcgenerate(GatherNd_001) -tcgenerate(Greater_000) -tcgenerate(GreaterEqual_000) tcgenerate(If_000) tcgenerate(If_001) tcgenerate(L2Pool2D_U8_000) -tcgenerate(Less_000) -tcgenerate(LessEqual_000) tcgenerate(Log_000) -tcgenerate(LogicalAnd_000) -tcgenerate(LogicalNot_000) -tcgenerate(LogicalOr_000) -tcgenerate(LogSoftmax_000) tcgenerate(MatMul_000) tcgenerate(MatrixBandPart_000) tcgenerate(MatrixDiag_000) tcgenerate(MatrixSetDiag_000) -tcgenerate(Maximum_000) -tcgenerate(MaxPool2D_U8_000) tcgenerate(MaxPoolWithArgMax_000) tcgenerate(MaxPoolWithArgMax_001) tcgenerate(MaxPoolWithArgMax_002) -tcgenerate(Mean_U8_000) -tcgenerate(Minimum_000) tcgenerate(NonMaxSuppressionV4_000) tcgenerate(NonMaxSuppressionV4_001) 
tcgenerate(NonMaxSuppressionV5_000) @@ -99,36 +72,38 @@ tcgenerate(Net_InstanceNorm_001) tcgenerate(Net_InstanceNorm_002) tcgenerate(Net_InstanceNorm_003) tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim -tcgenerate(NotEqual_000) tcgenerate(OneHot_000) tcgenerate(OneHot_001) tcgenerate(OneHot_002) tcgenerate(OneHot_003) tcgenerate(Pack_000) tcgenerate(Pack_U8_000) -tcgenerate(Pad_U8_000) tcgenerate(PadV2_000) -tcgenerate(Pow_000) tcgenerate(Range_000) tcgenerate(Rank_000) tcgenerate(ReduceAny_000) tcgenerate(ReduceAny_001) tcgenerate(ReduceAny_002) tcgenerate(ReduceAny_003) +tcgenerate(ReduceAny_dynamic_000) +tcgenerate(ReduceAny_dynamic_001) +tcgenerate(ReduceAny_dynamic_002) +tcgenerate(ReduceAny_dynamic_003) tcgenerate(ReduceMax_000) +tcgenerate(ReduceMax_dynamic_000) tcgenerate(ReduceMin_000) +tcgenerate(ReduceMin_dynamic_000) tcgenerate(ReduceProd_000) tcgenerate(ReduceProd_001) tcgenerate(ReduceProd_002) tcgenerate(ReduceProd_003) -tcgenerate(ReLU_000) -tcgenerate(ReLU6_000) +tcgenerate(ReduceProd_dynamic_000) +tcgenerate(ReduceProd_dynamic_001) +tcgenerate(ReduceProd_dynamic_002) +tcgenerate(ReduceProd_dynamic_003) tcgenerate(ReLUN1To1_000) +tcgenerate(ReLUN1To1_dynamic_000) tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option -tcgenerate(Reshape_U8_000) -tcgenerate(ResizeBilinear_000) -tcgenerate(ResizeBilinear_U8_000) # luci-interpreter -tcgenerate(ResizeNearestNeighbor_000) tcgenerate(ReverseSequence_000) tcgenerate(ReverseV2_000) tcgenerate(Round_000) @@ -142,7 +117,6 @@ tcgenerate(SelectV2_001) tcgenerate(SelectV2_002) tcgenerate(Shape_000) tcgenerate(Sin_000) -tcgenerate(Softmax_U8_000) tcgenerate(SpaceToBatchND_000) tcgenerate(SpaceToBatchND_001) tcgenerate(SpaceToBatchND_002) @@ -151,11 +125,10 @@ tcgenerate(SparseToDense_000) tcgenerate(SplitV_000) tcgenerate(Square_000) tcgenerate(SquaredDifference_000) -tcgenerate(Sub_000) -tcgenerate(Sub_001) -tcgenerate(Sub_U8_000) tcgenerate(Sum_000) tcgenerate(Sum_001) +tcgenerate(Sum_dynamic_000) +tcgenerate(Sum_dynamic_001) tcgenerate(Tile_000) tcgenerate(Tile_U8_000) tcgenerate(TopKV2_000) @@ -184,3 +157,4 @@ tcgenerate(BCQFullyConnected_001) tcgenerate(BCQGather_000) tcgenerate(CircleBatchMatMul_000) tcgenerate(InstanceNorm_000) +tcgenerate(InstanceNorm_001) diff --git a/compiler/exo/src/Circle/CircleExporterUtils.h b/compiler/exo/src/Circle/CircleExporterUtils.h index fdd162b..78f0cf7 100644 --- a/compiler/exo/src/Circle/CircleExporterUtils.h +++ b/compiler/exo/src/Circle/CircleExporterUtils.h @@ -65,7 +65,7 @@ namespace circle_detail { /** - * @breif Record the information of T/F Lite SubGraph and its mapping to loco + * @brief Record the information of T/F Lite SubGraph and its mapping to loco */ struct SubGraphContext { diff --git a/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.cpp b/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.cpp index f4bb103..26cc561 100644 --- a/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.cpp +++ b/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.cpp @@ -116,7 +116,7 @@ private: }; /** - * @breif Expand shape x and y to same rank by align right and filling with 1 + * @brief Expand shape x and y to same rank by align right and filling with 1 */ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) { @@ -136,7 +136,7 @@ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) } /** - * @breif Returns shape of expanded dimension of input x and y having same rank + * @brief Returns shape of expanded 
dimension of input x and y having same rank */ loco::TensorShape expand_dimension(const loco::TensorShape &x, const loco::TensorShape &y) { diff --git a/compiler/exo/src/TFLite/TFLExporterUtils.h b/compiler/exo/src/TFLite/TFLExporterUtils.h index dbd7a52..f2fe607 100644 --- a/compiler/exo/src/TFLite/TFLExporterUtils.h +++ b/compiler/exo/src/TFLite/TFLExporterUtils.h @@ -65,7 +65,7 @@ namespace tflite_detail { /** - * @breif Record the information of T/F Lite SubGraph and its mapping to loco + * @brief Record the information of T/F Lite SubGraph and its mapping to loco */ struct SubGraphContext { diff --git a/compiler/hermes/include/hermes/core/Message.h b/compiler/hermes/include/hermes/core/Message.h index 28cfd79..460163f 100644 --- a/compiler/hermes/include/hermes/core/Message.h +++ b/compiler/hermes/include/hermes/core/Message.h @@ -37,7 +37,7 @@ public: public: /// @brief The number of lines uint32_t lines(void) const { return _lines.size(); } - /// @breif The content of a specific line + /// @brief The content of a specific line const std::string &line(uint32_t n) const { return _lines.at(n); } private: diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp index 47e2498..c5069e4 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp +++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp @@ -135,7 +135,17 @@ void Conv2D::execute() const } throw std::runtime_error("Unsupported type."); case DataType::U8: - evalQuantized(); + if (filter()->scales().size() == 1) + { + evalQuantized(); + } + else if (filter()->scales().size() > 1) + { + LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4); + LUCI_INTERPRETER_CHECK(filter()->scales().size() == + static_cast(filter()->shape().dim(0))); + evalQuantizedPerChannel(); + } break; case DataType::S16: evalQuantizedS16(); @@ -219,6 +229,92 @@ void Conv2D::evalQuantized() const getTensorData(_im2col.get()), gemmlowp_context.get()); } +void Conv2D::evalQuantizedPerChannel() const +{ + const auto *input_data = getTensorData(input()); + const auto *filter_data = getTensorData(filter()); + const auto *bias_data = getTensorData(bias()); + auto *output_data = getTensorData(output()); + + const Shape &input_shape = input()->shape(); + const Shape &filter_shape = filter()->shape(); + const Shape &output_shape = output()->shape(); + + const int32_t batches = input_shape.dim(0); + const int32_t input_height = input_shape.dim(1); + const int32_t input_width = input_shape.dim(2); + const int32_t input_depth = input_shape.dim(3); + const int32_t output_depth = filter_shape.dim(0); + const int32_t filter_height = filter_shape.dim(1); + const int32_t filter_width = filter_shape.dim(2); + const int32_t output_height = output_shape.dim(1); + const int32_t output_width = output_shape.dim(2); + + const int32_t stride_height = _params.stride_height; + const int32_t stride_width = _params.stride_width; + const int32_t dilation_height_factor = _params.dilation_height_factor; + const int32_t dilation_width_factor = _params.dilation_width_factor; + + int32_t activation_min{}; + int32_t activation_max{}; + calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max); + + const std::vector effective_output_scale = + getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale()); + + const std::vector multipliers_raw = + quantizeMultipliers(effective_output_scale); + BroadcastableWrapper quant_multipliers(multipliers_raw); + + for (int32_t 
batch = 0; batch < batches; ++batch) + { + for (int32_t out_y = 0; out_y < output_height; ++out_y) + { + for (int32_t out_x = 0; out_x < output_width; ++out_x) + { + for (int32_t out_c = 0; out_c < output_depth; ++out_c) + { + const int32_t in_y_origin = out_y * stride_height - _padding_height; + const int32_t in_x_origin = out_x * stride_width - _padding_width; + int32_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width)) + { + for (int32_t in_c = 0; in_c < input_depth; ++in_c) + { + const uint8_t input_val = + input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)]; + const uint8_t filter_val = + filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)]; + acc += static_cast(input_val - input()->zero_point()) * + static_cast(filter_val - filter()->zero_points()[out_c]); + } + } + } + } + if (bias_data) + { + acc += bias_data[out_c]; + } + + int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier( + acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift); + + scaled_acc += output()->zero_point(); + scaled_acc = std::max(scaled_acc, activation_min); + scaled_acc = std::min(scaled_acc, activation_max); + output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc; + } + } + } + } +} + void Conv2D::evalQuantizedS16() const { const auto *input_data = getTensorData(input()); diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-interpreter/src/kernels/Conv2D.h index 83ac67d..86f73c2 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.h +++ b/compiler/luci-interpreter/src/kernels/Conv2D.h @@ -44,6 +44,7 @@ public: private: void evalFloat() const; void evalQuantized() const; + void evalQuantizedPerChannel() const; void evalQuantizedS16() const; private: diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp index 7aa66a8..35a0c54 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp @@ -169,6 +169,78 @@ TEST(Conv2DTest, Uint8) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); } +TEST(Conv2DTest, Uint8_CWQ) +{ + const int output_channels = 3; + std::vector input_data{ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }; + std::vector filter_data{ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }; + std::vector bias_data{1, 2, 3}; + Shape filter_shape{output_channels, 2, 2, 1}; + + std::pair input_quant_param = quantizationParams(0, 4); + std::pair output_quant_param = quantizationParams(-127, 128); + + std::vector> filter_quant_params; + filter_quant_params.push_back(quantizationParams(0, 4)); + filter_quant_params.push_back(quantizationParams(-1, 1)); + filter_quant_params.push_back(quantizationParams(-1, 1)); + + std::vector filter_scales; + std::vector filter_zerops; + for (auto iter : filter_quant_params) + { + filter_scales.push_back(iter.first); + filter_zerops.push_back(iter.second); + } + + std::vector bias_scales; + for (int i = 0; i < output_channels; ++i) + 
bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first); + std::vector zerop(output_channels, 0); + + Tensor input_tensor = makeInputTensor({2, 2, 4, 1}, input_quant_param.first, + input_quant_param.second, input_data); + Tensor filter_tensor = + makeInputTensor(filter_shape, filter_scales, filter_zerops, 0, filter_data); + Tensor bias_tensor = + makeInputTensor({output_channels}, bias_scales, zerop, 0, bias_data); + Tensor output_tensor = + makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second); + + Conv2DParams params{}; + params.padding = Padding::VALID; + params.stride_height = 2; + params.stride_width = 2; + params.dilation_height_factor = 1; + params.dilation_width_factor = 1; + params.activation = Activation::NONE; + + Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + kernel.configure(); + kernel.execute(); + + std::vector ref_output_data{ + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }; + std::vector ref_output_shape{2, 1, 2, 3}; + EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + TEST(Conv2DTest, SInt16) { Shape input_shape{1, 4, 3, 2}; diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp index 1957f3c..9211331 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp @@ -111,7 +111,17 @@ void DepthwiseConv2D::execute() const } throw std::runtime_error("Unsupported type."); case DataType::U8: - evalQuantized(); + if (filter()->scales().size() == 1) + { + evalQuantized(); + } + else if (filter()->scales().size() > 1) + { + LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4); + LUCI_INTERPRETER_CHECK(filter()->scales().size() == + static_cast(filter()->shape().dim(3))); + evalQuantizedPerChannel(); + } break; case DataType::S16: evalQuantizedS16(); @@ -144,6 +154,97 @@ void DepthwiseConv2D::evalFloat() const getTensorShape(output()), getTensorData(output())); } +void DepthwiseConv2D::evalQuantizedPerChannel() const +{ + const auto *input_data = getTensorData(input()); + const auto *filter_data = getTensorData(filter()); + const auto *bias_data = getTensorData(bias()); + auto *output_data = getTensorData(output()); + + const Shape &input_shape = input()->shape(); + const Shape &filter_shape = filter()->shape(); + const Shape &output_shape = output()->shape(); + + const int32_t batches = input_shape.dim(0); + const int32_t input_height = input_shape.dim(1); + const int32_t input_width = input_shape.dim(2); + const int32_t input_depth = input_shape.dim(3); + const int32_t filter_height = filter_shape.dim(1); + const int32_t filter_width = filter_shape.dim(2); + const int32_t output_height = output_shape.dim(1); + const int32_t output_width = output_shape.dim(2); + + const int32_t stride_height = _params.stride_height; + const int32_t stride_width = _params.stride_width; + const int32_t dilation_height_factor = _params.dilation_height_factor; + const int32_t dilation_width_factor = _params.dilation_width_factor; + const int32_t depth_multiplier = _params.depth_multiplier; + + int32_t activation_min{}; + int32_t activation_max{}; + calculateActivationRangeQuantized(_params.activation, output(), &activation_min, 
&activation_max); + + const std::vector effective_output_scales = + getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale()); + + std::vector quant_multipliers_raw = + quantizeMultipliers(effective_output_scales); + BroadcastableWrapper quant_multipliers(quant_multipliers_raw); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - _padding_width; + const int in_y_origin = (out_y * stride_height) - _padding_height; + int32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + if (is_point_inside_image) + { + int32 input_val = + input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)]; + int32 filter_val = + filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += (filter_val - filter()->zero_points()[output_channel]) * + (input_val - input()->zero_point()); + } + } + } + if (bias_data) + { + acc += bias_data[output_channel]; + } + int32_t output_multiplier = quant_multipliers[output_channel].multiplier; + int output_shift = quant_multipliers[output_channel].shift; + int32_t scaled_acc = + tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + scaled_acc += output()->zero_point(); + scaled_acc = std::max(scaled_acc, activation_min); + scaled_acc = std::min(scaled_acc, activation_max); + output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] = + static_cast(scaled_acc); + } + } + } + } + } +} + void DepthwiseConv2D::evalQuantized() const { const auto input_scale = static_cast(input()->scale()); diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h index 400bebe..6d700dd 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h @@ -42,6 +42,7 @@ public: private: void evalFloat() const; void evalQuantized() const; + void evalQuantizedPerChannel() const; void evalQuantizedS16() const; private: diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp index 0c76b58..f79e888 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp @@ -220,6 +220,79 @@ TEST(DepthwiseConv2DTest, SInt16_CWQ_weights) EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data)); } +TEST(DepthwiseConv2DTest, Uint8_CWQ_weights) +{ + const int output_channels = 4; + Shape input_shape{1, 3, 2, 2}; + Shape filter_shape{1, 2, 2, output_channels}; + Shape bias_shape{4}; + std::vector ref_output_shape{1, 2, 1, output_channels}; + + std::vector input_data{ + 1, 2, 7, 8, // + 3, 4, 9, 10, // + 5, 6, 11, 12, // + }; + std::vector filter_data{ + 1, 
2, 3, 4, // + -9, 10, -11, 12, // + 5, 6, 7, 8, // + 13, -14, 15, -16, // + }; + std::vector bias_data{1, 2, 3, 4}; + std::vector ref_output_data{ + 71, -34, 99, -20, // + 91, -26, 127, -4, // + }; + + std::pair input_quant_param = quantizationParams(0, 16); + std::pair output_quant_param = quantizationParams(-127, 128); + + std::vector> filter_quant_params; + filter_quant_params.push_back(quantizationParams(-9, 13)); + filter_quant_params.push_back(quantizationParams(-14, 10)); + filter_quant_params.push_back(quantizationParams(-11, 15)); + filter_quant_params.push_back(quantizationParams(-16, 12)); + + std::vector filter_scales; + std::vector filter_zerops; + for (auto iter : filter_quant_params) + { + filter_scales.push_back(iter.first); + filter_zerops.push_back(iter.second); + } + + std::vector bias_scales; + for (int i = 0; i < output_channels; ++i) + bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first); + std::vector zerop(output_channels, 0); + + Tensor input_tensor = makeInputTensor(input_shape, input_quant_param.first, + input_quant_param.second, input_data); + Tensor filter_tensor = + makeInputTensor(filter_shape, filter_scales, filter_zerops, 3, filter_data); + Tensor bias_tensor = makeInputTensor(bias_shape, bias_scales, zerop, 0, bias_data); + Tensor output_tensor = + makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second); + + DepthwiseConv2DParams params{}; + params.padding = Padding::VALID; + params.depth_multiplier = 2; + params.stride_height = 1; + params.stride_width = 1; + params.dilation_height_factor = 1; + params.dilation_width_factor = 1; + params.activation = Activation::NONE; + + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); + EXPECT_THAT(dequantizeTensorData(output_tensor), + FloatArrayNear(ref_output_data, output_quant_param.first)); +} + TEST(DepthwiseConv2DTest, InvalidBiasType_NEG) { Shape input_shape{1, 4, 2, 2}; diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp index b0ee905..491ae51 100644 --- a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp +++ b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp @@ -93,7 +93,17 @@ void TransposeConv::execute() const evalFloat(); break; case DataType::U8: - evalQuantized(); + if (filter()->scales().size() == 1) + { + evalQuantized(); + } + else if (filter()->scales().size() > 1) + { + LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4); + LUCI_INTERPRETER_CHECK(filter()->scales().size() == + static_cast(filter()->shape().dim(0))); + evalQuantizedPerChannel(); + } break; case DataType::S16: evalQuantizedS16(); @@ -147,6 +157,98 @@ void TransposeConv::evalQuantized() const getTensorData(_scratch_tensor.get())); } +void TransposeConv::evalQuantizedPerChannel() const +{ + const auto *input_data = getTensorData(input()); + const auto *filter_data = getTensorData(filter()); + const auto *bias_data = getTensorData(bias()); + auto *output_data = getTensorData(output()); + auto *scratch_data = getTensorData(_scratch_tensor.get()); + + const Shape &input_shape = input()->shape(); + const Shape &filter_shape = filter()->shape(); + const Shape &output_shape = output()->shape(); + + const int32_t batches = input_shape.dim(0); + const int32_t input_height = input_shape.dim(1); + const int32_t 
input_width = input_shape.dim(2); + const int32_t input_depth = input_shape.dim(3); + const int32_t output_depth = filter_shape.dim(0); + const int32_t filter_height = filter_shape.dim(1); + const int32_t filter_width = filter_shape.dim(2); + const int32_t output_height = output_shape.dim(1); + const int32_t output_width = output_shape.dim(2); + + const int32_t stride_height = _params.stride_height; + const int32_t stride_width = _params.stride_width; + + int32_t activation_min{}; + int32_t activation_max{}; + calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max); + + std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int32_t)); + + BroadcastableWrapper output_multipliers(_quant_multipliers); + for (int32_t batch = 0; batch < batches; ++batch) + { + for (int32_t in_y = 0; in_y < input_height; ++in_y) + { + for (int32_t in_x = 0; in_x < input_width; ++in_x) + { + for (int32_t in_c = 0; in_c < input_depth; ++in_c) + { + const int32_t out_y_origin = in_y * stride_height - _padding_height; + const int32_t out_x_origin = in_x * stride_width - _padding_width; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int32_t out_x = out_x_origin + filter_x; + const int32_t out_y = out_y_origin + filter_y; + if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width)) + { + for (int32_t out_c = 0; out_c < output_depth; ++out_c) + { + const uint8_t input_val = + input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)]; + const uint8_t filter_val = + filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)]; + scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] += + static_cast(input_val - input()->zero_point()) * + static_cast(filter_val - filter()->zero_points()[out_c]); + } + } + } + } + } + } + } + for (int32_t out_y = 0; out_y < output_height; ++out_y) + { + for (int32_t out_x = 0; out_x < output_width; ++out_x) + { + for (int32_t out_c = 0; out_c < output_depth; ++out_c) + { + int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)]; + if (bias_data) + { + acc += bias_data[out_c]; + } + + int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier( + acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift); + + scaled_acc += output()->zero_point(); + scaled_acc = std::max(scaled_acc, activation_min); + scaled_acc = std::min(scaled_acc, activation_max); + + output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc; + } + } + } + } +} + void TransposeConv::evalQuantizedS16() const { const auto *input_data = getTensorData(input()); diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.h b/compiler/luci-interpreter/src/kernels/TransposeConv.h index f51e169..2e0beec 100644 --- a/compiler/luci-interpreter/src/kernels/TransposeConv.h +++ b/compiler/luci-interpreter/src/kernels/TransposeConv.h @@ -47,6 +47,7 @@ public: private: void evalFloat() const; void evalQuantized() const; + void evalQuantizedPerChannel() const; void evalQuantizedS16() const; private: diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp index 8564de0..b1309c1 100644 --- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp @@ -154,6 +154,65 @@ TEST(TransposeConvTest, UInt8) 
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data)); } +TEST(TransposeConvTest, UInt8_CWQ) +{ + const int32_t output_channels = 2; + std::vector input_data{1, 2, 3, 4}; + std::vector filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18}; + std::vector bias_data{3, 4}; + std::vector output_shape_data{1, 5, 5, 2}; + std::vector ref_output_data{ + 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, // + 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, // + 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, // + 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, // + 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, // + }; + + // Choose quantization parameters carefully. + auto input_quant = quantizationParams(-8.0, 7.9375); // s = 1 / 16, zp = 128 + auto output_quant = quantizationParams(-64.0, 191.0); // s = 1, zp = 64 + + std::vector> filter_quant_params; + filter_quant_params.push_back(quantizationParams(0, 17)); + filter_quant_params.push_back(quantizationParams(0, 18)); + + std::vector filter_scales; + std::vector filter_zerops; + for (auto iter : filter_quant_params) + { + filter_scales.push_back(iter.first); + filter_zerops.push_back(iter.second); + } + + std::vector bias_scales; + for (int i = 0; i < output_channels; ++i) + bias_scales.push_back(filter_quant_params[i].first * input_quant.first); + std::vector zerop(output_channels, 0); + + Tensor input_tensor = makeInputTensor({1, 2, 2, 1}, input_quant.first, + input_quant.second, input_data); + Tensor filter_tensor = makeInputTensor({output_channels, 3, 3, 1}, filter_scales, + filter_zerops, 0, filter_data); + Tensor bias_tensor = + makeInputTensor({output_channels}, bias_scales, zerop, 0, bias_data); + Tensor output_shape_tensor = makeInputTensor({4}, output_shape_data); + Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second); + + TransposeConvParams params{}; + params.padding = Padding::VALID; + params.stride_height = 2; + params.stride_width = 2; + + TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor, + &output_tensor, params); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data)); + EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data)); +} + TEST(TransposeConvTest, SInt16) { std::vector input_data{1, 2, 3, 4}; diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp index c52d99e..09e9235 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp +++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp @@ -57,8 +57,12 @@ const void *getNodeData(const luci::CircleConst *node, size_t *data_size) return getNodeDataImpl(node, data_size); case DataType::FLOAT32: return getNodeDataImpl(node, data_size); + case DataType::S16: + return getNodeDataImpl(node, data_size); case DataType::S32: return getNodeDataImpl(node, data_size); + case DataType::S64: + return getNodeDataImpl(node, data_size); default: throw std::runtime_error("Unsupported type."); } diff --git a/compiler/luci/export/src/CircleExporterImpl.cpp b/compiler/luci/export/src/CircleExporterImpl.cpp index 860cebf..df75427 100644 --- a/compiler/luci/export/src/CircleExporterImpl.cpp +++ b/compiler/luci/export/src/CircleExporterImpl.cpp @@ -16,7 +16,6 @@ #include "CircleExporterImpl.h" #include "Optimize.h" -#include "TypeBridge.h" #include "CircleTensorExporter.h" #include "CircleOperationExporter.h" #include 
"CircleExporterUtils.h" @@ -150,9 +149,6 @@ void CircleExporterImpl::exportGraph(loco::Graph *graph) // do graph optimization optimize(graph); - // copy shape/dtype inference data to CircleNode - copy_shape_dtype(graph); - _builder.Clear(); SerializedModelData md; @@ -223,9 +219,6 @@ void CircleExporterImpl::exportModule(Module *module) optimize(graph); - // copy shape/dtype inference data to CircleNode - copy_shape_dtype(graph); - SerializedGraphData gd; // set Subgraph name diff --git a/compiler/luci/export/src/CircleExporterUtils.cpp b/compiler/luci/export/src/CircleExporterUtils.cpp index 1fdb40e..3715513 100644 --- a/compiler/luci/export/src/CircleExporterUtils.cpp +++ b/compiler/luci/export/src/CircleExporterUtils.cpp @@ -87,6 +87,22 @@ circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode) } } +circle::FullyConnectedOptionsWeightsFormat +to_circle_weightsformat(luci::CircleFullyConnected::WeightsFormat format) +{ + switch (format) + { + case luci::CircleFullyConnected::WeightsFormat::DEFAULT: + return circle::FullyConnectedOptionsWeightsFormat_DEFAULT; + case luci::CircleFullyConnected::WeightsFormat::SHUFFLED4x16INT8: + return circle::FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8; + case luci::CircleFullyConnected::WeightsFormat::SHUFFLED16x1FLOAT32: + return circle::FullyConnectedOptionsWeightsFormat_SHUFFLED16x1FLOAT32; + default: + INTERNAL_EXN_V("trying to convert unsupported luci::WeightsFormat", oops::to_uint32(format)); + } +} + circle::DimensionType to_circle_dimensiontype(luci::DimensionType type) { switch (type) diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h index 7857213..95310b3 100644 --- a/compiler/luci/export/src/CircleExporterUtils.h +++ b/compiler/luci/export/src/CircleExporterUtils.h @@ -32,6 +32,8 @@ namespace luci circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func); circle::TensorType to_circle_tensortype(loco::DataType type); circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode); +circle::FullyConnectedOptionsWeightsFormat +to_circle_weightsformat(luci::CircleFullyConnected::WeightsFormat format); circle::DimensionType to_circle_dimensiontype(luci::DimensionType type); flatbuffers::Offset to_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const SparseIndexVector &sparse_idx_vec); diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp index c937109..4343cf3 100644 --- a/compiler/luci/export/src/CircleOperationExporter.cpp +++ b/compiler/luci/export/src/CircleOperationExporter.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -930,7 +929,8 @@ void OperationExporter::visit(luci::CircleFullyConnected *node) { export_simple( node, circle::BuiltinOperator_FULLY_CONNECTED, circle::BuiltinOptions_FullyConnectedOptions, - CreateFullyConnectedOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())) + CreateFullyConnectedOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction()), + to_circle_weightsformat(node->weights_format())) .Union()); } diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp index 1429d28..9bdfa00 100644 --- a/compiler/luci/export/src/CircleTensorExporter.cpp +++ b/compiler/luci/export/src/CircleTensorExporter.cpp @@ -111,10 +111,10 @@ void allocateCircleTensorInfo(CircleNode *node, CircleTensorContext &ctx) CircleTensoInfo 
tensor_info; tensor_info.name(tensor_name); - tensor_info.dtype(to_circle_tensortype(luci::node_dtype(node))); + tensor_info.dtype(to_circle_tensortype(node->dtype())); tensor_info.shape_signature(node->shape_signature()); if (node->shape_status() == ShapeStatus::VALID) - tensor_info.shape(to_shape_description(luci::node_shape(node))); + tensor_info.shape(to_shape_description(node)); tensor_info.shape_status(node->shape_status()); tensor_info.content(dynamic_cast(node)); @@ -243,6 +243,9 @@ flatbuffers::Offset> encodeShape(FlatBufferBuilder &builder, flatbuffers::Offset> encodeShapeSignature(FlatBufferBuilder &builder, const ShapeSignature &shape_signature) { + if (shape_signature.rank() == 0) + return 0; + return builder.CreateVector(shape_signature.as_vector()); } diff --git a/compiler/luci/export/src/Optimize.cpp b/compiler/luci/export/src/Optimize.cpp index 6fa50b5..036a4a2 100644 --- a/compiler/luci/export/src/Optimize.cpp +++ b/compiler/luci/export/src/Optimize.cpp @@ -18,6 +18,7 @@ #include "ProgressReporter.h" #include +#include #include #include @@ -34,6 +35,7 @@ void optimize(loco::Graph *g) // prepare type and shape before optimization phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); // TODO add more optimization passes (with a knob) } diff --git a/compiler/luci/export/src/SerializedData.h b/compiler/luci/export/src/SerializedData.h index 46b1ac2..c41f50e 100644 --- a/compiler/luci/export/src/SerializedData.h +++ b/compiler/luci/export/src/SerializedData.h @@ -64,7 +64,7 @@ namespace luci { /** - * @breif Record the information of T/F Lite SubGraph and its mapping to loco + * @brief Record the information of T/F Lite SubGraph and its mapping to loco */ struct SubGraphContext { diff --git a/compiler/luci/import/include/luci/Import/CircleReader.h b/compiler/luci/import/include/luci/Import/CircleReader.h index 8636b1d..8e210dd 100644 --- a/compiler/luci/import/include/luci/Import/CircleReader.h +++ b/compiler/luci/import/include/luci/Import/CircleReader.h @@ -46,6 +46,8 @@ loco::DataType luci_datatype(circle::TensorType type); FusedActFunc luci_actfunc(const circle::ActivationFunctionType type); Padding luci_padding(const circle::Padding padding); MirrorPadMode luci_mirrorpad_mode(const circle::MirrorPadMode mode); +luci::CircleFullyConnected::WeightsFormat +luci_weights_format(const circle::FullyConnectedOptionsWeightsFormat weights_format); std::unique_ptr luci_quantparam(const circle::QuantizationParametersT *quantization); diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp index 068de52..b33c920 100644 --- a/compiler/luci/import/src/CircleReader.cpp +++ b/compiler/luci/import/src/CircleReader.cpp @@ -151,6 +151,22 @@ MirrorPadMode luci_mirrorpad_mode(const circle::MirrorPadMode mode) return MirrorPadMode::UNDEFINED; } +luci::CircleFullyConnected::WeightsFormat +luci_weights_format(const circle::FullyConnectedOptionsWeightsFormat weights_format) +{ + switch (weights_format) + { + case circle::FullyConnectedOptionsWeightsFormat_DEFAULT: + return luci::CircleFullyConnected::WeightsFormat::DEFAULT; + case circle::FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8: + return luci::CircleFullyConnected::WeightsFormat::SHUFFLED4x16INT8; + case circle::FullyConnectedOptionsWeightsFormat_SHUFFLED16x1FLOAT32: + return luci::CircleFullyConnected::WeightsFormat::SHUFFLED16x1FLOAT32; + default: + throw std::runtime_error("Invalid FullyConnectedOptionsWeightsFormat"); + } 
+} + DimensionType luci_dim_type(const circle::DimensionType dim_type) { switch (dim_type) diff --git a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp index 65a863b..17293ad 100644 --- a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp +++ b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp @@ -53,12 +53,7 @@ CircleNode *CircleFullyConnectedGraphBuilder::build_node(const circle::OperatorT const auto *options = op.builtin_options.AsFullyConnectedOptions(); node->fusedActivationFunction(luci_actfunc(options->fused_activation_function)); - if (options->weights_format != circle::FullyConnectedOptionsWeightsFormat_DEFAULT) - { - throw oops::UserExn( - "Unsupported weights format", - circle::EnumNameFullyConnectedOptionsWeightsFormat(options->weights_format)); - } + node->weights_format(luci_weights_format(options->weights_format)); return node; } diff --git a/compiler/luci/lang/include/luci/IR/AttrDilation.h b/compiler/luci/lang/include/luci/IR/AttrDilation.h index c2b28d7..ed82325 100644 --- a/compiler/luci/lang/include/luci/IR/AttrDilation.h +++ b/compiler/luci/lang/include/luci/IR/AttrDilation.h @@ -27,15 +27,17 @@ class Dilation final public: Dilation() : _w(1), _h(1) {} - int32_t w() const { return _w; } - void w(int32_t w) { _w = w; } + uint32_t w() const { return _w; } + void w(uint32_t w) { _w = w; } + void w(int32_t w); - int32_t h() const { return _h; } - void h(int32_t h) { _h = h; } + uint32_t h() const { return _h; } + void h(uint32_t h) { _h = h; } + void h(int32_t h); private: - int32_t _w; - int32_t _h; + uint32_t _w; + uint32_t _h; }; } // namespace luci diff --git a/compiler/luci/lang/include/luci/IR/AttrFilter.h b/compiler/luci/lang/include/luci/IR/AttrFilter.h index 7909fa5..af9d751 100644 --- a/compiler/luci/lang/include/luci/IR/AttrFilter.h +++ b/compiler/luci/lang/include/luci/IR/AttrFilter.h @@ -27,15 +27,17 @@ class Filter final public: Filter() : _w(1), _h(1) {} - int32_t w() const { return _w; } - void w(int32_t w) { _w = w; } + uint32_t w() const { return _w; } + void w(uint32_t w) { _w = w; } + void w(int32_t w); - int32_t h() const { return _h; } - void h(int32_t h) { _h = h; } + uint32_t h() const { return _h; } + void h(uint32_t h) { _h = h; } + void h(int32_t h); private: - int32_t _w; - int32_t _h; + uint32_t _w; + uint32_t _h; }; } // namespace luci diff --git a/compiler/luci/lang/include/luci/IR/AttrStride.h b/compiler/luci/lang/include/luci/IR/AttrStride.h index 654967d..6be6979 100644 --- a/compiler/luci/lang/include/luci/IR/AttrStride.h +++ b/compiler/luci/lang/include/luci/IR/AttrStride.h @@ -27,15 +27,17 @@ class Stride final public: Stride() : _w(1), _h(1) {} - int32_t w() const { return _w; } - void w(int32_t w) { _w = w; } + uint32_t w() const { return _w; } + void w(uint32_t w) { _w = w; } + void w(int32_t w); - int32_t h() const { return _h; } - void h(int32_t h) { _h = h; } + uint32_t h() const { return _h; } + void h(uint32_t h) { _h = h; } + void h(int32_t h); private: - int32_t _w; - int32_t _h; + uint32_t _w; + uint32_t _h; }; } // namespace luci diff --git a/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h b/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h index 970f1b5..18a2604 100644 --- a/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h +++ b/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h @@ -46,6 +46,8 @@ private: std::vector _shape_signature{}; }; +bool operator==(const ShapeSignature &lhs, const ShapeSignature &rhs); + } 
// namespace luci #endif // __LUCI_IR_SHAPE_SIGNATURE_H__ diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h index d78f394..952befc 100644 --- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h +++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h @@ -35,6 +35,16 @@ class CircleFullyConnected final public LuciNodeMixin { public: + enum class WeightsFormat + { + UNDEFINED, // This is not defined by Circle. This was added to prevent programming error. + + DEFAULT, + SHUFFLED4x16INT8, + SHUFFLED16x1FLOAT32, + }; + +public: loco::Node *input(void) const { return at(0)->node(); } void input(loco::Node *node) { at(0)->node(node); } @@ -43,6 +53,13 @@ public: loco::Node *bias(void) const override { return at(2)->node(); } void bias(loco::Node *node) override { at(2)->node(node); } + +public: + WeightsFormat weights_format(void) const { return _weights_format; } + void weights_format(WeightsFormat weights_format) { _weights_format = weights_format; } + +private: + WeightsFormat _weights_format{WeightsFormat::DEFAULT}; }; } // namespace luci diff --git a/tools/nnapi_quickcheck/inc/memory.h b/compiler/luci/lang/src/AttrDilation.cpp similarity index 64% rename from tools/nnapi_quickcheck/inc/memory.h rename to compiler/luci/lang/src/AttrDilation.cpp index 3f1bca8..a9f4795 100644 --- a/tools/nnapi_quickcheck/inc/memory.h +++ b/compiler/luci/lang/src/AttrDilation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,21 +14,23 @@ * limitations under the License. */ -#ifndef __MEMORY_H__ -#define __MEMORY_H__ +#include "luci/IR/AttrDilation.h" -#include +#include -template inline T *make_alloc(void) +namespace luci { - auto ptr = malloc(sizeof(T)); - if (ptr == nullptr) - { - throw std::bad_alloc{}; - } +void Dilation::w(int32_t w) +{ + assert(w >= 0); + _w = static_cast(w); +} - return reinterpret_cast(ptr); +void Dilation::h(int32_t h) +{ + assert(h >= 0); + _h = static_cast(h); } -#endif // __MEMORY_H__ +} // namespace luci diff --git a/compiler/luci/lang/src/AttrDilation.test.cpp b/compiler/luci/lang/src/AttrDilation.test.cpp new file mode 100644 index 0000000..3e46589 --- /dev/null +++ b/compiler/luci/lang/src/AttrDilation.test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/IR/AttrDilation.h" + +#include + +TEST(CircleAttrDilationTest, set) +{ + auto d = luci::Dilation(); + + d.h(10u); + d.w(10u); + + ASSERT_EQ(d.h(), 10u); + ASSERT_EQ(d.w(), 10u); + + d.h(10); // int32_t + d.w(10); + + ASSERT_EQ(d.h(), 10u); + ASSERT_EQ(d.w(), 10u); +} diff --git a/tools/nnapi_quickcheck/lib/env.test.cpp b/compiler/luci/lang/src/AttrFilter.cpp similarity index 55% rename from tools/nnapi_quickcheck/lib/env.test.cpp rename to compiler/luci/lang/src/AttrFilter.cpp index dd9ac8b..9c571e7 100644 --- a/tools/nnapi_quickcheck/lib/env.test.cpp +++ b/compiler/luci/lang/src/AttrFilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,32 +14,23 @@ * limitations under the License. */ -#include "env.h" +#include "luci/IR/AttrFilter.h" -#include - -#include #include -inline void ensure(int err) { assert(err == 0); } - -int main(int argc, char **argv) +namespace luci { - const std::string key{"TEST"}; - const int num{3}; - - const auto str = std::to_string(num); - - ensure(unsetenv(key.c_str())); - ensure(setenv(key.c_str(), str.c_str(), 0)); - - int value = 0; - - assert(value != num); - IntVar buffer(key, value); - - assert(buffer() == num); +void Filter::w(int32_t w) +{ + assert(w >= 0); + _w = static_cast(w); +} - return 0; +void Filter::h(int32_t h) +{ + assert(h >= 0); + _h = static_cast(h); } + +} // namespace luci diff --git a/compiler/luci/lang/src/AttrFilter.test.cpp b/compiler/luci/lang/src/AttrFilter.test.cpp new file mode 100644 index 0000000..06dbcac --- /dev/null +++ b/compiler/luci/lang/src/AttrFilter.test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/IR/AttrFilter.h" + +#include + +TEST(CircleAttrFilterTest, set) +{ + auto f = luci::Filter(); + + f.h(10u); + f.w(10u); + + ASSERT_EQ(f.h(), 10u); + ASSERT_EQ(f.w(), 10u); + + f.h(10); // int32_t + f.w(10); + + ASSERT_EQ(f.h(), 10u); + ASSERT_EQ(f.w(), 10u); +} diff --git a/compiler/luci/lang/src/AttrStride.cpp b/compiler/luci/lang/src/AttrStride.cpp new file mode 100644 index 0000000..9720d12 --- /dev/null +++ b/compiler/luci/lang/src/AttrStride.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/IR/AttrStride.h" + +#include + +namespace luci +{ + +void Stride::w(int32_t w) +{ + assert(w >= 0); + _w = static_cast(w); +} + +void Stride::h(int32_t h) +{ + assert(h >= 0); + _h = static_cast(h); +} + +} // namespace luci diff --git a/compiler/luci/lang/src/AttrStride.test.cpp b/compiler/luci/lang/src/AttrStride.test.cpp new file mode 100644 index 0000000..e91365b --- /dev/null +++ b/compiler/luci/lang/src/AttrStride.test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/IR/AttrStride.h" + +#include + +TEST(CircleAttrStrideTest, set) +{ + auto s = luci::Stride(); + + s.h(10u); + s.w(10u); + + ASSERT_EQ(s.h(), 10u); + ASSERT_EQ(s.w(), 10u); + + s.h(10); // int32_t + s.w(10); + + ASSERT_EQ(s.h(), 10u); + ASSERT_EQ(s.w(), 10u); +} diff --git a/compiler/luci/lang/src/CircleShapeSignature.cpp b/compiler/luci/lang/src/CircleShapeSignature.cpp new file mode 100644 index 0000000..9700002 --- /dev/null +++ b/compiler/luci/lang/src/CircleShapeSignature.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/IR/CircleShapeSignature.h" + +namespace luci +{ + +bool operator==(const ShapeSignature &lhs, const ShapeSignature &rhs) +{ + if (lhs.rank() != rhs.rank()) + return false; + + for (uint32_t i = 0; i < lhs.rank(); ++i) + if (lhs.dim(i) != rhs.dim(i)) + return false; + + return true; +} + +} // namespace luci diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h index db5bdb5..906760e 100644 --- a/compiler/luci/pass/include/luci/CircleOptimizer.h +++ b/compiler/luci/pass/include/luci/CircleOptimizer.h @@ -19,6 +19,8 @@ #include +#include + #include #include @@ -47,6 +49,10 @@ public: FusePreActivationBatchNorm, MakeBatchNormGammaPositive, FuseActivationFunction, + ShuffleWeightTo16x1Float32, + RemoveRedundantTranspose, + ReplaceMulAddWithDepthwiseConv, + SubstitutePackToReshape, }; enum AlgorithmParameters @@ -77,6 +83,8 @@ public: Options *options(void); public: + void optimize(luci::Module *) const; + void optimize(loco::Graph *) const; void quantize(loco::Graph *) const; diff --git a/compiler/luci/pass/include/luci/ModulePass.h b/compiler/luci/pass/include/luci/ModulePass.h new file mode 100644 index 0000000..1835f6e --- /dev/null +++ b/compiler/luci/pass/include/luci/ModulePass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MODULE_PASS_H__ +#define __MODULE_PASS_H__ + +#include +#include + +#include + +namespace luci +{ + +class Pass : public logo::Pass +{ +public: + // Run module pass and return false if there was nothing changed + virtual bool run(luci::Module *) = 0; +}; + +} // namespace luci + +#endif // __MODULE_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/CircleTypeInferencePass.h b/compiler/luci/pass/include/luci/Pass/CircleTypeInferencePass.h new file mode 100644 index 0000000..379b44c --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/CircleTypeInferencePass.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __LUCI_CIRCLE_TYPE_INFERENCE_PASS_H__ +#define __LUCI_CIRCLE_TYPE_INFERENCE_PASS_H__ + +#include + +#include + +namespace luci +{ + +/** + * @brief Pass to infer type of circle nodes + */ +class CircleTypeInferencePass : public luci::Pass +{ +public: + virtual const char *name(void) const { return "luci::CircleTypeInferencePass"; } + +public: + bool run(luci::Module *m); + bool run(loco::Graph *g); +}; + +} // namespace luci + +#endif //__LUCI_CIRCLE_TYPE_INFERENCE_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/FuseBCQPass.h b/compiler/luci/pass/include/luci/Pass/FuseBCQPass.h index 4404a9f..912ad42 100644 --- a/compiler/luci/pass/include/luci/Pass/FuseBCQPass.h +++ b/compiler/luci/pass/include/luci/Pass/FuseBCQPass.h @@ -17,7 +17,7 @@ #ifndef __LUCI_FUSE_BCQ_PASS_H__ #define __LUCI_FUSE_BCQ_PASS_H__ -#include +#include namespace luci { @@ -26,10 +26,11 @@ namespace luci * @brief Class to fuse certain pattern of subgraph into CircleBCQFullyConnected or CircleBCQGather * */ -struct FuseBCQPass final : public logo::Pass +struct FuseBCQPass final : public luci::Pass { const char *name(void) const final { return "luci::FuseBCQPass"; } + bool run(luci::Module *m) final; bool run(loco::Graph *g) final; }; diff --git a/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h b/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h new file mode 100644 index 0000000..c0ebc4e --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__ +#define __LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__ + +#include + +#include + +namespace luci +{ + +/** + * @brief Pass to copy shape/dtype of loco to circle node + * + * CAUTION : This pass will be removed after refactoring is finished + */ +class MigrateLegacyShapeDtypePass : public luci::Pass +{ +public: + virtual const char *name(void) const { return "luci::MigrateLegacyShapeDtypePass"; } + +public: + bool run(luci::Module *m); + bool run(loco::Graph *graph); +}; + +} // namespace luci + +#endif //__LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/PropagateQuantParamPass.h b/compiler/luci/pass/include/luci/Pass/PropagateQuantParamPass.h new file mode 100644 index 0000000..7e0c44b --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/PropagateQuantParamPass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_PROPAGATE_QUANT_PARAM_PASS_H__ +#define __LUCI_PROPAGATE_QUANT_PARAM_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to propagate quantization parameters of an operator's output to input + */ +struct PropagateQuantParamPass final : public logo::Pass +{ + const char *name(void) const final { return "luci::PropagateQuantParamPass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_PROPAGATE_QUANT_PARAM_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/RemoveRedundantTransposePass.h b/compiler/luci/pass/include/luci/Pass/RemoveRedundantTransposePass.h new file mode 100644 index 0000000..ca20da5 --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/RemoveRedundantTransposePass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_REMOVE_REDUNDANT_TRANSPOSE_H__ +#define __LUCI_REMOVE_REDUNDANT_TRANSPOSE_H__ + +#include + +namespace luci +{ + +/** + * @brief fuse or remove subsequent Transpose operators + */ +struct RemoveRedundantTransposePass final : public logo::Pass +{ + const char *name(void) const final { return "luci::RemoveRedundantTransposePass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_REMOVE_REDUNDANT_TRANSPOSE_H__ diff --git a/compiler/luci/pass/include/luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h b/compiler/luci/pass/include/luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h new file mode 100644 index 0000000..5dbcc8f --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __LUCI_REPLACE_MUL_ADD_WITH_DEPTHWISE_CONV_PASS_H__ +#define __LUCI_REPLACE_MUL_ADD_WITH_DEPTHWISE_CONV_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to replace channel-wise mul/add with CircleDepthwiseConv2D + */ +struct ReplaceMulAddWithDepthwiseConvPass final : public logo::Pass +{ + const char *name(void) const final { return "luci::ReplaceMulAddWithDepthwiseConvPass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_REPLACE_MUL_ADD_WITH_DEPTHWISE_CONV_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h b/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h index 86bb2ab..e21ab4c 100644 --- a/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h +++ b/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h @@ -19,7 +19,7 @@ #include -#include +#include namespace luci { @@ -27,12 +27,13 @@ namespace luci /** * @brief Pass to infer shape of nodes */ -class ShapeInferencePass : public logo::Pass +class ShapeInferencePass : public luci::Pass { public: virtual const char *name(void) const { return "luci::ShapeInferencePass"; } public: + bool run(luci::Module *m); bool run(loco::Graph *graph); }; diff --git a/compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h b/compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h new file mode 100644 index 0000000..2c6ffcf --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__ +#define __LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__ + +#include + +#include + +namespace luci +{ + +/** + * @brief Pass to infer shape_signature of nodes + */ +class ShapeSignatureInferencePass : public luci::Pass +{ +public: + virtual const char *name(void) const { return "luci::ShapeSignatureInferencePass"; } + +public: + bool run(luci::Module *m); + bool run(loco::Graph *graph); +}; + +} // namespace luci + +#endif //__LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/ShuffleWeightTo16x1Float32Pass.h b/compiler/luci/pass/include/luci/Pass/ShuffleWeightTo16x1Float32Pass.h new file mode 100644 index 0000000..3d84f51 --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/ShuffleWeightTo16x1Float32Pass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_SHUFFLE_WEIGHT_TO_16X1_FLOAT32_PASS_H__ +#define __LUCI_SHUFFLE_WEIGHT_TO_16X1_FLOAT32_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to convert weight format of FullyConnected to SHUFFLED16x1FLOAT32 + */ +struct ShuffleWeightTo16x1Float32Pass final : public logo::Pass +{ + const char *name(void) const final { return "luci::ShuffleWeightTo16x1Float32Pass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_SHUFFLE_WEIGHT_TO_16X1_FLOAT32_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/SubstitutePackToReshapePass.h b/compiler/luci/pass/include/luci/Pass/SubstitutePackToReshapePass.h new file mode 100644 index 0000000..36d13f1 --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/SubstitutePackToReshapePass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_SUBSTITUTE_PACK_TO_RESHAPE_PASS_H__ +#define __LUCI_SUBSTITUTE_PACK_TO_RESHAPE_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to Substitute Pack with 1 input to single reshape node. 
+ */ +struct SubstitutePackToReshapePass final : public logo::Pass +{ + const char *name(void) const final { return "luci::SubstitutePackToReshapePass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_SUBSTITUTE_PACK_TO_RESHAPE_PASS_H__ diff --git a/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h b/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h index c607ac6..9d964bd 100644 --- a/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h +++ b/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h @@ -20,7 +20,7 @@ #include -#include +#include namespace luci { @@ -28,12 +28,13 @@ namespace luci /** * @brief Pass to infer type of nodes */ -class TypeInferencePass : public logo::Pass +class TypeInferencePass : public luci::Pass { public: virtual const char *name(void) const { return "luci::TypeInferencePass"; } public: + bool run(luci::Module *m); bool run(loco::Graph *graph); }; diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index 34f6473..cc9fe48 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -24,6 +24,9 @@ #include "luci/Pass/FuseInstanceNormPass.h" #include "luci/Pass/FusePreActivationBatchNormPass.h" #include "luci/Pass/MakeBatchNormGammaPositivePass.h" +#include "luci/Pass/PropagateQuantParamPass.h" +#include "luci/Pass/RemoveRedundantTransposePass.h" +#include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h" #include "luci/Pass/ResolveCustomOpAddPass.h" #include "luci/Pass/ResolveCustomOpBatchMatMulPass.h" #include "luci/Pass/ResolveCustomOpMatMulPass.h" @@ -31,14 +34,21 @@ #include "luci/Pass/QuantizeWithMinMaxPass.h" #include "luci/Pass/QuantizeDequantizeWeightsPass.h" #include "luci/Pass/SparsifyTensorPass.h" +#include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h" +#include "luci/Pass/SubstitutePackToReshapePass.h" // TODO add more passes #include "luci/Pass/ShapeInferencePass.h" +#include "luci/Pass/ShapeSignatureInferencePass.h" #include "luci/Pass/TypeInferencePass.h" +// Following passes will be removed after refactoring is finished +#include "luci/Pass/MigrateLegacyShapeDtypePass.h" + // logo passes #include +#include "ModulePhase.h" #include "ProgressReporter.h" #include "CircleOptimizerUtils.h" @@ -124,11 +134,44 @@ CircleOptimizer::Options *CircleOptimizer::options(void) return _options.get(); } +void CircleOptimizer::optimize(luci::Module *m) const +{ + luci::Phase phase; + + // Following passes will be deprecated after refactoring is finished. + phase.emplace_back(std::make_unique()); + + // Following passes are needed everytime when other passes create new node or modify some nodes. + phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + + if (_options->query(Options::Algorithm::FuseBCQ)) + { + phase.emplace_back(std::make_unique()); + } + + ModuleProgressReporter prog(m, logo::PhaseStrategy::Restart); + PhaseRunner phase_runner{m}; + phase_runner.attach(&prog); + phase_runner.run(phase); +} + void CircleOptimizer::optimize(loco::Graph *g) const { logo::Phase phase; /* TRANSFORM DECLARATION BEGIN */ + phase.emplace_back(std::make_unique()); + + // Following passes will be deprecated after refactoring is finished. + phase.emplace_back(std::make_unique()); + + // Following passes are needed everytime when other passes create new node or modify some nodes. 
+ phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + if (_options->query(Options::Algorithm::ResolveCustomOpAdd)) { phase.emplace_back(std::make_unique()); @@ -145,10 +188,6 @@ void CircleOptimizer::optimize(loco::Graph *g) const { phase.emplace_back(std::make_unique()); } - if (_options->query(Options::Algorithm::FuseBCQ)) - { - phase.emplace_back(std::make_unique()); - } if (_options->query(Options::Algorithm::FuseBatchNormWithTConv)) { phase.emplace_back(std::make_unique()); @@ -173,15 +212,27 @@ void CircleOptimizer::optimize(loco::Graph *g) const { phase.emplace_back(std::make_unique()); } + if (_options->query(Options::Algorithm::ShuffleWeightTo16x1Float32)) + { + phase.emplace_back(std::make_unique()); + } + if (_options->query(Options::Algorithm::RemoveRedundantTranspose)) + { + phase.emplace_back(std::make_unique()); + } + if (_options->query(Options::Algorithm::ReplaceMulAddWithDepthwiseConv)) + { + phase.emplace_back(std::make_unique()); + } + if (_options->query(Options::Algorithm::SubstitutePackToReshape)) + { + phase.emplace_back(std::make_unique()); + } - // Shape inference is needed for added nodes doing above transformations - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); /* TRANSFORM DECLARATION END */ - ProgressReporter prog(g, logo::PhaseStrategy::Saturate); - logo::PhaseRunner phase_runner{g}; + ProgressReporter prog(g, logo::PhaseStrategy::Restart); + logo::PhaseRunner phase_runner{g}; phase_runner.attach(&prog); phase_runner.run(phase); } @@ -258,6 +309,20 @@ void CircleOptimizer::quantize(loco::Graph *g) const luci::QuantizeWithMinMaxPass quantizer(str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity)); quantizer.run(g); + + // Post-quantization optimizations + logo::Phase phase; + + phase.emplace_back(std::make_unique()); + + phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + phase.emplace_back(std::make_unique()); + + ProgressReporter prog(g, logo::PhaseStrategy::Saturate); + logo::PhaseRunner phase_runner{g}; + phase_runner.attach(&prog); + phase_runner.run(phase); } // Requantize diff --git a/compiler/luci/pass/src/CircleTypeInferencePass.cpp b/compiler/luci/pass/src/CircleTypeInferencePass.cpp new file mode 100644 index 0000000..67bd253 --- /dev/null +++ b/compiler/luci/pass/src/CircleTypeInferencePass.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/CircleTypeInferencePass.h" + +#include + +#include + +namespace luci +{ + +bool CircleTypeInferencePass::run(luci::Module *m) +{ + bool changed = false; + + for (size_t g = 0; g < m->size(); ++g) + { + if (run(m->graph(g))) + changed = true; + } + + return changed; +} + +bool CircleTypeInferencePass::run(loco::Graph *g) +{ + luci::tinf::Rule type_infer_rule; + bool changed = false; + + for (auto node : loco::postorder_traversal(loco::output_nodes(g))) + { + loco::DataType dtype; + auto circle_node = loco::must_cast(node); + + if (type_infer_rule.infer(circle_node, dtype) && circle_node->dtype() != dtype) + { + circle_node->dtype(dtype); + changed = true; + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp index ebf2877..c0583d8 100644 --- a/compiler/luci/pass/src/FuseBCQPass.cpp +++ b/compiler/luci/pass/src/FuseBCQPass.cpp @@ -25,6 +25,85 @@ namespace { +bool is_fusable_const(luci::CircleConst *before, luci::CircleConst *after, bool do_w_x) +{ + if (after->dtype() != loco::DataType::FLOAT32) + return false; + + if (after->rank() != 2) + return false; + + if (after->size() != before->size()) + return false; + + auto after_dim0 = after->dim(0).value(); + auto after_dim1 = after->dim(1).value(); + + if (before->rank() == 2) + { + if (do_w_x) + { + // Check for [dim0, dim1] --> [dim0, dim1] + if (!(after->dim(0) == before->dim(0) && after->dim(1) == before->dim(1))) + return false; + + for (uint32_t i = 0; i < after->size(); ++i) + if (after->at(i) != before->at(i)) + return false; + } + else + { + // Check for [dim0, dim1] --> [dim1, dim0] + if (!(after->dim(0) == before->dim(1) && after->dim(1) == before->dim(0))) + return false; + + for (uint32_t i = 0; i < after_dim0; ++i) + for (uint32_t j = 0; j < after_dim1; ++j) + if (after->at(i * after_dim1 + j) != + before->at(j * after_dim0 + i)) + return false; + } + + return true; + } + else if (before->rank() == 3) + { + if (do_w_x) + { + // This case is not found yet. + return false; + } + else + { + // When Einsum op is converted to FullyConnected, original rank can be 3. + auto before_dim0 = before->dim(0).value(); + auto before_dim1 = before->dim(1).value(); + auto before_dim2 = before->dim(2).value(); + + // Check if [dim0, dim1, dim2] --> [dim2, dim0 * dim1] or + // [dim0, dim1, dim2] --> [dim1 * dim2, dim0] + if ((after_dim0 == before_dim1 * before_dim2 && after_dim1 == before_dim0) || + (after_dim0 == before_dim2 && after_dim1 == before_dim0 * before_dim1)) + { + for (uint32_t i = 0; i < after_dim0; ++i) + for (uint32_t j = 0; j < after_dim1; ++j) + if (after->at(i * after_dim1 + j) != + before->at(j * after_dim0 + i)) + return false; + } + } + + return true; + } + + return false; +} + +} // namespace + +namespace +{ + // V means the version of BCQ. 
template class BCQFuser; @@ -38,11 +117,9 @@ public: } public: - bool fuseBCQ(loco::Graph *g) + void register_bcq_info(loco::Graph *g) { - - const auto output_nodes = loco::output_nodes(g); - for (auto node : output_nodes) + for (auto node : loco::output_nodes(g)) { auto output_node = loco::must_cast(node); @@ -61,28 +138,29 @@ public: add_BCQ_info_node(prefix, metadata_type, circle_node); } } + } + bool fuseBCQ(loco::Graph *g) + { if (!is_bcqinfo_valid()) return false; - for (auto f : _fusable_op) + for (auto node : loco::postorder_traversal(loco::output_nodes(g))) { - auto prefix = f.first; - luci::CircleNode *node = f.second; - - if (!is_valid_prefix(prefix)) - continue; - // Fuse Gather to BCQGather if (auto gather = dynamic_cast(node)) { if (auto params = dynamic_cast(gather->params())) { + auto prefix = get_prefix_of_const(params); + if (prefix == -1 || !is_valid_prefix(prefix)) + continue; + auto bcq_gather = g->nodes()->create(); bcq_gather->op_version(1); - bcq_gather->input_scales(_alpha[prefix]); - bcq_gather->input_binary(_packed_binary_code[prefix]); + bcq_gather->input_scales(alpha(g, prefix)); + bcq_gather->input_binary(packed_binary_code(g, prefix)); bcq_gather->indices(gather->indices()); bcq_gather->input_clusters(packed_clusters(g, prefix)); @@ -122,29 +200,20 @@ public: } } - // Einsum is unpacked to FullyConnected, Pack and Reshape - if (auto reshape = dynamic_cast(node)) - { - node = dynamic_cast(reshape->tensor()); - } - if (auto pack = dynamic_cast(node)) - { - if (pack->values_count() == 1 && pack->rank() == 3) - { - node = dynamic_cast(pack->values(0)); - } - } - // Fuse FullyConnected to BCQFullyConnected if (auto fully_connected = dynamic_cast(node)) { if (auto weights = dynamic_cast(fully_connected->weights())) { + auto prefix = get_prefix_of_const(weights); + if (prefix == -1 || !is_valid_prefix(prefix)) + continue; + auto bcq_fc = g->nodes()->create(); bcq_fc->op_version(1); - bcq_fc->weights_scales(_alpha[prefix]); - bcq_fc->weights_binary(_packed_binary_code[prefix]); + bcq_fc->weights_scales(alpha(g, prefix)); + bcq_fc->weights_binary(packed_binary_code(g, prefix)); bcq_fc->bias(fully_connected->bias()); bcq_fc->weights_clusters(packed_clusters(g, prefix)); bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); @@ -179,43 +248,69 @@ public: } // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected - if (_do_w_x[prefix]->at(0)) - { - bcq_fc->weights_hidden_size(weights->dim(0).value()); - bcq_fc->input(bcq_input); - loco::replace(fully_connected).with(bcq_fc); - } - else - { - bcq_fc->weights_hidden_size(weights->dim(1).value()); + bcq_fc->weights_hidden_size(weights->dim(1).value()); - auto perm = g->nodes()->create(); - perm->dtype(loco::DataType::S32); - perm->size(2); - perm->rank(1); - perm->dim(0) = 2; - perm->at(0) = 1; - perm->at(1) = 0; - perm->shape_status(luci::ShapeStatus::VALID); + auto perm = g->nodes()->create(); + perm->dtype(loco::DataType::S32); + perm->size(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at(0) = 1; + perm->at(1) = 0; + perm->shape_status(luci::ShapeStatus::VALID); - auto input_transpose = g->nodes()->create(); - input_transpose->a(bcq_input); - input_transpose->perm(perm); + auto input_transpose = g->nodes()->create(); + input_transpose->a(bcq_input); + input_transpose->perm(perm); - bcq_fc->input(input_transpose); + bcq_fc->input(input_transpose); - auto output_transpose = g->nodes()->create(); - output_transpose->a(bcq_fc); - output_transpose->perm(perm); + auto 
output_transpose = g->nodes()->create(); + output_transpose->a(bcq_fc); + output_transpose->perm(perm); - loco::replace(fully_connected).with(output_transpose); - } + loco::replace(fully_connected).with(output_transpose); return true; } - else + else if (auto weights_as_input = + dynamic_cast(fully_connected->input())) { - // TODO Is there any case that input() is constant, instead of weights()? + auto prefix = get_prefix_of_const(weights_as_input); + if (prefix == -1 || !is_valid_prefix(prefix)) + continue; + + assert(_do_w_x[prefix]->at(0) == true); + + auto perm = g->nodes()->create(); + perm->dtype(loco::DataType::S32); + perm->size(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at(0) = 1; + perm->at(1) = 0; + perm->shape_status(luci::ShapeStatus::VALID); + + auto input_transpose = g->nodes()->create(); + input_transpose->a(fully_connected->weights()); + input_transpose->perm(perm); + + auto bcq_fc = g->nodes()->create(); + + assert(dynamic_cast(fully_connected->bias()) != nullptr); + + bcq_fc->op_version(1); + bcq_fc->weights_scales(alpha(g, prefix)); + bcq_fc->weights_binary(packed_binary_code(g, prefix)); + bcq_fc->bias(fully_connected->bias()); + bcq_fc->weights_clusters(packed_clusters(g, prefix)); + bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); + + bcq_fc->weights_hidden_size(weights_as_input->dim(1).value()); + bcq_fc->input(input_transpose); + loco::replace(fully_connected).with(bcq_fc); + + return true; } } } @@ -268,6 +363,19 @@ private: _dequant_weight[prefix] = const_node; } + int32_t get_prefix_of_const(luci::CircleConst *w_after) + { + for (auto n : _fusable_op) + { + auto prefix = n.first; + auto w_before = loco::must_cast(n.second); + if (is_fusable_const(w_before, w_after, _do_w_x[prefix]->at(0))) + return prefix; + } + + return -1; + } + bool is_bcqinfo_valid() { LOGGER(l); @@ -332,6 +440,16 @@ private: } } + for (auto n : _fusable_op) + { + // fusable_op should be FLOAT32 type + if (n.second->dtype() != loco::DataType::FLOAT32) + { + WARN(l) << "FuseBCQPass : fusable_op has wrong type" << std::endl; + return false; + } + } + // As dequant_weight is not used for fusing, skip validation. return true; @@ -377,12 +495,50 @@ private: return false; } + if (_fusable_op.find(prefix) == _fusable_op.end()) + { + WARN(l) << "fusable_op is not found" << std::endl; + return false; + } + // As dequant_weight is not used for fusing, skip validation. 
return true; } private: + luci::CircleConst *alpha(loco::Graph *graph, int32_t prefix) + { + auto new_alpha = graph->nodes()->create(); + + new_alpha->dtype(loco::DataType::FLOAT32); + new_alpha->size(_alpha[prefix]->size()); + new_alpha->rank(1); + new_alpha->dim(0) = _alpha[prefix]->dim(0); + for (uint32_t i = 0; i < _alpha[prefix]->size(); ++i) + new_alpha->at(i) = _alpha[prefix]->at(i); + new_alpha->shape_status(luci::ShapeStatus::VALID); + + return new_alpha; + } + + luci::CircleConst *packed_binary_code(loco::Graph *graph, int32_t prefix) + { + auto new_beta = graph->nodes()->create(); + + new_beta->dtype(loco::DataType::S32); + new_beta->size(_packed_binary_code[prefix]->size()); + new_beta->rank(2); + new_beta->dim(0) = _packed_binary_code[prefix]->dim(0); + new_beta->dim(1) = _packed_binary_code[prefix]->dim(1); + for (uint32_t i = 0; i < _packed_binary_code[prefix]->size(); ++i) + new_beta->at(i) = + _packed_binary_code[prefix]->at(i); + new_beta->shape_status(luci::ShapeStatus::VALID); + + return new_beta; + } + luci::CircleConst *packed_clusters(loco::Graph *graph, int32_t prefix) { auto qbits_of_clusters = _qbits_of_clusters[prefix]; @@ -428,15 +584,17 @@ private: namespace luci { -bool FuseBCQPass::run(loco::Graph *g) +bool FuseBCQPass::run(luci::Module *m) { bool changed = false; const int32_t start_magicnum = -2e9 + 27; const int32_t end_magicnum = 2e9 - 27; + loco::Graph *main_graph = m->graph(0); + luci::CircleConst *metadata_node = nullptr; - for (auto node : loco::output_nodes(g)) + for (auto node : loco::output_nodes(main_graph)) { auto output_node = loco::must_cast(node); @@ -474,8 +632,11 @@ bool FuseBCQPass::run(loco::Graph *g) const auto bundle_cnt = metadata_node->at(3); BCQFuser<1> fuser{original_output_cnt, bundle_cnt}; - if (fuser.fuseBCQ(g)) - changed = true; + fuser.register_bcq_info(main_graph); + + for (size_t g = 0; g < m->size(); ++g) + if (fuser.fuseBCQ(m->graph(g))) + changed = true; } else { @@ -486,12 +647,12 @@ bool FuseBCQPass::run(loco::Graph *g) // Remove all of BCQ information nodes iff there is no change if (changed == false) { - for (auto node : loco::output_nodes(g)) + for (auto node : loco::output_nodes(main_graph)) { auto output_node = loco::must_cast(node); if (output_node->index() == 0 || (int)output_node->index() > original_output_cnt) { - auto noOp = g->nodes()->create(); + auto noOp = main_graph->nodes()->create(); noOp->dtype(loco::DataType::FLOAT32); // TODO Remove this setting output_node->from(noOp); changed = true; @@ -503,4 +664,10 @@ bool FuseBCQPass::run(loco::Graph *g) return changed; } +bool FuseBCQPass::run(loco::Graph *) +{ + // Do nothing for graph + return false; +} + } // namespace luci diff --git a/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp b/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp new file mode 100644 index 0000000..beb962a --- /dev/null +++ b/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/MigrateLegacyShapeDtypePass.h" + +#include +#include + +#include + +#include + +namespace +{ + +bool has_same_shape(luci::CircleNode *node, loco::TensorShape shape) +{ + if (node->rank() != shape.rank()) + return false; + + for (uint32_t i = 0; i < shape.rank(); ++i) + if (!(node->dim(i) == shape.dim(i))) + return false; + + return true; +} + +} // namespace + +namespace luci +{ + +bool MigrateLegacyShapeDtypePass::run(luci::Module *m) +{ + bool changed = false; + + for (size_t g = 0; g < m->size(); ++g) + { + if (run(m->graph(g))) + changed = true; + } + + return changed; +} + +bool MigrateLegacyShapeDtypePass::run(loco::Graph *g) +{ + bool changed = false; + + for (auto node : loco::all_nodes(g)) + { + auto circle_node = loco::must_cast(node); + if (loco::shape_known(node)) + { + auto loco_shape = loco::shape_get(node).as(); + + assert(circle_node->shape_signature().rank() == 0 || + circle_node->shape_signature().rank() == loco_shape.rank()); + + // When shape of loco is copied to circle node, ShapeSignature should be applied. + loco::TensorShape new_shape; + new_shape.rank(loco_shape.rank()); + for (uint32_t i = 0; i < loco_shape.rank(); ++i) + { + if (circle_node->shape_signature().rank() > 0 && + circle_node->shape_signature().dim(i) == -1) + new_shape.dim(i) = 1; + else + new_shape.dim(i) = loco_shape.dim(i); + } + + if (circle_node->shape_status() == luci::ShapeStatus::UNDEFINED || + !has_same_shape(circle_node, new_shape)) + { + circle_node->rank(new_shape.rank()); + for (uint32_t i = 0; i < new_shape.rank(); ++i) + circle_node->dim(i) = new_shape.dim(i); + + if (circle_node->shape_status() == luci::ShapeStatus::UNDEFINED) + circle_node->shape_status(luci::ShapeStatus::VALID); + + changed = true; + } + } + + if (loco::dtype_known(node)) + { + if (loco::dtype_get(node) != circle_node->dtype()) + { + circle_node->dtype(loco::dtype_get(node)); + changed = true; + } + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ModulePhase.cpp b/compiler/luci/pass/src/ModulePhase.cpp new file mode 100644 index 0000000..46819a0 --- /dev/null +++ b/compiler/luci/pass/src/ModulePhase.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ModulePhase.h" + +namespace luci +{ + +void PhaseRunner::run(const Phase &phase) const +{ + notifyPhaseBegin(); + + for (bool changed = true; changed;) + { + changed = false; + + for (auto &pass : phase) + { + notifyPassBegin(pass.get()); + + bool pass_changed = pass->run(_module); + changed = changed || pass_changed; + + notifyPassEnd(pass.get(), pass_changed); + } + } + + notifyPhaseEnd(); +} + +void PhaseRunner::run(const Phase &phase) const +{ + notifyPhaseBegin(); + + for (bool changed = true; changed;) + { + changed = false; + + for (auto &pass : phase) + { + notifyPassBegin(pass.get()); + + bool pass_changed = pass->run(_module); + changed = changed || pass_changed; + + notifyPassEnd(pass.get(), pass_changed); + + if (changed) + { + break; + } + } + } + + notifyPhaseEnd(); +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ModulePhase.h b/compiler/luci/pass/src/ModulePhase.h new file mode 100644 index 0000000..05966cc --- /dev/null +++ b/compiler/luci/pass/src/ModulePhase.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MODULE_PHASE_H__ +#define __MODULE_PHASE_H__ + +#include + +#include + +#include + +namespace luci +{ + +using Phase = std::vector>; + +template class PhaseRunner; + +template <> +class PhaseRunner final : public logo::PhaseRunnerMixinObservable +{ +public: + PhaseRunner(luci::Module *module) : _module{module} + { + // DO NOTHING + } + +public: + void run(const Phase &) const; + +private: + luci::Module *_module; +}; + +template <> +class PhaseRunner final : public logo::PhaseRunnerMixinObservable +{ +public: + PhaseRunner(luci::Module *module) : _module{module} + { + // DO NOTHING + } + +public: + void run(const Phase &) const; + +private: + luci::Module *_module; +}; + +} // namespace luci + +#endif // __MODULE_PHASE_H__ diff --git a/compiler/luci/pass/src/ProgressReporter.cpp b/compiler/luci/pass/src/ProgressReporter.cpp index dcf47ab..515739d 100644 --- a/compiler/luci/pass/src/ProgressReporter.cpp +++ b/compiler/luci/pass/src/ProgressReporter.cpp @@ -81,4 +81,46 @@ void ProgressReporter::notify(const logo::PhaseEventInfo *) +{ + LOGGER(prime); + + INFO(prime) << "=============================================================="; + INFO(prime) << "ModulePhaseRunner<" << to_str(strategy()) << ">"; + INFO(prime) << "Initial graphs"; + for (size_t g = 0; g < module()->size(); ++g) + { + INFO(prime) << "graphs #" << g; + INFO(prime) << luci::fmt(module()->graph(g)); + } +} + +void ModuleProgressReporter::notify(const logo::PhaseEventInfo *) +{ + LOGGER(prime); + + INFO(prime) << "ModulePhaseRunner<" << to_str(strategy()) << "> - done"; +} + +void ModuleProgressReporter::notify(const logo::PhaseEventInfo *info) +{ + LOGGER(prime); + + INFO(prime) << "--------------------------------------------------------------"; + INFO(prime) << "Before " << logo::pass_name(info->pass()); +} + +void ModuleProgressReporter::notify(const 
logo::PhaseEventInfo *info) +{ + LOGGER(prime); + + INFO(prime) << "After " << logo::pass_name(info->pass()) + << " (changed: " << to_char(info->changed()) << ")"; + for (size_t g = 0; g < module()->size(); ++g) + { + INFO(prime) << "graphs #" << g; + INFO(prime) << luci::fmt(module()->graph(g)); + } +} + } // namespace luci diff --git a/compiler/luci/pass/src/ProgressReporter.h b/compiler/luci/pass/src/ProgressReporter.h index bd2ba98..cf30da7 100644 --- a/compiler/luci/pass/src/ProgressReporter.h +++ b/compiler/luci/pass/src/ProgressReporter.h @@ -21,6 +21,8 @@ #include +#include + namespace luci { @@ -48,6 +50,30 @@ private: logo::PhaseStrategy _strategy; }; +class ModuleProgressReporter : public logo::PhaseEventListener +{ +public: + ModuleProgressReporter(luci::Module *module, logo::PhaseStrategy strategy) + : _module{module}, _strategy{strategy} + { + // DO NOTHING + } + +public: + void notify(const logo::PhaseEventInfo *) override; + void notify(const logo::PhaseEventInfo *) override; + void notify(const logo::PhaseEventInfo *) override; + void notify(const logo::PhaseEventInfo *) override; + +public: + luci::Module *module(void) const { return _module; } + logo::PhaseStrategy strategy(void) const { return _strategy; } + +private: + luci::Module *_module; + logo::PhaseStrategy _strategy; +}; + } // namespace luci #endif // __LUCI_PROGRESSREPORTER_H__ diff --git a/compiler/luci/pass/src/PropagateQuantParamPass.cpp b/compiler/luci/pass/src/PropagateQuantParamPass.cpp new file mode 100644 index 0000000..af83cd8 --- /dev/null +++ b/compiler/luci/pass/src/PropagateQuantParamPass.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/PropagateQuantParamPass.h" + +#include +#include +#include + +#include + +namespace +{ + +bool copy_qparam(luci::CircleQuantParam *src, luci::CircleQuantParam *dst) +{ + assert(src->scale.size() == dst->scale.size()); + assert(src->zerop.size() == dst->zerop.size()); + + // src and dst have the same qparam + if (std::equal(src->scale.begin(), src->scale.end(), dst->scale.begin()) && + std::equal(src->zerop.begin(), src->zerop.end(), dst->zerop.begin()) && + src->quantized_dimension == dst->quantized_dimension) + return false; + + dst->scale.assign(src->scale.begin(), src->scale.end()); + dst->zerop.assign(src->zerop.begin(), src->zerop.end()); + dst->quantized_dimension = src->quantized_dimension; + return true; +} + +bool copy_qparam(luci::CircleNode *src, luci::CircleNode *dst) +{ + // Skip nodes that do not have quantparams + auto src_qparam = src->quantparam(); + if (not src_qparam) + return false; + + auto dst_qparam = dst->quantparam(); + if (not dst_qparam) + return false; + + return copy_qparam(src_qparam, dst_qparam); +} + +// Visitor to propagate quantization parameters +struct PropagateQuantParam final : public luci::CircleNodeMutableVisitor +{ + PropagateQuantParam() = default; + + bool visit(luci::CircleNode *) { return false; } + + bool visit(luci::CircleReshape *node) + { + auto input = node->tensor(); + if (loco::succs(input).size() != 1) + return false; + + auto input_node = loco::must_cast(input); + return copy_qparam(node, input_node); + } + + // TODO : Add more Ops (e.g., Transpose) +}; + +} // namespace + +namespace luci +{ + +bool PropagateQuantParamPass::run(loco::Graph *g) +{ + bool changed = false; + LOGGER(l); + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto circle_node = loco::must_cast(node); + INFO(l) << "PropagateQuantParamPass visit node: " << circle_node->name() << std::endl; + + PropagateQuantParam pqp; + changed = circle_node->accept(&pqp); + if (changed) + break; + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp b/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp new file mode 100644 index 0000000..15adbfc --- /dev/null +++ b/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/PropagateQuantParamPass.h" + +#include + +#include + +namespace +{ + +void addQuantParam(luci::CircleNode *node, const std::vector &scale, + const std::vector &zp) +{ + assert(node->quantparam() == nullptr); + + auto quantparam = std::make_unique(); + quantparam->scale = scale; + quantparam->zerop = zp; + node->quantparam(std::move(quantparam)); +} + +/** + * Simple graph for test + * + * BEFORE + * + * [Conv] (qparam 1) + * | + * [Reshape] (qparam 2) + * + * AFTER + * + * [Conv] (qparam 2) + * | + * [Reshape] (qparam 2) + * + */ +class SimpleGraph +{ +public: + SimpleGraph() + { + input = g.nodes()->create(); + conv = g.nodes()->create(); + reshape = g.nodes()->create(); + output = g.nodes()->create(); + + auto graph_input = g.inputs()->create(); + input->index(graph_input->index()); + auto graph_output = g.outputs()->create(); + output->index(graph_output->index()); + + addQuantParam(conv, {0.1, 0.2, 0.3}, {0, 10, 20}); + addQuantParam(reshape, {0.2, 0.4, 0.6}, {-10, 0, 10}); + + conv->input(input); + reshape->tensor(conv); + output->from(reshape); + } + +public: + loco::Graph g; + luci::CircleInput *input; + luci::CircleConv2D *conv; + luci::CircleReshape *reshape; + luci::CircleOutput *output; +}; + +} // namespace + +TEST(PropagateQuantParam, simple) +{ + SimpleGraph g; + + luci::PropagateQuantParamPass pass; + while (pass.run(&g.g)) + ; + + EXPECT_FLOAT_EQ(0.2, g.conv->quantparam()->scale[0]); + EXPECT_FLOAT_EQ(0.4, g.conv->quantparam()->scale[1]); + EXPECT_FLOAT_EQ(0.6, g.conv->quantparam()->scale[2]); + EXPECT_EQ(-10, g.conv->quantparam()->zerop[0]); + EXPECT_EQ(0, g.conv->quantparam()->zerop[1]); + EXPECT_EQ(10, g.conv->quantparam()->zerop[2]); +} + +TEST(PropagateQuantParam, wrong_op_NEG) +{ + SimpleGraph g; + g.output->from(g.conv); + g.reshape->drop(); + + luci::PropagateQuantParamPass pass; + while (pass.run(&g.g)) + ; + + EXPECT_FLOAT_EQ(0.1, g.conv->quantparam()->scale[0]); + EXPECT_FLOAT_EQ(0.2, g.conv->quantparam()->scale[1]); + EXPECT_FLOAT_EQ(0.3, g.conv->quantparam()->scale[2]); + EXPECT_EQ(0, g.conv->quantparam()->zerop[0]); + EXPECT_EQ(10, g.conv->quantparam()->zerop[1]); + EXPECT_EQ(20, g.conv->quantparam()->zerop[2]); +} diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp index 0ecab00..f6eebe3 100644 --- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp @@ -86,6 +86,100 @@ void quant_const_values(luci::CircleConst *const_node, float scaling_factor, flo } } +// Quantize const per channel +// +// The last dimension of const is the same as the dimension of channel +// And the rest of the const dimensions should be 1 +// So, a 'single value' is quantized per channel +// +// Quantization spec (f: fp value, q: quantized value) +// +// uint8 +// Positive f: f = f * (q - 0) [q = 1, scale = f, zp = 0] +// Negative f: f = (-f) * (q - 1) [q = 0, scale = -f, zp = 1] +// +// int16 +// Positive f: f = f * (q - 0) [q = 1, scale = f, zp = 0] +// Negative f: f = (-f) * (q - 0) [q = -1, scale = -f, zp = 0] +void quant_const_per_channel(CircleConst *node, loco::DataType quant_type) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + assert(node->rank() > 0); + + for (uint32_t i = 0; i < node->rank() - 1; i++) + { + // Caller should call this function when the below condition is satisfied + if (node->dim(i).value() != 1) + throw std::runtime_error("Non-channel dimension of const node must be 1"); + } + + uint32_t size = node->size(); 
+ assert(size == node->dim(node->rank() - 1).value()); + + auto quantparam = std::make_unique(); + quantparam->quantized_dimension = node->rank() - 1; + std::vector quantized_data(size); + + for (uint32_t i = 0; i < size; ++i) + { + auto data = node->at(i); + if (quant_type == loco::DataType::U8) + { + if (data >= 0) + { + quantparam->scale.push_back(data); + quantparam->zerop.push_back(0); + quantized_data[i] = 1; + } + else + { + quantparam->scale.push_back(-data); + quantparam->zerop.push_back(1); + quantized_data[i] = 0; + } + } + else if (quant_type == loco::DataType::S16) + { + if (data >= 0) + { + quantparam->scale.push_back(data); + quantized_data[i] = 1; + } + else + { + quantparam->scale.push_back(-data); + quantized_data[i] = -1; + } + quantparam->zerop.push_back(0); + } + } + node->quantparam(std::move(quantparam)); + + switch (quant_type) + { + case loco::DataType::U8: + node->dtype(loco::DataType::U8); + node->size(size); + for (uint32_t i = 0; i < size; ++i) + { + assert(quantized_data[i] == 0 || quantized_data[i] == 1); + node->at(i) = quantized_data[i]; + } + break; + case loco::DataType::S16: + node->dtype(loco::DataType::S16); + node->size(size); + for (uint32_t i = 0; i < size; ++i) + { + assert(quantized_data[i] == -1 || quantized_data[i] == 1); + node->at(i) = quantized_data[i]; + } + break; + default: + throw std::runtime_error("Unsupported data type"); + } +} + void quant_const(CircleConst *node, loco::DataType quant_type) { assert(node->dtype() == loco::DataType::FLOAT32); @@ -612,10 +706,51 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor } }; +void quant_instnorm(luci::CircleInstanceNorm *node, loco::DataType output_type, + QuantizationGranularity granularity) +{ + auto gamma = loco::must_cast(node->gamma()); + auto beta = loco::must_cast(node->beta()); + assert(gamma->dtype() == loco::DataType::FLOAT32); + assert(beta->dtype() == loco::DataType::FLOAT32); + + if (granularity == QuantizationGranularity::LayerWise) + { + quant_const(gamma, output_type); + quant_const(beta, output_type); + } + else if (granularity == QuantizationGranularity::ChannelWise) + { + quant_const_per_channel(gamma, output_type); + quant_const_per_channel(beta, output_type); + } + else + throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'"); +} + +void quant_prelu(luci::CirclePRelu *node, loco::DataType output_type, + QuantizationGranularity granularity) +{ + auto alpha = loco::must_cast(node->alpha()); + assert(alpha->dtype() == loco::DataType::FLOAT32); + + if (granularity == QuantizationGranularity::LayerWise) + { + quant_const(alpha, output_type); + } + else if (granularity == QuantizationGranularity::ChannelWise) + { + quant_const_per_channel(alpha, output_type); + } + else + throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'"); +} + /** * @brief Quantize const input tensors using min/max of const values */ -void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type) +void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type, + QuantizationGranularity granularity) { auto opcode = node->opcode(); auto arity = node->arity(); @@ -660,20 +795,26 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type) quant_const(const_node, output_type); break; + case luci::CircleOpcode::INSTANCE_NORM: + quant_instnorm(loco::must_cast(node), output_type, granularity); + break; + + case luci::CircleOpcode::PRELU: + quant_prelu(loco::must_cast(node), 
output_type, granularity); + break; + case luci::CircleOpcode::ADD: case luci::CircleOpcode::ADD_N: case luci::CircleOpcode::DIV: case luci::CircleOpcode::EQUAL: case luci::CircleOpcode::GREATER: case luci::CircleOpcode::GREATER_EQUAL: - case luci::CircleOpcode::INSTANCE_NORM: case luci::CircleOpcode::LESS: case luci::CircleOpcode::LESS_EQUAL: case luci::CircleOpcode::MAXIMUM: case luci::CircleOpcode::MINIMUM: case luci::CircleOpcode::MUL: case luci::CircleOpcode::NOT_EQUAL: - case luci::CircleOpcode::PRELU: case luci::CircleOpcode::SUB: // Quantize all const inputs using their values for (uint32_t i = 0; i < arity; i++) @@ -817,7 +958,7 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g) for (auto node : loco::active_nodes(loco::output_nodes(g))) { auto circle_node = loco::must_cast(node); - quantize_const_inputs(circle_node, _output_dtype); + quantize_const_inputs(circle_node, _output_dtype, _granularity); } // Propagate quantization parameters of concat Op diff --git a/compiler/luci/pass/src/RemoveRedundantTranspose.cpp b/compiler/luci/pass/src/RemoveRedundantTranspose.cpp new file mode 100644 index 0000000..33cb765 --- /dev/null +++ b/compiler/luci/pass/src/RemoveRedundantTranspose.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/RemoveRedundantTransposePass.h" + +#include + +namespace +{ + +/// @brief Return true if first_perm[second_perm[i]] == i +bool check_perm(const luci::CircleConst *first_perm, const luci::CircleConst *second_perm) +{ + assert(first_perm->rank() == 1); + assert(second_perm->rank() == 1); + assert(second_perm->size() == first_perm->size()); + for (int32_t i = 0; i < static_cast(first_perm->size()); i++) + { + if (first_perm->at(second_perm->at(i)) != i) + return false; + } + return true; +} + +bool remove_consecutive_transpose_function(luci::CircleNode *node) +{ + auto target_node = dynamic_cast(node); + if (target_node == nullptr) + return false; + auto pred_node = dynamic_cast(target_node->a()); + if (pred_node == nullptr) + return false; + if (loco::succs(pred_node).size() != 1) + return false; + + auto pred_perm = dynamic_cast(target_node->perm()); + if (pred_perm == nullptr) + return false; + + auto main_perm = dynamic_cast(pred_node->perm()); + if (main_perm == nullptr) + return false; + + auto main_node = loco::must_cast(pred_node->a()); + if (check_perm(pred_perm, main_perm)) + { + replace(node).with(main_node); + } + else + { + auto g = main_perm->graph(); + auto new_const_node = g->nodes()->create(); + + new_const_node->dtype(loco::DataType::S32); + new_const_node->rank(1); + new_const_node->dim(0) = main_perm->dim(0); + new_const_node->size(main_perm->dim(0).value()); + new_const_node->shape_status(luci::ShapeStatus::VALID); + for (uint32_t i = 0; i < main_perm->size(); i++) + { + new_const_node->at(i) = + pred_perm->at(main_perm->at(i)); + } + pred_node->perm(new_const_node); + replace(node).with(pred_node); + } + return true; +} + +} // namespace + +namespace luci +{ +/** + * BEFORE + * | + * [CircleNode] [CircleConst] + * (main_node) (main_perm) + * \ / + * [CircleTranspose] [CircleConst] + * (pred_node) (pred_perm) + * \ / + * [CircleTranspose] + * (target_node) + * | + * + * AFTER + * + * + * | | | + * [CircleNode] [CircleConst] | + * (main_node) (new_const_node) | + * \ / or [CircleNode] + * [CircleTranspose] (main_node) + * (pred_node) | + * | | + * + */ +bool RemoveRedundantTransposePass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto circle_node = loco::must_cast(node); + if (remove_consecutive_transpose_function(circle_node)) + { + changed = true; + break; + } + } + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp b/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp new file mode 100644 index 0000000..db608b6 --- /dev/null +++ b/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "luci/Pass/RemoveRedundantTransposePass.h" + +#include + +#include + +#include + +namespace +{ + +void setValue(luci::CircleConst *node, const std::vector &v) +{ + node->dtype(loco::DataType::S32); + node->size(v.size()); + node->rank(1); + node->dim(0).set(v.size()); + for (int i = 0; i < v.size(); ++i) + { + node->at(i) = v[i]; + } +} + +/** + * Type1 + * BEFORE + * | + * [CircleNode] [CircleConst] + * \ / + * [CircleTranspose] [CircleConst] + * \ / + * [CircleTranspose] + * | + * + * AFTER + * | + * [CircleNode] + * | Remove Both + * + * -------------------------------------------- + * + * Type2 + * BEFORE + * | + * [CircleNode] [CircleConst] + * \ / + * [CircleTranspose] [CircleConst] + * \ / + * [CircleTranspose] + * | + * + * AFTER + * | | + * [CircleNode] [CircleConst] + * \ / + * [CircleTranspose] + * | + * + */ +void create_redundunt_transpose(loco::Graph *g, const std::vector &perm1, + const std::vector &perm2) +{ + assert(g); + + auto input = g->nodes()->create(); + auto graph_input = g->inputs()->create(); + input->index(graph_input->index()); + + // Create perm1 + auto perm1_node = g->nodes()->create(); + setValue(perm1_node, perm1); + + auto transpose1 = g->nodes()->create(); + transpose1->dtype(loco::DataType::FLOAT32); + transpose1->a(input); + transpose1->perm(perm1_node); + + // Create perm2 + auto perm2_node = g->nodes()->create(); + setValue(perm2_node, perm2); + + auto transpose2 = g->nodes()->create(); + transpose2->dtype(loco::DataType::FLOAT32); + transpose2->a(transpose1); + transpose2->perm(perm2_node); + + // Output + auto output = g->nodes()->create(); + output->from(transpose2); + auto graph_output = g->outputs()->create(); + output->index(graph_output->index()); +} + +} // namespace + +TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type1) +{ + auto graph = loco::make_graph(); + create_redundunt_transpose(graph.get(), {1, 0, 2, 3}, {1, 0, 2, 3}); + + luci::RemoveRedundantTransposePass pass; + while (pass.run(graph.get())) + ; + luci::CircleTranspose *transpose_node = nullptr; + for (auto node : loco::active_nodes(loco::output_nodes(graph.get()))) + { + auto trans = dynamic_cast(node); + if (not trans) + continue; + transpose_node = trans; + break; + } + // No transpose node is in graph. + ASSERT_EQ(nullptr, transpose_node); +} + +TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type2) +{ + auto graph = loco::make_graph(); + create_redundunt_transpose(graph.get(), {0, 1, 3, 2}, {1, 0, 2, 3}); + + luci::RemoveRedundantTransposePass pass; + while (pass.run(graph.get())) + ; + luci::CircleTranspose *transpose_node = nullptr; + for (auto node : loco::active_nodes(loco::output_nodes(graph.get()))) + { + auto trans = dynamic_cast(node); + if (not trans) + continue; + transpose_node = trans; + break; + } + // Just one transpose node, with updated perm constant. + ASSERT_NE(nullptr, transpose_node); + auto perm = loco::must_cast(transpose_node->perm()); + ASSERT_EQ(1, perm->at(0)); + ASSERT_EQ(0, perm->at(1)); + ASSERT_EQ(3, perm->at(2)); + ASSERT_EQ(2, perm->at(3)); +} diff --git a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp new file mode 100644 index 0000000..7096c25 --- /dev/null +++ b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h" + +#include + +namespace +{ + +luci::CircleConst *create_weights_from_gamma(luci::CircleConst *gamma) +{ + assert(gamma->rank() == 1); + auto channel_size = gamma->dim(0).value(); + + // Channel-wise MUL is the same as DEPTHWISE_CONV2D with filter shape (1,1,1,channel_size) + auto weights = gamma->graph()->nodes()->create(); + weights->dtype(loco::DataType::FLOAT32); + weights->rank(4); + weights->dim(0).set(1); + weights->dim(1).set(1); + weights->dim(2).set(1); + weights->dim(3).set(channel_size); + weights->shape_status(luci::ShapeStatus::VALID); + weights->size(channel_size); + for (uint32_t i = 0; i < channel_size; i++) + { + weights->at(i) = gamma->at(i); + } + + return weights; +} + +luci::CircleConst *create_bias_from_beta(luci::CircleConst *beta) +{ + assert(beta->rank() == 1); + auto channel_size = beta->dim(0).value(); + + // Channel-wise ADD is the same as bias (shape = (channel_size)) of DEPTHWISE_CONV2D + auto bias = beta->graph()->nodes()->create(); + bias->dtype(loco::DataType::FLOAT32); + bias->rank(1); + bias->dim(0).set(channel_size); + bias->size(channel_size); + bias->shape_status(luci::ShapeStatus::VALID); + for (uint32_t i = 0; i < channel_size; i++) + { + bias->at(i) = beta->at(i); + } + + return bias; +} + +bool is_batchnorm_add(const luci::CircleAdd *add, luci::CircleMul *&mul, luci::CircleConst *&beta) +{ + auto x = loco::must_cast(add->x()); + auto y = loco::must_cast(add->y()); + + luci::CircleMul *pred = nullptr; + luci::CircleConst *constant = nullptr; + + if (x->opcode() == luci::CircleOpcode::CIRCLECONST && y->opcode() == luci::CircleOpcode::MUL) + { + pred = loco::must_cast(y); + constant = loco::must_cast(x); + } + else if (x->opcode() == luci::CircleOpcode::MUL && y->opcode() == luci::CircleOpcode::CIRCLECONST) + { + pred = loco::must_cast(x); + constant = loco::must_cast(y); + } + else + { + return false; + } + + if (constant->rank() != 1) + return false; + + auto channel_dim = constant->dim(0); + // Assumption: Layout is channel-last + if (!(channel_dim == add->dim(add->rank() - 1))) + return false; + + mul = pred; + beta = constant; + return true; +} + +// Check if mul is batchnorm mul +bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleNode *&pred_node, + luci::CircleConst *&gamma) +{ + auto x = dynamic_cast(mul->x()); + auto y = dynamic_cast(mul->y()); + + luci::CircleNode *pred = nullptr; + luci::CircleConst *constant = nullptr; + + if (x != nullptr && y == nullptr) + { + pred = loco::must_cast(mul->y()); + constant = x; + } + else if (x == nullptr && y != nullptr) + { + pred = loco::must_cast(mul->x()); + constant = y; + } + else + { + return false; + } + + if (constant->rank() != 1) + return false; + + auto channel_dim = constant->dim(0); + if (!(channel_dim == mul->dim(mul->rank() - 1))) + return false; + + pred_node = pred; + gamma = constant; + return true; +} + +/** + * Replace channel-wise 
Mul/Add with DepthwiseConv2D + * + * BEFORE + * + * [Node] [gamma] + * | / + * [Mul] [beta] + * | / + * [Add] + * + * AFTER + * + * [Node] [weights] [bias] + * \ / / + * [DepthwiseConv2D] + */ +bool replace_mul_add_with_dwconv(luci::CircleAdd *add) +{ + luci::CircleNode *pred_node = nullptr; + luci::CircleMul *mul = nullptr; + luci::CircleConst *beta = nullptr; + luci::CircleConst *gamma = nullptr; + + if (!is_batchnorm_add(add, mul, beta)) + return false; + + if (loco::succs(mul).size() != 1) + return false; + + if (!is_batchnorm_mul(mul, pred_node, gamma)) + return false; + + if (pred_node->rank() != 4) + return false; + + if (pred_node->dtype() != loco::DataType::FLOAT32 || beta->dtype() != loco::DataType::FLOAT32 || + gamma->dtype() != loco::DataType::FLOAT32) + return false; + + auto weights = create_weights_from_gamma(gamma); + auto bias = create_bias_from_beta(beta); + + auto dwconv = add->graph()->nodes()->create(); + dwconv->input(pred_node); + dwconv->filter(weights); + dwconv->bias(bias); + dwconv->padding(luci::Padding::SAME); + dwconv->stride()->w(1); + dwconv->stride()->h(1); + dwconv->depthMultiplier(1); + dwconv->dilation()->w(1); + dwconv->dilation()->h(1); + dwconv->fusedActivationFunction(add->fusedActivationFunction()); + + loco::replace(add).with(dwconv); + return true; +} + +} // namespace + +namespace luci +{ + +bool ReplaceMulAddWithDepthwiseConvPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto add = dynamic_cast(node); + if (not add) + continue; + + if (replace_mul_add_with_dwconv(add)) + { + changed = true; + break; + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp new file mode 100644 index 0000000..a90182a --- /dev/null +++ b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
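// --- Illustrative sketch (not part of the diff, not the luci API) ---
// Why the rewrite above is value-preserving: a channel-wise Mul by gamma
// followed by a channel-wise Add of beta computes y[c] = x[c] * gamma[c] + beta[c],
// which is exactly a 1x1 DepthwiseConv2D whose (1, 1, 1, C) filter holds gamma
// and whose bias holds beta, applied with stride 1 and depth multiplier 1.
// The helpers below are standalone stand-ins operating on one pixel's channels.
#include <cstddef>
#include <vector>

inline std::vector<float> mul_add(const std::vector<float> &x, const std::vector<float> &gamma,
                                  const std::vector<float> &beta)
{
  std::vector<float> y(x.size());
  for (std::size_t c = 0; c < x.size(); ++c)
    y[c] = x[c] * gamma[c] + beta[c];
  return y;
}

inline std::vector<float> dwconv_1x1(const std::vector<float> &x, const std::vector<float> &filter,
                                     const std::vector<float> &bias)
{
  // With a 1x1 kernel each output pixel depends on one input pixel only, so
  // per channel the convolution reduces to x[c] * filter[c] + bias[c].
  return mul_add(x, filter, bias);
}
// mul_add(x, gamma, beta) == dwconv_1x1(x, gamma, beta) for every pixel.
// --- end sketch ---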
+ */ + +#include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h" + +#include + +#include + +namespace +{ + +/** + * Simple graph for test + * + * BEFORE + * + * [Node] [gamma] + * | / + * [Mul] [beta] + * | / + * [Add] + * + * AFTER + * + * [Node] [weights] [bias] + * \ / / + * [DepthwiseConv2D] + */ +class SimpleGraph +{ +public: + SimpleGraph() + { + input = g.nodes()->create(); + mul = g.nodes()->create(); + gamma = g.nodes()->create(); + add = g.nodes()->create(); + beta = g.nodes()->create(); + output = g.nodes()->create(); + + auto graph_input = g.inputs()->create(); + input->index(graph_input->index()); + auto graph_output = g.outputs()->create(); + output->index(graph_output->index()); + + input->dtype(loco::DataType::FLOAT32); + mul->dtype(loco::DataType::FLOAT32); + gamma->dtype(loco::DataType::FLOAT32); + add->dtype(loco::DataType::FLOAT32); + beta->dtype(loco::DataType::FLOAT32); + output->dtype(loco::DataType::FLOAT32); + + uint32_t channel_size = 16; + input->shape({1, 4, 4, channel_size}); + mul->shape({1, 4, 4, channel_size}); + gamma->shape({channel_size}); + add->shape({1, 4, 4, channel_size}); + beta->shape({channel_size}); + output->shape({1, 4, 4, channel_size}); + + gamma->size(channel_size); + beta->size(channel_size); + for (uint32_t i = 0; i < channel_size; i++) + { + gamma->at(i) = i; + beta->at(i) = i; + } + + mul->x(input); + mul->y(gamma); + add->x(mul); + add->y(beta); + output->from(add); + } + +public: + loco::Graph g; + luci::CircleInput *input = nullptr; + luci::CircleMul *mul = nullptr; + luci::CircleConst *gamma = nullptr; + luci::CircleAdd *add = nullptr; + luci::CircleConst *beta = nullptr; + luci::CircleOutput *output = nullptr; +}; + +} // namespace + +TEST(ReplaceMulAddWithDepthwiseConv, simple) +{ + SimpleGraph g; + + luci::ReplaceMulAddWithDepthwiseConvPass pass; + while (pass.run(&g.g)) + ; + + auto dwconv = dynamic_cast(g.output->from()); + EXPECT_NE(nullptr, dwconv); + + uint32_t channel_size = 16; + auto weights = dynamic_cast(dwconv->filter()); + auto bias = dynamic_cast(dwconv->bias()); + EXPECT_NE(nullptr, weights); + EXPECT_EQ(4, weights->rank()); + EXPECT_EQ(channel_size, weights->dim(3).value()); + EXPECT_NE(nullptr, bias); + EXPECT_EQ(1, bias->rank()); + EXPECT_EQ(channel_size, bias->dim(0).value()); + + for (int i = 0; i < channel_size; i++) + { + EXPECT_FLOAT_EQ(i, weights->at(i)); + EXPECT_FLOAT_EQ(i, bias->at(i)); + } +} + +TEST(ReplaceMulAddWithDepthwiseConv, wrong_op_NEG) +{ + SimpleGraph g; + // swap mul/add (changed to add->mul) + g.add->x(g.input); + loco::replace(g.add).with(g.mul); + g.mul->x(g.add); + + luci::ReplaceMulAddWithDepthwiseConvPass pass; + auto changed = pass.run(&g.g); + + EXPECT_EQ(false, changed); +} diff --git a/compiler/luci/pass/src/ShapeInferencePass.cpp b/compiler/luci/pass/src/ShapeInferencePass.cpp index f681b3d..4bd0aae 100644 --- a/compiler/luci/pass/src/ShapeInferencePass.cpp +++ b/compiler/luci/pass/src/ShapeInferencePass.cpp @@ -28,6 +28,19 @@ namespace luci { +bool ShapeInferencePass::run(luci::Module *m) +{ + bool changed = false; + + for (size_t g = 0; g < m->size(); ++g) + { + if (run(m->graph(g))) + changed = true; + } + + return changed; +} + bool ShapeInferencePass::run(loco::Graph *g) { loco::CanonicalShapeInferenceRule canonical_rule; diff --git a/compiler/luci/pass/src/ShapeSignatureInferencePass.cpp b/compiler/luci/pass/src/ShapeSignatureInferencePass.cpp new file mode 100644 index 0000000..115b77a --- /dev/null +++ b/compiler/luci/pass/src/ShapeSignatureInferencePass.cpp @@ -0,0 +1,63 
@@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ShapeSignatureInferencePass.h" + +#include +#include + +#include + +namespace luci +{ + +bool ShapeSignatureInferencePass::run(luci::Module *m) +{ + bool changed = false; + + for (size_t g = 0; g < m->size(); ++g) + { + if (run(m->graph(g))) + changed = true; + } + + return changed; +} + +bool ShapeSignatureInferencePass::run(loco::Graph *g) +{ + luci::ssinf::Rule signature_inference_rule; + bool changed = false; + + for (auto node : loco::postorder_traversal(loco::output_nodes(g))) + { + luci::ShapeSignature shape_signature; + + auto circle_node = loco::must_cast(node); + if (signature_inference_rule.infer(circle_node, shape_signature)) + { + if (!(circle_node->shape_signature() == shape_signature)) + { + circle_node->shape_signature(shape_signature); + changed = true; + } + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp new file mode 100644 index 0000000..6a58f18 --- /dev/null +++ b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h" + +#include + +#include +#include + +namespace +{ + +bool satisfy_precondition(luci::CircleFullyConnected *fc) +{ + // check if it's already been shuffled + if (fc->weights_format() != luci::CircleFullyConnected::WeightsFormat::DEFAULT) + return false; + + // check if its data type is FLOAT32 + if (fc->dtype() != loco::DataType::FLOAT32) + return false; + + auto weights = loco::must_cast(fc->weights()); + // rank must be 2 + if (weights->rank() != 2) + return false; + + // check if it has sparsity parameter + if (weights->sparsityparam()) + return false; + + // check if the number of row of FullyConnected's weight is a multiple of 16 + const uint32_t MULTIPLE = 16; + uint32_t rows = weights->dim(0).value(); + if (rows % MULTIPLE) + return false; + + return true; +} + +// get FullyConnected op vector that has same tensor +void get_FCs_having_same_tensor(std::vector &fc_vec, loco::Graph *g, + luci::CircleFullyConnected *fc) +{ + auto the_tensor = fc->weights(); + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto fc = dynamic_cast(node); + if (not fc) + continue; + + if (fc->weights() == the_tensor) + fc_vec.push_back(fc); + } +} + +luci::CircleConst *shuffle_weight(luci::CircleFullyConnected *fc) +{ + auto the_weights = loco::must_cast(fc->weights()); + + // create CircleConst where shuffled data will be stored + luci::CircleConst *new_weights = fc->graph()->nodes()->create(); + new_weights->dtype(loco::DataType::FLOAT32); + new_weights->size(the_weights->size()); + new_weights->rank(the_weights->rank()); + new_weights->shape_status(the_weights->shape_status()); + for (uint32_t r = 0; r < new_weights->rank(); r++) + { + new_weights->dim(r).set(the_weights->dim(r).value()); + } + + // suffle weight + const uint32_t MULTIPLE = 16; + const uint32_t rows = the_weights->dim(0).value(); + const uint32_t cols = the_weights->dim(1).value(); + const uint32_t r_step = rows / MULTIPLE; + uint32_t index = 0; + for (uint32_t r = 0; r < r_step; r++) + { + for (uint32_t c = 0; c < cols; c++) + { + for (uint32_t i = 0; i < MULTIPLE; i++) + { + new_weights->at(index++) = + the_weights->at((r * MULTIPLE + i) * cols + c); + } + } + } + + return new_weights; +} + +} // namespace + +namespace luci +{ + +bool ShuffleWeightTo16x1Float32Pass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto fc = dynamic_cast(node); + if (not fc) + continue; + + if (not satisfy_precondition(fc)) + continue; + + std::vector fc_vec; + get_FCs_having_same_tensor(fc_vec, g, fc); + auto new_weights = shuffle_weight(fc); + + // replace to new weights + for (const auto fc : fc_vec) + { + fc->weights(new_weights); + fc->weights_format(luci::CircleFullyConnected::WeightsFormat::SHUFFLED16x1FLOAT32); + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp new file mode 100644 index 0000000..9745e57 --- /dev/null +++ b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h" + +#include + +#include + +void create_fc_net(loco::Graph *g) +{ + assert(g); + + const uint32_t ROW = 16; + const uint32_t COL = 2; + const uint32_t elements_num = ROW * COL; + + // input + auto input = g->nodes()->create(); + auto graph_input = g->inputs()->create(); + input->index(graph_input->index()); + + // fc weights + auto weights = g->nodes()->create(); + weights->dtype(loco::DataType::FLOAT32); + weights->size(elements_num); + weights->rank(2); + weights->dim(0).set(ROW); + weights->dim(1).set(COL); + for (uint32_t idx = 0; idx < elements_num; idx++) + { + weights->at(idx) = idx; + } + + // fc + auto fc = g->nodes()->create(); + fc->dtype(loco::DataType::FLOAT32); + fc->input(input); + fc->weights(weights); + + // output + auto output = g->nodes()->create(); + output->from(fc); + auto graph_output = g->outputs()->create(); + output->index(graph_output->index()); +} + +TEST(ShuffleWeightTo16x1Float32PassTest, SimpleTest1) +{ + auto graph = loco::make_graph(); + create_fc_net(graph.get()); + + luci::CircleFullyConnected *fc_node = nullptr; + for (auto node : loco::active_nodes(loco::output_nodes(graph.get()))) + { + auto fc = dynamic_cast(node); + if (not fc) + continue; + + fc_node = fc; + break; + } + ASSERT_NE(fc_node, nullptr); + auto weights = loco::must_cast(fc_node->weights()); + // before + ASSERT_EQ(0, weights->at(0)); + ASSERT_EQ(1, weights->at(1)); + ASSERT_EQ(2, weights->at(2)); + ASSERT_EQ(3, weights->at(3)); + ASSERT_EQ(4, weights->at(4)); + ASSERT_EQ(5, weights->at(5)); + ASSERT_EQ(6, weights->at(6)); + ASSERT_EQ(7, weights->at(7)); + ASSERT_EQ(8, weights->at(8)); + ASSERT_EQ(9, weights->at(9)); + ASSERT_EQ(10, weights->at(10)); + ASSERT_EQ(11, weights->at(11)); + ASSERT_EQ(12, weights->at(12)); + ASSERT_EQ(13, weights->at(13)); + ASSERT_EQ(14, weights->at(14)); + ASSERT_EQ(15, weights->at(15)); + + luci::ShuffleWeightTo16x1Float32Pass pass; + while (pass.run(graph.get())) + ; + + weights = loco::must_cast(fc_node->weights()); + // after + ASSERT_EQ(0, weights->at(0)); + ASSERT_EQ(2, weights->at(1)); + ASSERT_EQ(4, weights->at(2)); + ASSERT_EQ(6, weights->at(3)); + ASSERT_EQ(8, weights->at(4)); + ASSERT_EQ(10, weights->at(5)); + ASSERT_EQ(12, weights->at(6)); + ASSERT_EQ(14, weights->at(7)); + ASSERT_EQ(16, weights->at(8)); + ASSERT_EQ(18, weights->at(9)); + ASSERT_EQ(20, weights->at(10)); + ASSERT_EQ(22, weights->at(11)); + ASSERT_EQ(24, weights->at(12)); + ASSERT_EQ(26, weights->at(13)); + ASSERT_EQ(28, weights->at(14)); + ASSERT_EQ(30, weights->at(15)); +} diff --git a/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp b/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp new file mode 100644 index 0000000..44e974b --- /dev/null +++ b/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/SubstitutePackToReshapePass.h" + +#include + +namespace +{ + +bool substitute_pack_to_reshape(luci::CircleNode *node) +{ + auto target_node = dynamic_cast(node); + if (target_node == nullptr) + return false; + if (target_node->values_count() != 1) + return false; + auto value_node = loco::must_cast(target_node->values(0)); + if (value_node->shape_status() != luci::ShapeStatus::VALID) + return false; + int32_t axis = target_node->axis(); + if (axis < 0) + axis = axis + static_cast(value_node->rank()) + 1; + + auto graph = target_node->graph(); + auto reshape_node = graph->nodes()->create(); + reshape_node->tensor(value_node); + + auto const_node = graph->nodes()->create(); + const_node->dtype(loco::DataType::S32); + const_node->size(value_node->rank() + 1); + const_node->shape_status(luci::ShapeStatus::VALID); + const_node->rank(1); + const_node->dim(0).set(value_node->rank() + 1); + for (int32_t i = 0; i < static_cast(value_node->rank()) + 1; i++) + { + if (i == axis) + { + const_node->at(i) = 1; + } + else if (i < axis) + { + const_node->at(i) = value_node->dim(i).value(); + } + else + { + const_node->at(i) = value_node->dim(i - 1).value(); + } + } + reshape_node->shape(const_node); + replace(target_node).with(reshape_node); + return true; +} + +} // namespace + +namespace luci +{ + +/** + * BEFORE + * | + * [CircleNode] + * | + * [CirclePack] + * | + * [CircleNode] + * | + * + * AFTER + * | + * [CircleNode] [CircleConst] + * \ / + * [CircleReshape] + * | + * [CircleNode] + * | + * + */ +bool SubstitutePackToReshapePass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto circle_node = loco::must_cast(node); + if (substitute_pack_to_reshape(circle_node)) + { + changed = true; + } + } + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp b/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp new file mode 100644 index 0000000..143b888 --- /dev/null +++ b/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
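// --- Illustrative sketch (not part of the diff, not the luci API) ---
// The shape computation used by SubstitutePackToReshapePass above: packing a
// single tensor along `axis` is the same as reshaping it with an extra 1
// inserted at that axis (a negative axis is normalized against rank + 1 first).
// E.g. shape {1, 2, 3, 4} with axis 0 -> {1, 1, 2, 3, 4}; axis -1 -> {1, 2, 3, 4, 1},
// which matches the two tests below.
#include <cstdint>
#include <vector>

inline std::vector<int32_t> pack1_as_reshape(const std::vector<int32_t> &shape, int32_t axis)
{
  const int32_t rank = static_cast<int32_t>(shape.size());
  if (axis < 0)
    axis = axis + rank + 1;

  std::vector<int32_t> new_shape(shape.begin(), shape.end());
  new_shape.insert(new_shape.begin() + axis, 1);
  return new_shape;
}
// --- end sketch ---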
+ */ +#include "luci/Pass/SubstitutePackToReshapePass.h" + +#include + +#include + +namespace +{ + +/** + * BEFORE + * | + * [CircleNode] + * | + * [CirclePack] + * | + * [CircleNode] + * | + * + * AFTER + * | + * [CircleNode] [CircleConst] + * \ / + * [CircleReshape] + * | + * [CircleNode] + * | + * + */ +void create_substitute_pack_to_reshape(loco::Graph *g, const std::initializer_list shape, + int32_t axis) +{ + assert(g); + + // Input Create. + auto input = g->nodes()->create(); + auto graph_input = g->inputs()->create(); + input->index(graph_input->index()); + input->shape_status(luci::ShapeStatus::VALID); + input->rank(shape.size()); + input->shape(shape); + + // Pack Node create. + auto pack = g->nodes()->create(1); + pack->values(0, input); + pack->axis(axis); + + // Output Connect. + auto output = g->nodes()->create(); + output->from(pack); + auto graph_output = g->outputs()->create(); + output->index(graph_output->index()); + + return; +} + +} // namespace + +TEST(SubstitutePackToReshapePass, simple_case) +{ + auto graph = loco::make_graph(); + create_substitute_pack_to_reshape(graph.get(), {1, 2, 3, 4}, 0); + luci::SubstitutePackToReshapePass pass; + while (pass.run(graph.get())) + ; + luci::CircleReshape *reshape_node = nullptr; + luci::CirclePack *pack_node = nullptr; + for (auto node : loco::active_nodes(loco::output_nodes(graph.get()))) + { + if (auto reshape = dynamic_cast(node)) + reshape_node = reshape; + else if (auto pack = dynamic_cast(node)) + pack_node = pack; + } + ASSERT_NE(nullptr, reshape_node); + ASSERT_EQ(nullptr, pack_node); + auto new_shape = loco::must_cast(reshape_node->shape()); + ASSERT_EQ(1, new_shape->at(0)); + ASSERT_EQ(1, new_shape->at(1)); + ASSERT_EQ(2, new_shape->at(2)); + ASSERT_EQ(3, new_shape->at(3)); + ASSERT_EQ(4, new_shape->at(4)); +} + +TEST(SubstitutePackToReshapePass, simple_case_neg_axis) +{ + auto graph = loco::make_graph(); + create_substitute_pack_to_reshape(graph.get(), {1, 2, 3, 4}, -1); + luci::SubstitutePackToReshapePass pass; + while (pass.run(graph.get())) + ; + luci::CircleReshape *reshape_node = nullptr; + luci::CirclePack *pack_node = nullptr; + for (auto node : loco::active_nodes(loco::output_nodes(graph.get()))) + { + if (auto reshape = dynamic_cast(node)) + reshape_node = reshape; + else if (auto pack = dynamic_cast(node)) + pack_node = pack; + } + ASSERT_NE(nullptr, reshape_node); + ASSERT_EQ(nullptr, pack_node); + auto new_shape = loco::must_cast(reshape_node->shape()); + ASSERT_EQ(1, new_shape->at(0)); + ASSERT_EQ(2, new_shape->at(1)); + ASSERT_EQ(3, new_shape->at(2)); + ASSERT_EQ(4, new_shape->at(3)); + ASSERT_EQ(1, new_shape->at(4)); +} diff --git a/compiler/luci/pass/src/TypeInferencePass.cpp b/compiler/luci/pass/src/TypeInferencePass.cpp index 2c7b3a8..6374404 100644 --- a/compiler/luci/pass/src/TypeInferencePass.cpp +++ b/compiler/luci/pass/src/TypeInferencePass.cpp @@ -26,6 +26,19 @@ namespace luci { +bool TypeInferencePass::run(luci::Module *m) +{ + bool changed = false; + + for (size_t g = 0; g < m->size(); ++g) + { + if (run(m->graph(g))) + changed = true; + } + + return changed; +} + bool TypeInferencePass::run(loco::Graph *g) { loco::CanonicalTypeInferenceRule canonical_rule; diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInference.h b/compiler/luci/service/include/luci/Service/CircleShapeInference.h index fb934c2..c301db5 100644 --- a/compiler/luci/service/include/luci/Service/CircleShapeInference.h +++ b/compiler/luci/service/include/luci/Service/CircleShapeInference.h @@ -21,6 +21,10 @@ 
#include +#include +#include +#include + namespace luci { @@ -36,6 +40,155 @@ struct ShapeInference static ShapeDescription get(loco::Node *node); }; +namespace sinf // namespace for Shape Inference +{ + +struct Rule +{ + bool infer(const luci::CircleNode *, loco::TensorShape &) const; +}; + +class Algorithm final : public luci::CircleNodeVisitor +{ +public: + // TODO Remove this when all of visit function is implemented + loco::TensorShape visit(const luci::CircleNode *node) final { return sinf::circle_shape(node); } + + // loco::TensorShape visit(const luci::CircleAbs *node) final; + // loco::TensorShape visit(const luci::CircleAdd *node) final; + // loco::TensorShape visit(const luci::CircleAddN *node) final; + // loco::TensorShape visit(const luci::CircleArgMax *node) final; + // loco::TensorShape visit(const luci::CircleArgMin *node) final; + // loco::TensorShape visit(const luci::CircleAveragePool2D *node) final; + // loco::TensorShape visit(const luci::CircleBatchMatMul *node) final; + // loco::TensorShape visit(const luci::CircleBatchToSpaceND *node) final; + // loco::TensorShape visit(const luci::CircleCast *node) final; + // loco::TensorShape visit(const luci::CircleCeil *node) final; + // loco::TensorShape visit(const luci::CircleConcatenation *node) final; + // loco::TensorShape visit(const luci::CircleConst *node) final; + // loco::TensorShape visit(const luci::CircleConv2D *node) final; + // loco::TensorShape visit(const luci::CircleCos *node) final; + // loco::TensorShape visit(const luci::CircleCustom *node) final; + // loco::TensorShape visit(const luci::CircleDepthToSpace *node) final; + // loco::TensorShape visit(const luci::CircleDepthwiseConv2D *node) final; + // loco::TensorShape visit(const luci::CircleDequantize *node) final; + // loco::TensorShape visit(const luci::CircleDiv *node) final; + // loco::TensorShape visit(const luci::CircleElu *node) final; + // loco::TensorShape visit(const luci::CircleEqual *node) final; + // loco::TensorShape visit(const luci::CircleExp *node) final; + // loco::TensorShape visit(const luci::CircleExpandDims *node) final; + // loco::TensorShape visit(const luci::CircleFill *node) final; + // loco::TensorShape visit(const luci::CircleFloor *node) final; + // loco::TensorShape visit(const luci::CircleFloorDiv *node) final; + // loco::TensorShape visit(const luci::CircleFloorMod *node) final; + // loco::TensorShape visit(const luci::CircleFullyConnected *node) final; + // loco::TensorShape visit(const luci::CircleGather *node) final; + // loco::TensorShape visit(const luci::CircleGatherNd *node) final; + // loco::TensorShape visit(const luci::CircleGreater *node) final; + // loco::TensorShape visit(const luci::CircleGreaterEqual *node) final; + // loco::TensorShape visit(const luci::CircleIf *node) final; + // loco::TensorShape visit(const luci::CircleL2Normalize *node) final; + // loco::TensorShape visit(const luci::CircleL2Pool2D *node) final; + // loco::TensorShape visit(const luci::CircleLeakyRelu *node) final; + // loco::TensorShape visit(const luci::CircleLess *node) final; + // loco::TensorShape visit(const luci::CircleLessEqual *node) final; + // loco::TensorShape visit(const luci::CircleLocalResponseNormalization *node) final; + // loco::TensorShape visit(const luci::CircleLog *node) final; + // loco::TensorShape visit(const luci::CircleLogicalAnd *node) final; + // loco::TensorShape visit(const luci::CircleLogicalNot *node) final; + // loco::TensorShape visit(const luci::CircleLogicalOr *node) final; + // loco::TensorShape 
visit(const luci::CircleLogistic *node) final; + // loco::TensorShape visit(const luci::CircleLogSoftmax *node) final; + // loco::TensorShape visit(const luci::CircleMatrixDiag *node) final; + // loco::TensorShape visit(const luci::CircleMatrixSetDiag *node) final; + // loco::TensorShape visit(const luci::CircleMaximum *node) final; + // loco::TensorShape visit(const luci::CircleMaxPool2D *node) final; + // loco::TensorShape visit(const luci::CircleMean *node) final; + // loco::TensorShape visit(const luci::CircleMinimum *node) final; + // loco::TensorShape visit(const luci::CircleMirrorPad *node) final; + // loco::TensorShape visit(const luci::CircleNeg *node) final; + // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV4 *node) final; + // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV5 *node) final; + // loco::TensorShape visit(const luci::CircleNotEqual *node) final; + // loco::TensorShape visit(const luci::CirclePack *node) final; + // loco::TensorShape visit(const luci::CirclePad *node) final; + // loco::TensorShape visit(const luci::CirclePadV2 *node) final; + // loco::TensorShape visit(const luci::CirclePow *node) final; + // loco::TensorShape visit(const luci::CirclePRelu *node) final; + // loco::TensorShape visit(const luci::CircleRange *node) final; + // loco::TensorShape visit(const luci::CircleRank *node) final; + // loco::TensorShape visit(const luci::CircleMul *node) final; + // loco::TensorShape visit(const luci::CircleOneHot *node) final; + // loco::TensorShape visit(const luci::CircleReduceAny *node) final; + // loco::TensorShape visit(const luci::CircleReduceMax *node) final; + // loco::TensorShape visit(const luci::CircleReduceMin *node) final; + // loco::TensorShape visit(const luci::CircleReduceProd *node) final; + // loco::TensorShape visit(const luci::CircleRelu *node) final; + // loco::TensorShape visit(const luci::CircleRelu6 *node) final; + // loco::TensorShape visit(const luci::CircleReluN1To1 *node) final; + // loco::TensorShape visit(const luci::CircleReshape *node) final; + // loco::TensorShape visit(const luci::CircleResizeBilinear *node) final; + // loco::TensorShape visit(const luci::CircleResizeNearestNeighbor *node) final; + // loco::TensorShape visit(const luci::CircleReverseSequence *node) final; + // loco::TensorShape visit(const luci::CircleReverseV2 *node) final; + // loco::TensorShape visit(const luci::CircleRound *node) final; + // loco::TensorShape visit(const luci::CircleRsqrt *node) final; + // loco::TensorShape visit(const luci::CircleScatterNd *node) final; + // loco::TensorShape visit(const luci::CircleSegmentSum *node) final; + // loco::TensorShape visit(const luci::CircleSelect *node) final; + // loco::TensorShape visit(const luci::CircleSelectV2 *node) final; + // loco::TensorShape visit(const luci::CircleShape *node) final; + // loco::TensorShape visit(const luci::CircleSin *node) final; + // loco::TensorShape visit(const luci::CircleSlice *node) final; + // loco::TensorShape visit(const luci::CircleSoftmax *node) final; + // loco::TensorShape visit(const luci::CircleSpaceToBatchND *node) final; + // loco::TensorShape visit(const luci::CircleSpaceToDepth *node) final; + // loco::TensorShape visit(const luci::CircleSparseToDense *node) final; + // loco::TensorShape visit(const luci::CircleSplit *node) final; + // loco::TensorShape visit(const luci::CircleSplitV *node) final; + // loco::TensorShape visit(const luci::CircleSqrt *node) final; + // loco::TensorShape visit(const luci::CircleSquare *node) final; + // 
loco::TensorShape visit(const luci::CircleSquaredDifference *node) final; + // loco::TensorShape visit(const luci::CircleSqueeze *node) final; + // loco::TensorShape visit(const luci::CircleStridedSlice *node) final; + // loco::TensorShape visit(const luci::CircleSub *node) final; + // loco::TensorShape visit(const luci::CircleSum *node) final; + // loco::TensorShape visit(const luci::CircleTanh *node) final; + // loco::TensorShape visit(const luci::CircleTile *node) final; + // loco::TensorShape visit(const luci::CircleTopKV2 *node) final; + // loco::TensorShape visit(const luci::CircleTranspose *node) final; + // loco::TensorShape visit(const luci::CircleTransposeConv *node) final; + // loco::TensorShape visit(const luci::CircleUnidirectionalSequenceLSTM *node) final; + // loco::TensorShape visit(const luci::CircleUnique *node) final; + // loco::TensorShape visit(const luci::CircleUnpack *node) final; + // loco::TensorShape visit(const luci::CircleWhere *node) final; + // loco::TensorShape visit(const luci::CircleWhile *node) final; + // loco::TensorShape visit(const luci::CircleZerosLike *node) final; + + // Circle Only + // loco::TensorShape visit(const luci::CircleBCQFullyConnected *node) final; + // loco::TensorShape visit(const luci::CircleBCQGather *node) final; + // loco::TensorShape visit(const luci::CircleInstanceNorm *node) final; + + // Virtual + // loco::TensorShape visit(const luci::CircleInput *node) final; + // loco::TensorShape visit(const luci::CircleOutput *node) final; + // loco::TensorShape visit(const luci::CircleOutputDummy *node) final; + // loco::TensorShape visit(const luci::CircleOutputExclude *node) final; + // loco::TensorShape visit(const luci::CircleCustomOut *node) final; + // loco::TensorShape visit(const luci::CircleIfOut *node) final; + // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV4Out *node) final; + // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV5Out *node) final; + // loco::TensorShape visit(const luci::CircleSplitOut *node) final; + // loco::TensorShape visit(const luci::CircleSplitVOut *node) final; + // loco::TensorShape visit(const luci::CircleTopKV2Out *node) final; + // loco::TensorShape visit(const luci::CircleUniqueOut *node) final; + // loco::TensorShape visit(const luci::CircleUnpackOut *node) final; + // loco::TensorShape visit(const luci::CircleWhileOut *node) final; +}; + +} // namespace sinf + } // namespace luci #endif // __LUCI_CIRCLE_SHAPE_INFERENCE_H__ diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h b/compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h new file mode 100644 index 0000000..dd6a5a4 --- /dev/null +++ b/compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __LUCI_CIRCLE_SHAPE_INFERENCE_HELPER_H__ +#define __LUCI_CIRCLE_SHAPE_INFERENCE_HELPER_H__ + +#include + +#include +#include + +namespace luci +{ +namespace sinf // Namespace for Shape Inference +{ + +// Return shape of circle node as loco::TensorShape +loco::TensorShape circle_shape(const luci::CircleNode *node); + +} // namespace sinf +} // namespace luci + +#endif // __LUCI_CIRCLE_SHAPE_INFERENCE_HELPER_H__ diff --git a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceRule.h b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h similarity index 87% rename from compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceRule.h rename to compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h index 4d1d830..f7ea89b 100644 --- a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceRule.h +++ b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h @@ -14,22 +14,26 @@ * limitations under the License. */ -#ifndef __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_RULE_H__ -#define __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_RULE_H__ +#ifndef __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__ +#define __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__ #include #include #include +#include namespace luci { -struct CircleShapeSignatureInferenceRule +namespace ssinf // namespace for Shape Signature Inference +{ + +struct Rule { bool infer(const luci::CircleNode *, ShapeSignature &) const; }; -class ShapeSignatureInferenceAlgorithm final : public luci::CircleNodeVisitor +class Algorithm final : public luci::CircleNodeVisitor { public: // TODO Remove this when visit function is implemented for all the operations. @@ -84,7 +88,7 @@ public: // ShapeSignature visit(const luci::CircleMatrixSetDiag *node) final; // ShapeSignature visit(const luci::CircleMaximum *node) final; // ShapeSignature visit(const luci::CircleMaxPool2D *node) final; - // ShapeSignature visit(const luci::CircleMean *node) final; + ShapeSignature visit(const luci::CircleMean *node) final; // ShapeSignature visit(const luci::CircleMinimum *node) final; // ShapeSignature visit(const luci::CircleMirrorPad *node) final; // ShapeSignature visit(const luci::CircleNeg *node) final; @@ -100,13 +104,13 @@ public: // ShapeSignature visit(const luci::CircleRank *node) final; // ShapeSignature visit(const luci::CircleMul *node) final; // ShapeSignature visit(const luci::CircleOneHot *node) final; - // ShapeSignature visit(const luci::CircleReduceAny *node) final; - // ShapeSignature visit(const luci::CircleReduceMax *node) final; - // ShapeSignature visit(const luci::CircleReduceMin *node) final; - // ShapeSignature visit(const luci::CircleReduceProd *node) final; - // ShapeSignature visit(const luci::CircleRelu *node) final; - // ShapeSignature visit(const luci::CircleRelu6 *node) final; - // ShapeSignature visit(const luci::CircleReluN1To1 *node) final; + ShapeSignature visit(const luci::CircleReduceAny *node) final; + ShapeSignature visit(const luci::CircleReduceMax *node) final; + ShapeSignature visit(const luci::CircleReduceMin *node) final; + ShapeSignature visit(const luci::CircleReduceProd *node) final; + ShapeSignature visit(const luci::CircleRelu *node) final; + ShapeSignature visit(const luci::CircleRelu6 *node) final; + ShapeSignature visit(const luci::CircleReluN1To1 *node) final; // ShapeSignature visit(const luci::CircleReshape *node) final; // ShapeSignature visit(const luci::CircleResizeBilinear *node) final; // 
ShapeSignature visit(const luci::CircleResizeNearestNeighbor *node) final; @@ -133,7 +137,7 @@ public: // ShapeSignature visit(const luci::CircleSqueeze *node) final; // ShapeSignature visit(const luci::CircleStridedSlice *node) final; // ShapeSignature visit(const luci::CircleSub *node) final; - // ShapeSignature visit(const luci::CircleSum *node) final; + ShapeSignature visit(const luci::CircleSum *node) final; // ShapeSignature visit(const luci::CircleTanh *node) final; // ShapeSignature visit(const luci::CircleTile *node) final; // ShapeSignature visit(const luci::CircleTopKV2 *node) final; @@ -152,10 +156,10 @@ public: // ShapeSignature visit(const luci::CircleInstanceNorm *node) final; // Virtual - // ShapeSignature visit(const luci::CircleInput *node) final; - // ShapeSignature visit(const luci::CircleOutput *node) final; - // ShapeSignature visit(const luci::CircleOutputDummy *node) final; - // ShapeSignature visit(const luci::CircleOutputExclude *node) final; + ShapeSignature visit(const luci::CircleInput *node) final; + ShapeSignature visit(const luci::CircleOutput *node) final; + ShapeSignature visit(const luci::CircleOutputDummy *node) final; + ShapeSignature visit(const luci::CircleOutputExclude *node) final; // ShapeSignature visit(const luci::CircleCustomOut *node) final; // ShapeSignature visit(const luci::CircleIfOut *node) final; // ShapeSignature visit(const luci::CircleNonMaxSuppressionV4Out *node) final; @@ -168,6 +172,8 @@ public: // ShapeSignature visit(const luci::CircleWhileOut *node) final; }; +} // namespace ssinf + } // namespace luci -#endif // __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_RULE_H__ +#endif // __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__ diff --git a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h new file mode 100644 index 0000000..fb5b3b3 --- /dev/null +++ b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__ +#define __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__ + +#include +#include + +namespace luci +{ + +namespace ssinf // Namespace for Shape Signature Inference +{ + +// Return empty signature if all of dimensions are known. +// If at least one of dimensions is unknown, return signature without change. +ShapeSignature legalized_signature(const luci::ShapeSignature &signature); + +// Return reduced input_signature with indices and keep_dims. +// - indices : reduction index +// - keep_dims : If true, rank is not changed. If false, rank is reduced along indices. +ShapeSignature reduced_signature(const loco::Node *node, const loco::Node *indices, bool keep_dims); + +// Return signature of index-th argument of node. 
+ShapeSignature input_arg_signature(const luci::CircleNode *node, uint32_t index); + +} // namespace ssinf + +} // namespace luci + +#endif // __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__ diff --git a/compiler/luci/service/include/luci/Service/CircleTypeInference.h b/compiler/luci/service/include/luci/Service/CircleTypeInference.h index ea7a3c5..3422148 100644 --- a/compiler/luci/service/include/luci/Service/CircleTypeInference.h +++ b/compiler/luci/service/include/luci/Service/CircleTypeInference.h @@ -21,6 +21,10 @@ #include +#include +#include +#include + namespace luci { @@ -37,6 +41,155 @@ struct TypeInference static circle::TensorType get(loco::Node *node); }; +namespace tinf // namespace for Type Inference +{ + +struct Rule +{ + bool infer(const luci::CircleNode *, loco::DataType &) const; +}; + +class Algorithm final : public luci::CircleNodeVisitor +{ +public: + // TODO Remove this when all of visit function is implemented + loco::DataType visit(const luci::CircleNode *node) final { return node->dtype(); } + + // loco::DataType visit(const luci::CircleAbs *node) final; + // loco::DataType visit(const luci::CircleAdd *node) final; + // loco::DataType visit(const luci::CircleAddN *node) final; + // loco::DataType visit(const luci::CircleArgMax *node) final; + // loco::DataType visit(const luci::CircleArgMin *node) final; + // loco::DataType visit(const luci::CircleAveragePool2D *node) final; + // loco::DataType visit(const luci::CircleBatchMatMul *node) final; + // loco::DataType visit(const luci::CircleBatchToSpaceND *node) final; + // loco::DataType visit(const luci::CircleCast *node) final; + // loco::DataType visit(const luci::CircleCeil *node) final; + // loco::DataType visit(const luci::CircleConcatenation *node) final; + // loco::DataType visit(const luci::CircleConst *node) final; + // loco::DataType visit(const luci::CircleConv2D *node) final; + // loco::DataType visit(const luci::CircleCos *node) final; + // loco::DataType visit(const luci::CircleCustom *node) final; + // loco::DataType visit(const luci::CircleDepthToSpace *node) final; + // loco::DataType visit(const luci::CircleDepthwiseConv2D *node) final; + // loco::DataType visit(const luci::CircleDequantize *node) final; + // loco::DataType visit(const luci::CircleDiv *node) final; + // loco::DataType visit(const luci::CircleElu *node) final; + // loco::DataType visit(const luci::CircleEqual *node) final; + // loco::DataType visit(const luci::CircleExp *node) final; + // loco::DataType visit(const luci::CircleExpandDims *node) final; + // loco::DataType visit(const luci::CircleFill *node) final; + // loco::DataType visit(const luci::CircleFloor *node) final; + // loco::DataType visit(const luci::CircleFloorDiv *node) final; + // loco::DataType visit(const luci::CircleFloorMod *node) final; + // loco::DataType visit(const luci::CircleFullyConnected *node) final; + // loco::DataType visit(const luci::CircleGather *node) final; + // loco::DataType visit(const luci::CircleGatherNd *node) final; + // loco::DataType visit(const luci::CircleGreater *node) final; + // loco::DataType visit(const luci::CircleGreaterEqual *node) final; + // loco::DataType visit(const luci::CircleIf *node) final; + // loco::DataType visit(const luci::CircleL2Normalize *node) final; + // loco::DataType visit(const luci::CircleL2Pool2D *node) final; + // loco::DataType visit(const luci::CircleLeakyRelu *node) final; + // loco::DataType visit(const luci::CircleLess *node) final; + // loco::DataType visit(const luci::CircleLessEqual *node) 
final; + // loco::DataType visit(const luci::CircleLocalResponseNormalization *node) final; + // loco::DataType visit(const luci::CircleLog *node) final; + // loco::DataType visit(const luci::CircleLogicalAnd *node) final; + // loco::DataType visit(const luci::CircleLogicalNot *node) final; + // loco::DataType visit(const luci::CircleLogicalOr *node) final; + // loco::DataType visit(const luci::CircleLogistic *node) final; + // loco::DataType visit(const luci::CircleLogSoftmax *node) final; + // loco::DataType visit(const luci::CircleMatrixDiag *node) final; + // loco::DataType visit(const luci::CircleMatrixSetDiag *node) final; + // loco::DataType visit(const luci::CircleMaximum *node) final; + // loco::DataType visit(const luci::CircleMaxPool2D *node) final; + // loco::DataType visit(const luci::CircleMean *node) final; + // loco::DataType visit(const luci::CircleMinimum *node) final; + // loco::DataType visit(const luci::CircleMirrorPad *node) final; + // loco::DataType visit(const luci::CircleNeg *node) final; + // loco::DataType visit(const luci::CircleNonMaxSuppressionV4 *node) final; + // loco::DataType visit(const luci::CircleNonMaxSuppressionV5 *node) final; + // loco::DataType visit(const luci::CircleNotEqual *node) final; + // loco::DataType visit(const luci::CirclePack *node) final; + // loco::DataType visit(const luci::CirclePad *node) final; + // loco::DataType visit(const luci::CirclePadV2 *node) final; + // loco::DataType visit(const luci::CirclePow *node) final; + // loco::DataType visit(const luci::CirclePRelu *node) final; + // loco::DataType visit(const luci::CircleRange *node) final; + // loco::DataType visit(const luci::CircleRank *node) final; + // loco::DataType visit(const luci::CircleMul *node) final; + // loco::DataType visit(const luci::CircleOneHot *node) final; + // loco::DataType visit(const luci::CircleReduceAny *node) final; + // loco::DataType visit(const luci::CircleReduceMax *node) final; + // loco::DataType visit(const luci::CircleReduceMin *node) final; + // loco::DataType visit(const luci::CircleReduceProd *node) final; + // loco::DataType visit(const luci::CircleRelu *node) final; + // loco::DataType visit(const luci::CircleRelu6 *node) final; + // loco::DataType visit(const luci::CircleReluN1To1 *node) final; + // loco::DataType visit(const luci::CircleReshape *node) final; + // loco::DataType visit(const luci::CircleResizeBilinear *node) final; + // loco::DataType visit(const luci::CircleResizeNearestNeighbor *node) final; + // loco::DataType visit(const luci::CircleReverseSequence *node) final; + // loco::DataType visit(const luci::CircleReverseV2 *node) final; + // loco::DataType visit(const luci::CircleRound *node) final; + // loco::DataType visit(const luci::CircleRsqrt *node) final; + // loco::DataType visit(const luci::CircleScatterNd *node) final; + // loco::DataType visit(const luci::CircleSegmentSum *node) final; + // loco::DataType visit(const luci::CircleSelect *node) final; + // loco::DataType visit(const luci::CircleSelectV2 *node) final; + // loco::DataType visit(const luci::CircleShape *node) final; + // loco::DataType visit(const luci::CircleSin *node) final; + // loco::DataType visit(const luci::CircleSlice *node) final; + // loco::DataType visit(const luci::CircleSoftmax *node) final; + // loco::DataType visit(const luci::CircleSpaceToBatchND *node) final; + // loco::DataType visit(const luci::CircleSpaceToDepth *node) final; + // loco::DataType visit(const luci::CircleSparseToDense *node) final; + // loco::DataType visit(const 
luci::CircleSplit *node) final; + // loco::DataType visit(const luci::CircleSplitV *node) final; + // loco::DataType visit(const luci::CircleSqrt *node) final; + // loco::DataType visit(const luci::CircleSquare *node) final; + // loco::DataType visit(const luci::CircleSquaredDifference *node) final; + // loco::DataType visit(const luci::CircleSqueeze *node) final; + // loco::DataType visit(const luci::CircleStridedSlice *node) final; + // loco::DataType visit(const luci::CircleSub *node) final; + // loco::DataType visit(const luci::CircleSum *node) final; + // loco::DataType visit(const luci::CircleTanh *node) final; + // loco::DataType visit(const luci::CircleTile *node) final; + // loco::DataType visit(const luci::CircleTopKV2 *node) final; + // loco::DataType visit(const luci::CircleTranspose *node) final; + // loco::DataType visit(const luci::CircleTransposeConv *node) final; + // loco::DataType visit(const luci::CircleUnidirectionalSequenceLSTM *node) final; + // loco::DataType visit(const luci::CircleUnique *node) final; + // loco::DataType visit(const luci::CircleUnpack *node) final; + // loco::DataType visit(const luci::CircleWhere *node) final; + // loco::DataType visit(const luci::CircleWhile *node) final; + // loco::DataType visit(const luci::CircleZerosLike *node) final; + + // Circle Only + // loco::DataType visit(const luci::CircleBCQFullyConnected *node) final; + // loco::DataType visit(const luci::CircleBCQGather *node) final; + // loco::DataType visit(const luci::CircleInstanceNorm *node) final; + + // Virtual + // loco::DataType visit(const luci::CircleInput *node) final; + // loco::DataType visit(const luci::CircleOutput *node) final; + // loco::DataType visit(const luci::CircleOutputDummy *node) final; + // loco::DataType visit(const luci::CircleOutputExclude *node) final; + // loco::DataType visit(const luci::CircleCustomOut *node) final; + // loco::DataType visit(const luci::CircleIfOut *node) final; + // loco::DataType visit(const luci::CircleNonMaxSuppressionV4Out *node) final; + // loco::DataType visit(const luci::CircleNonMaxSuppressionV5Out *node) final; + // loco::DataType visit(const luci::CircleSplitOut *node) final; + // loco::DataType visit(const luci::CircleSplitVOut *node) final; + // loco::DataType visit(const luci::CircleTopKV2Out *node) final; + // loco::DataType visit(const luci::CircleUniqueOut *node) final; + // loco::DataType visit(const luci::CircleUnpackOut *node) final; + // loco::DataType visit(const luci::CircleWhileOut *node) final; +}; + +} // namespace tinf + } // namespace luci #endif // __LUCI_CIRCLE_TYPE_INFERENCE_H__ diff --git a/compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h b/compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h new file mode 100644 index 0000000..296f993 --- /dev/null +++ b/compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_TYPE_INFERENCE_HELPER_H__ +#define __LUCI_CIRCLE_TYPE_INFERENCE_HELPER_H__ + +#include + +#include + +namespace luci +{ +namespace tinf // Namespace for Type Inference +{ + +// Helper function will be added + +} // namespace tinf +} // namespace luci + +#endif // __LUCI_CIRCLE_TYPE_INFERENCE_HELPER_H__ diff --git a/compiler/luci/service/include/luci/Service/ShapeDescription.h b/compiler/luci/service/include/luci/Service/ShapeDescription.h index 949cce5..4d92be1 100644 --- a/compiler/luci/service/include/luci/Service/ShapeDescription.h +++ b/compiler/luci/service/include/luci/Service/ShapeDescription.h @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -33,6 +35,7 @@ struct ShapeDescription }; // TODO remove these when CircleDialect is fully functional +ShapeDescription to_shape_description(const luci::CircleNode *node); ShapeDescription to_shape_description(const loco::TensorShape &shape); ShapeDescription to_shape_description(const loco::FeatureShape &shape); ShapeDescription to_shape_description(const loco::FilterShape &shape); diff --git a/compiler/luci/service/src/CircleShapeInference.cpp b/compiler/luci/service/src/CircleShapeInference.cpp index 0732849..db8ffd8 100644 --- a/compiler/luci/service/src/CircleShapeInference.cpp +++ b/compiler/luci/service/src/CircleShapeInference.cpp @@ -20,7 +20,10 @@ #include #include +#include + #include +#include namespace luci { @@ -32,3 +35,60 @@ ShapeDescription ShapeInference::get(loco::Node *node) } } // namespace luci + +namespace +{ + +std::ostream &operator<<(std::ostream &os, const loco::TensorShape &tensor_shape) +{ + os << "["; + for (uint32_t r = 0; r < tensor_shape.rank(); ++r) + { + if (r) + os << ","; + os << tensor_shape.dim(r).value(); + } + os << "]"; + return os; +} + +bool inputs_shape_ready(const luci::CircleNode *node) +{ + for (uint32_t arity = 0; arity < node->arity(); ++arity) + { + auto node_input = loco::must_cast(node->arg(arity)); + if (node_input->shape_status() == luci::ShapeStatus::UNDEFINED) + return false; + } + + return true; +} + +} // namespace + +namespace luci +{ +namespace sinf +{ + +bool Rule::infer(const luci::CircleNode *circle_node, loco::TensorShape &shape) const +{ + LOGGER(l); + VERBOSE(l, 1) << "[CircleShapeInference] " << circle_node->name(); + VERBOSE(l, 1) << " before: " << circle_shape(circle_node); + + if (!inputs_shape_ready(circle_node)) + { + VERBOSE(l, 1) << " after: Some inputs are not ready for inference"; + return false; + } + + Algorithm alg; + shape = circle_node->accept(&alg); + VERBOSE(l, 1) << " after: " << shape; + + return true; +} + +} // namespace sinf +} // namespace luci diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp new file mode 100644 index 0000000..f7eb6c3 --- /dev/null +++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Service/CircleShapeInferenceHelper.h" + +namespace luci +{ +namespace sinf +{ + +loco::TensorShape circle_shape(const luci::CircleNode *node) +{ + loco::TensorShape shape; + shape.rank(node->rank()); + for (uint32_t r = 0; r < node->rank(); ++r) + shape.dim(r) = loco::Dimension(node->dim(r).value()); + return shape; +} + +} // namespace sinf +} // namespace luci diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp index a55f50b..38ff619 100644 --- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp +++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp @@ -102,7 +102,7 @@ private: }; /** - * @breif Expand shape x and y to same rank by align right and filling with 1 + * @brief Expand shape x and y to same rank by align right and filling with 1 */ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) { @@ -122,7 +122,7 @@ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) } /** - * @breif Returns shape of expanded dimension of input x and y having same rank + * @brief Returns shape of expanded dimension of input x and y having same rank */ loco::TensorShape expand_dimension(const loco::TensorShape &x, const loco::TensorShape &y) { diff --git a/compiler/luci/service/src/CircleShapeSignatureInferenceRule.cpp b/compiler/luci/service/src/CircleShapeSignatureInference.cpp similarity index 83% rename from compiler/luci/service/src/CircleShapeSignatureInferenceRule.cpp rename to compiler/luci/service/src/CircleShapeSignatureInference.cpp index dc7df3e..1ccaa19 100644 --- a/compiler/luci/service/src/CircleShapeSignatureInferenceRule.cpp +++ b/compiler/luci/service/src/CircleShapeSignatureInference.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "luci/Service/CircleShapeSignatureInferenceRule.h" +#include "luci/Service/CircleShapeSignatureInference.h" #include @@ -39,14 +39,16 @@ std::ostream &operator<<(std::ostream &os, const luci::ShapeSignature &shape_sig namespace luci { -bool CircleShapeSignatureInferenceRule::infer(const luci::CircleNode *circle_node, - ShapeSignature &shape_signature) const +namespace ssinf +{ + +bool Rule::infer(const luci::CircleNode *circle_node, ShapeSignature &shape_signature) const { LOGGER(l); // There is nothing to check before ShapeSignatureInference. - ShapeSignatureInferenceAlgorithm alg; + Algorithm alg; shape_signature = circle_node->accept(&alg); @@ -57,4 +59,6 @@ bool CircleShapeSignatureInferenceRule::infer(const luci::CircleNode *circle_nod return true; } +} // namespace ssinf + } // namespace luci diff --git a/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp new file mode 100644 index 0000000..d7d1a24 --- /dev/null +++ b/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Service/CircleShapeSignatureInferenceHelper.h" + +#include + +#include + +#include + +namespace luci +{ + +namespace ssinf +{ + +luci::ShapeSignature legalized_signature(const luci::ShapeSignature &signature) +{ + // If shape signature has at least one -1, it is not static. + for (uint32_t i = 0; i < signature.rank(); ++i) + if (signature.dim(i) == -1) + return signature; + + // If all dimensions are static, return empty shape signature. + return luci::ShapeSignature(); +} + +ShapeSignature reduced_signature(const loco::Node *node, const loco::Node *indices, bool keep_dims) +{ + LOGGER(l); + + ShapeSignature input_signature; + ShapeSignature output_signature; + + auto circle_node = loco::must_cast(node); + if (circle_node->shape_signature().rank() > 0) + input_signature = circle_node->shape_signature(); + else + { + input_signature.rank(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + input_signature.dim(i) = circle_node->dim(i).value(); + } + + // If input rank is 0, it means that one of the following cases has occurred. + // - Input is scalar : result is always scalar + // - Input shape signature has not been inferred : cannot infer output shape signature + // Therefore, when input signature rank is 0, always return empty signature. + if (input_signature.rank() == 0) + return output_signature; + + // When reduction_indices is not constant + auto reduction_indices = dynamic_cast(indices); + if (reduction_indices == nullptr) + { + if (keep_dims) + { + // If keep_dims is true, rank is not changed. + output_signature.rank(input_signature.rank()); + for (uint32_t i = 0; i < output_signature.rank(); ++i) + output_signature.dim(i) = -1; + } + else + { + // There is no way to infer the signature for this case. + // Do nothing so that the empty signature is returned.
+ INFO(l) << "[CircleShapeSignatureInferenceHelper] " << circle_node->name() << std::endl; + INFO(l) << " reduced_signature : cannot infer because of non-constant node" << std::endl; + } + + return output_signature; + } + + std::vector reduction_values; + if (reduction_indices->dtype() == loco::DataType::S32) + { + auto reduction_size = reduction_indices->size(); + for (uint32_t i = 0; i < reduction_size; ++i) + { + int32_t axis = reduction_indices->at(i); + if (axis < 0) + axis += input_signature.rank(); + + if (!(0 <= axis && axis < static_cast(input_signature.rank()))) + INTERNAL_EXN_V("Invalid reduction axis for REDUCER", oops::to_uint32(axis)); + + reduction_values.push_back(axis); + } + } + else if (reduction_indices->dtype() == loco::DataType::S64) + { + auto reduction_size = reduction_indices->size(); + for (uint32_t i = 0; i < reduction_size; ++i) + { + int32_t axis = static_cast(reduction_indices->at(i)); + if (axis < 0) + axis += input_signature.rank(); + + if (!(0 <= axis && axis < static_cast(input_signature.rank()))) + INTERNAL_EXN_V("Invalid reduction axis for REDUCER", oops::to_uint32(axis)); + + reduction_values.push_back(axis); + } + } + else + { + INTERNAL_EXN("Wrong reduction axis type, Only INT32, INT64 supported."); + } + + if (keep_dims) + { + output_signature.rank(input_signature.rank()); + for (uint32_t i = 0; i < input_signature.rank(); ++i) + output_signature.dim(i) = input_signature.dim(i); + for (uint32_t i = 0; i < reduction_values.size(); ++i) + output_signature.dim(reduction_values.at(i)) = 1; + } + else + { + std::vector check_reduce(input_signature.rank(), false); + for (uint32_t i = 0; i < reduction_values.size(); ++i) + check_reduce.at(reduction_values.at(i)) = true; + + uint32_t reduce_cnt = 0; + for (uint32_t i = 0; i < check_reduce.size(); ++i) + if (check_reduce.at(i)) + ++reduce_cnt; + + output_signature.rank(input_signature.rank() - reduce_cnt); + for (uint32_t i = 0, j = 0; i < check_reduce.size(); ++i) + if (check_reduce.at(i) == false) + output_signature.dim(j++) = input_signature.dim(i); + } + + return output_signature; +} + +ShapeSignature input_arg_signature(const luci::CircleNode *node, uint32_t index) +{ + auto circle_input = loco::must_cast(node->arg(index)); + return circle_input->shape_signature(); +} + +} // namespace ssinf + +} // namespace luci diff --git a/compiler/luci/service/src/CircleTypeInference.cpp b/compiler/luci/service/src/CircleTypeInference.cpp index aa8524a..b4755b5 100644 --- a/compiler/luci/service/src/CircleTypeInference.cpp +++ b/compiler/luci/service/src/CircleTypeInference.cpp @@ -16,6 +16,8 @@ #include "luci/Service/CircleTypeInference.h" +#include + #include #include @@ -70,3 +72,47 @@ circle::TensorType TypeInference::get(loco::Node *node) } } // namespace luci + +namespace +{ + +bool inputs_dtype_ready(const luci::CircleNode *node) +{ + for (uint32_t arity = 0; arity < node->arity(); ++arity) + { + if (node->dtype() == loco::DataType::Unknown) + return false; + } + + return true; +} + +} // namespace + +namespace luci +{ +namespace tinf +{ + +bool Rule::infer(const luci::CircleNode *circle_node, loco::DataType &dtype) const +{ + LOGGER(l); + VERBOSE(l, 1) << "[CircleTypeInference] " << circle_node->name(); + VERBOSE(l, 1) << " before: " << static_cast(circle_node->dtype()); + + if (!inputs_dtype_ready(circle_node)) + { + VERBOSE(l, 1) << " after: Some inputs are not ready for inference"; + return false; + } + + Algorithm alg; + dtype = circle_node->accept(&alg); + + VERBOSE(l, 1) << " after: " << 
static_cast(dtype); + + return true; +} + +} // namespace tinf +} // namespace luci diff --git a/compiler/luci/service/src/CircleTypeInferenceHelper.cpp b/compiler/luci/service/src/CircleTypeInferenceHelper.cpp new file mode 100644 index 0000000..75cd9f7 --- /dev/null +++ b/compiler/luci/service/src/CircleTypeInferenceHelper.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Service/CircleTypeInferenceHelper.h" + +namespace luci +{ +namespace tinf +{ + +// Helper function will be added + +} // namespace tinf +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleInput.cpp b/compiler/luci/service/src/Nodes/CircleInput.cpp new file mode 100644 index 0000000..24eab7b --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleInput.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleInput *node) +{ + return node->shape_signature(); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleMean.cpp b/compiler/luci/service/src/Nodes/CircleMean.cpp new file mode 100644 index 0000000..a787136 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleMean.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleMean *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleOutput.cpp b/compiler/luci/service/src/Nodes/CircleOutput.cpp new file mode 100644 index 0000000..d4c8da2 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleOutput.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutput *node) +{ + return input_arg_signature(node, 0); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp b/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp new file mode 100644 index 0000000..e0f13c4 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutputDummy *) { return ShapeSignature(); } + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp b/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp new file mode 100644 index 0000000..75bbbb3 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutputExclude *) +{ + return ShapeSignature(); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleReduceAny.cpp b/compiler/luci/service/src/Nodes/CircleReduceAny.cpp new file mode 100644 index 0000000..27da814 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleReduceAny.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceAny *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleReduceMax.cpp b/compiler/luci/service/src/Nodes/CircleReduceMax.cpp new file mode 100644 index 0000000..48d9cb9 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleReduceMax.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceMax *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleReduceMin.cpp b/compiler/luci/service/src/Nodes/CircleReduceMin.cpp new file mode 100644 index 0000000..9a99971 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleReduceMin.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceMin *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleReduceProd.cpp b/compiler/luci/service/src/Nodes/CircleReduceProd.cpp new file mode 100644 index 0000000..a9d381a --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleReduceProd.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceProd *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRelu.cpp b/compiler/luci/service/src/Nodes/CircleRelu.cpp new file mode 100644 index 0000000..a7a7f6f --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleRelu.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleRelu *node) +{ + return input_arg_signature(node, 0); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRelu6.cpp b/compiler/luci/service/src/Nodes/CircleRelu6.cpp new file mode 100644 index 0000000..92a596d --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleRelu6.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleRelu6 *node) +{ + return input_arg_signature(node, 0); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp b/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp new file mode 100644 index 0000000..1e8d997 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleReluN1To1 *node) +{ + return input_arg_signature(node, 0); +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleSum.cpp b/compiler/luci/service/src/Nodes/CircleSum.cpp new file mode 100644 index 0000000..9ef90e8 --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleSum.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace luci +{ + +ShapeSignature ssinf::Algorithm::visit(const luci::CircleSum *node) +{ + return legalized_signature( + reduced_signature(node->input(), node->reduction_indices(), node->keep_dims())); +} + +} // namespace luci diff --git a/compiler/luci/service/src/ShapeDescription.cpp b/compiler/luci/service/src/ShapeDescription.cpp index cbc302f..01a638f 100644 --- a/compiler/luci/service/src/ShapeDescription.cpp +++ b/compiler/luci/service/src/ShapeDescription.cpp @@ -23,6 +23,19 @@ namespace luci { +ShapeDescription to_shape_description(const luci::CircleNode *circle_node) +{ + ShapeDescription res; + + res._rank_known = true; + + res._dims.resize(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + res._dims.at(i) = circle_node->dim(i).value(); + + return res; +} + ShapeDescription to_shape_description(const loco::TensorShape &shape) { ShapeDescription res; diff --git a/compiler/luci/service/src/Validate.cpp b/compiler/luci/service/src/Validate.cpp index d224fd1..3f732b6 100644 --- a/compiler/luci/service/src/Validate.cpp +++ b/compiler/luci/service/src/Validate.cpp @@ -42,6 +42,19 @@ std::ostream &operator<<(std::ostream &os, const loco::TensorShape &tensor_shape return os; } +std::ostream &operator<<(std::ostream &os, const luci::CircleNode *circle_node) +{ + os << "["; + for (uint32_t r = 0; r < circle_node->rank(); ++r) + { + if (r) + os << ","; + os << circle_node->dim(r).value(); + } + os << "]"; + return os; +} + /** * @brief returns a node that is CircleOutput with index is out_index in nodes */ @@ -80,23 +93,28 @@ bool validate_shape_dtype(loco::Graph *g) if (dynamic_cast(circle_node)) continue; - assert(loco::shape_known(circle_node)); + assert(circle_node->shape_status() != luci::ShapeStatus::UNDEFINED); // check if output node shape is same as graph output shape - auto co_tensor_shape = loco::shape_get(circle_node).as(); auto go_tensor_shape = graph_out->shape(); assert(go_tensor_shape); - if (!(co_tensor_shape == *go_tensor_shape)) + + bool is_shape_valid = (circle_node->rank() == go_tensor_shape->rank()); + for (uint32_t i = 0; is_shape_valid && i < circle_node->rank(); ++i) + if (circle_node->dim(i).value() != go_tensor_shape->dim(i).value()) + is_shape_valid = false; + + if (is_shape_valid == false) { INFO(l) << "[luci] Shape for output #" << out_index << " not same " << std::endl; - INFO(l) << "[luci] " << circle_node->name() << " " << co_tensor_shape << " vs " + INFO(l) << "[luci] " << circle_node->name() << " " << circle_node << " vs " << *go_tensor_shape << std::endl; return false; } // check if data type match - assert(loco::dtype_known(circle_node)); - if (graph_out->dtype() != loco::dtype_get(circle_node)) + assert(circle_node->dtype() != loco::DataType::Unknown); + if (graph_out->dtype() != circle_node->dtype()) { INFO(l) << "[luci] Type for output #" << out_index << " not same " << std::endl; return false; @@ -106,6 +124,55 @@ bool validate_shape_dtype(loco::Graph *g) return true; } +bool validate_shape_signature(loco::Graph *g) +{ + LOGGER(l); + + for (auto node : loco::postorder_traversal(loco::output_nodes(g))) + { + auto circle_node = loco::must_cast(node); + const auto shape_signature = circle_node->shape_signature(); + + if (shape_signature.rank() == 0) + continue; + + // Rank of shape and shape signature should be same + if (circle_node->rank() != shape_signature.rank()) + { + INFO(l) << "[luci] Rank of shape signature for " << circle_node->name() << " do not match" + << std::endl; + return false; + } + + 
bool has_unknown = false; + + // If shape signature is not -1, dimension value should be the same + for (uint32_t d = 0; d < shape_signature.rank(); ++d) + { + if (shape_signature.dim(d) != -1 && + shape_signature.dim(d) != (int32_t)(circle_node->dim(d).value())) + { + INFO(l) << "[luci] Dimension " << d << " of shape signature for " << circle_node->name() + << " does not match" << std::endl; + return false; + } + + if (shape_signature.dim(d) == -1) + has_unknown = true; + } + + // Shape signature should have at least one -1 value. + if (!has_unknown) + { + INFO(l) << "[luci] Shape signature in " << circle_node->name() + << " does not have an unknown dimension" << std::endl; + return false; + } + } + + return true; +} + } // namespace namespace luci @@ -119,6 +186,9 @@ bool validate(loco::Graph *g) if (!validate_shape_dtype(g)) return false; + if (!validate_shape_signature(g)) + return false; + // TODO add more validation return true; diff --git a/compiler/luci/tester/src/ReadTester.cpp b/compiler/luci/tester/src/ReadTester.cpp index a1aead1..f270a23 100644 --- a/compiler/luci/tester/src/ReadTester.cpp +++ b/compiler/luci/tester/src/ReadTester.cpp @@ -21,6 +21,9 @@ #include #include +// Following passes will be removed after refactoring is finished +#include + #include #include #include @@ -95,6 +98,12 @@ int entry(int argc, char **argv) while (pass.run(graph) == true) ; } + { + // This pass will be removed after refactoring is finished + luci::MigrateLegacyShapeDtypePass pass; + while (pass.run(graph) == true) + ; + } if (!luci::validate(graph)) return 255; diff --git a/compiler/luci/tester/src/WriteTester.cpp b/compiler/luci/tester/src/WriteTester.cpp index aa7085c..9a6e8de 100644 --- a/compiler/luci/tester/src/WriteTester.cpp +++ b/compiler/luci/tester/src/WriteTester.cpp @@ -23,6 +23,9 @@ #include #include +// Following passes will be removed after refactoring is finished +#include + #include #include #include @@ -139,6 +142,12 @@ int entry(int argc, char **argv) while (pass.run(graph) == true) ; } + { + // This pass will be removed after refactoring is finished + luci::MigrateLegacyShapeDtypePass pass; + while (pass.run(graph) == true) + ; + } if (!luci::validate(graph)) return 255; diff --git a/compiler/moco/support/src/TFShapeInferenceHelper.cpp b/compiler/moco/support/src/TFShapeInferenceHelper.cpp index 13e514a..605fb9c 100644 --- a/compiler/moco/support/src/TFShapeInferenceHelper.cpp +++ b/compiler/moco/support/src/TFShapeInferenceHelper.cpp @@ -66,7 +66,7 @@ private: }; /** - * @breif Expand shape x and y to same rank by align right and filling with 1 + * @brief Expand shape x and y to same rank by align right and filling with 1 */ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) { @@ -86,7 +86,7 @@ void expand_rank(loco::TensorShape &x, loco::TensorShape &y) } /** - * @breif Returns shape of expanded dimension of input x and y having same rank + * @brief Returns shape of expanded dimension of input x and y having same rank */ loco::TensorShape expand_dimension(const loco::TensorShape &x, const loco::TensorShape &y) { diff --git a/compiler/nnc/include/Definitions.h.in b/compiler/nnc/include/Definitions.h.in index 070cdd2..bd86429 100644 --- a/compiler/nnc/include/Definitions.h.in +++ b/compiler/nnc/include/Definitions.h.in @@ -7,12 +7,12 @@ */ /** - * @breif absolute path to installation directory of *nnc* project + * @brief absolute path to installation directory of *nnc* project */ #define NNC_ROOT_PATH "@NNC_INSTALL_PATH@" /** - * @breif absolute path to directory contains
libraries + * @brief absolute path to directory contains libraries */ #define NNC_LIB_PATH "@NNC_INSTALL_LIB_PATH@" diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt index 62a4978..d4e3269 100644 --- a/compiler/one-cmds/how-to-use-one-commands.txt +++ b/compiler/one-cmds/how-to-use-one-commands.txt @@ -161,6 +161,7 @@ Current transformation options are - make_batchnorm_gamma_positive: This makes negative gamma of batch normalization into a small positive value (1e-10). Note that this pass can change the execution result of the model. So, use it only when the impact is known to be acceptable. +- replace_cw_mul_add_with_depthwise_conv: This will replace channel-wise Mul/Add with DepthwiseConv2D. - resolve_customop_add: This will convert Custom(Add) to normal Add operator - resolve_customop_batchmatmul: This will convert Custom(BatchMatMul) to normal BatchMatMul operator diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen index f2d8230..fbe3d52 100644 --- a/compiler/one-cmds/one-codegen +++ b/compiler/one-cmds/one-codegen @@ -87,24 +87,19 @@ def main(): # verify arguments _verify_arg(parser, args) - # get file path to log + # make a command to run given backend driver dir_path = os.path.dirname(os.path.realpath(__file__)) - logfile_path = os.path.realpath(args.output_path) + '.log' - - with open(logfile_path, 'wb') as f: - # make a command to run given backend driver - codegen_path = os.path.join(dir_path, getattr(args, 'backend') + '-compile') - codegen_cmd = [codegen_path] + unknown_args - - f.write((' '.join(codegen_cmd) + '\n').encode()) - - # run backend driver - with subprocess.Popen( - codegen_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - bufsize=1) as p: - for line in p.stdout: - sys.stdout.buffer.write(line) - f.write(line) + codegen_path = os.path.join(dir_path, getattr(args, 'backend') + '-compile') + codegen_cmd = [codegen_path] + unknown_args + if _utils._is_valid_attr(args, 'command'): + codegen_cmd += getattr(args, 'command').split() + + # run backend driver + with subprocess.Popen( + codegen_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + bufsize=1) as p: + for line in p.stdout: + sys.stdout.buffer.write(line) if __name__ == '__main__': diff --git a/compiler/one-cmds/one-import-bcq b/compiler/one-cmds/one-import-bcq index 5ea1f57..50f5879 100644 --- a/compiler/one-cmds/one-import-bcq +++ b/compiler/one-cmds/one-import-bcq @@ -43,13 +43,13 @@ def _get_parser(): converter_version.add_argument( '--v1', action='store_const', - dest='converter_version', + dest='converter_version_cmd', const='--v1', help='use TensorFlow Lite Converter 1.x') converter_version.add_argument( '--v2', action='store_const', - dest='converter_version', + dest='converter_version_cmd', const='--v2', help='use TensorFlow Lite Converter 2.x') diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf index 49009d3..3a7c69a 100644 --- a/compiler/one-cmds/one-import-tf +++ b/compiler/one-cmds/one-import-tf @@ -52,8 +52,6 @@ def _get_parser(): const='--v2', help='use TensorFlow Lite Converter 2.x') - #converter_version.set_defaults(converter_version='--v1') - parser.add_argument('--converter_version', type=str, help=argparse.SUPPRESS) # input model format diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize index 4c5f109..f03bb8d 100644 --- a/compiler/one-cmds/one-optimize +++ b/compiler/one-cmds/one-optimize @@ -73,6 +73,10 @@ def _get_parser(): 
circle2circle_group.add_argument( '--fuse_instnorm', action='store_true', help='fuse ops to InstanceNorm operator') circle2circle_group.add_argument( + '--replace_cw_mul_add_with_depthwise_conv', + action='store_true', + help='replace channel-wise Mul/Add with DepthwiseConv2D') + circle2circle_group.add_argument( '--resolve_customop_add', action='store_true', help='convert Custom(Add) op to Add op') diff --git a/compiler/one-cmds/tests/one-build_001.cfg b/compiler/one-cmds/tests/one-build_001.cfg index 8524bbd..b022ba7 100644 --- a/compiler/one-cmds/tests/one-build_001.cfg +++ b/compiler/one-cmds/tests/one-build_001.cfg @@ -13,7 +13,7 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 [one-optimize] input_path=inception_v3.circle diff --git a/compiler/one-cmds/tests/one-build_002.cfg b/compiler/one-cmds/tests/one-build_002.cfg index 1830776..bbf0915 100644 --- a/compiler/one-cmds/tests/one-build_002.cfg +++ b/compiler/one-cmds/tests/one-build_002.cfg @@ -13,7 +13,7 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 [one-optimize] input_path=inception_v3.circle diff --git a/compiler/one-cmds/tests/one-build_neg_002.cfg b/compiler/one-cmds/tests/one-build_neg_002.cfg index 360c601..99db966 100644 --- a/compiler/one-cmds/tests/one-build_neg_002.cfg +++ b/compiler/one-cmds/tests/one-build_neg_002.cfg @@ -13,7 +13,7 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 [one-optimize] input_path=inception_v3.circle diff --git a/compiler/one-cmds/tests/one-build_neg_003.cfg b/compiler/one-cmds/tests/one-build_neg_003.cfg index 91e7875..fa027cb 100644 --- a/compiler/one-cmds/tests/one-build_neg_003.cfg +++ b/compiler/one-cmds/tests/one-build_neg_003.cfg @@ -4,7 +4,7 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 [one-optimize] input_path=inception_v3.circle diff --git a/compiler/one-cmds/tests/one-build_neg_004.cfg b/compiler/one-cmds/tests/one-build_neg_004.cfg index 4d312c4..571077b 100644 --- a/compiler/one-cmds/tests/one-build_neg_004.cfg +++ b/compiler/one-cmds/tests/one-build_neg_004.cfg @@ -13,7 +13,7 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 [one-optimize] input_path=inception_v3.circle diff --git a/compiler/one-cmds/tests/one-import_002.cfg b/compiler/one-cmds/tests/one-import_002.cfg index 9a90abe..8d6ae2c 100644 --- a/compiler/one-cmds/tests/one-import_002.cfg +++ b/compiler/one-cmds/tests/one-import_002.cfg @@ -13,4 +13,4 @@ output_path=inception_v3.circle input_arrays=input input_shapes=1,299,299,3 output_arrays=InceptionV3/Predictions/Reshape_1 -v2=True +converter_version=v2 diff --git a/compiler/one-cmds/tests/one-import_003.cfg b/compiler/one-cmds/tests/one-import_003.cfg new file mode 100644 index 0000000..b679ebd --- /dev/null +++ b/compiler/one-cmds/tests/one-import_003.cfg @@ -0,0 +1,13 @@ +[one-build] +one-import-tf=True +one-import-tflite=False +one-import-bcq=False +one-optimize=False +one-quantize=False +one-pack=False +one-codegen=False + +[one-import-tf] +model_format=saved_model +input_path=test_saved_model 
+output_path=test_saved_model.circle diff --git a/compiler/one-cmds/tests/one-import_003.test b/compiler/one-cmds/tests/one-import_003.test new file mode 100644 index 0000000..6093f14 --- /dev/null +++ b/compiler/one-cmds/tests/one-import_003.test @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import of TF 2.x saved model + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +trap_err_onexit() +{ + echo "${filename_ext} FAILED" + exit 255 +} + +trap trap_err_onexit ERR + +configfile="one-import_003.cfg" +outputfile="test_saved_model.circle" + +rm -f ${outputfile} + +# run test +one-import tf -C ${configfile} > /dev/null + +if [[ ! -s "${outputfile}" ]]; then + trap_err_onexit +fi + +echo "${filename_ext} SUCCESS" diff --git a/compiler/one-cmds/tests/one-import_004.cfg b/compiler/one-cmds/tests/one-import_004.cfg new file mode 100644 index 0000000..d28c8df --- /dev/null +++ b/compiler/one-cmds/tests/one-import_004.cfg @@ -0,0 +1,13 @@ +[one-build] +one-import-tf=True +one-import-tflite=False +one-import-bcq=False +one-optimize=False +one-quantize=False +one-pack=False +one-codegen=False + +[one-import-tf] +model_format=keras_model +input_path=test_keras_model.h5 +output_path=test_keras_model.circle diff --git a/compiler/one-cmds/tests/one-import_004.test b/compiler/one-cmds/tests/one-import_004.test new file mode 100644 index 0000000..9d10c43 --- /dev/null +++ b/compiler/one-cmds/tests/one-import_004.test @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import of TF 2.x keras model + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +trap_err_onexit() +{ + echo "${filename_ext} FAILED" + exit 255 +} + +trap trap_err_onexit ERR + +configfile="one-import_004.cfg" +outputfile="test_keras_model.circle" + +rm -f ${outputfile} + +# run test +one-import tf -C ${configfile} > /dev/null + +if [[ ! -s "${outputfile}" ]]; then + trap_err_onexit +fi + +echo "${filename_ext} SUCCESS" diff --git a/compiler/one-cmds/tests/prepare_test_materials.sh b/compiler/one-cmds/tests/prepare_test_materials.sh index cb1067e..bc3d65d 100644 --- a/compiler/one-cmds/tests/prepare_test_materials.sh +++ b/compiler/one-cmds/tests/prepare_test_materials.sh @@ -63,6 +63,20 @@ if [[ ! -s "inception_v3_test_data.h5" ]]; then --output_path inception_v3_test_data.h5 fi +if [[ ! 
-d "test_saved_model" ]]; then + rm -rf test_saved_model.zip + wget https://github.com/Samsung/ONE/files/5516226/test_saved_model.zip + unzip test_saved_model.zip + # https://github.com/Samsung/ONE/issues/4268#issuecomment-724578237 +fi + +if [[ ! -s "test_keras_model.h5" ]]; then + rm -rf test_keras_model.zip + wget https://github.com/Samsung/ONE/files/5520777/test_keras_model.zip + unzip test_keras_model.zip + # https://github.com/Samsung/ONE/issues/4268#issuecomment-725025805 +fi + # prepare 'inception_v3.circle' file used for quantization test inputfile="./inception_v3.pb" outputfile="./inception_v3.circle" diff --git a/compiler/oops/include/oops/InternalExn.h b/compiler/oops/include/oops/InternalExn.h index 0e11085..e14332b 100644 --- a/compiler/oops/include/oops/InternalExn.h +++ b/compiler/oops/include/oops/InternalExn.h @@ -40,20 +40,20 @@ class InternalExn : public std::exception { public: InternalExn(const char *filename, const int line, const std::string &msg) - : _filename(filename), _line(line), _msg(msg) + : _filename(filename), _line(to_uint32(line)), _msg(msg) { construct_full_msg(); } explicit InternalExn(const char *filename, const int line, const std::string &msg, uint32_t val) - : _filename(filename), _line(line), _msg(msg + ": " + std::to_string(val)) + : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + std::to_string(val)) { construct_full_msg(); } explicit InternalExn(const char *filename, const int line, const std::string &msg, const std::string &val) - : _filename(filename), _line(line), _msg(msg + ": " + val) + : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + val) { construct_full_msg(); } diff --git a/compiler/pota-quantization-value-test/CMakeLists.txt b/compiler/pota-quantization-value-test/CMakeLists.txt index 73b9ead..80661e5 100644 --- a/compiler/pota-quantization-value-test/CMakeLists.txt +++ b/compiler/pota-quantization-value-test/CMakeLists.txt @@ -1,6 +1,12 @@ unset(QUANTIZATION_VALUE_TEST) unset(QUANTIZATION_VALUE_TEST_WITH_PARAM) +nnas_find_package(FlatBuffers QUIET) +if(NOT FlatBuffers_FOUND) + message(STATUS "Build pota-quantization-value-test: FAILED (missing FlatBuffers)") + return() +endif(NOT FlatBuffers_FOUND) + macro(addTest NAME GRANULARITY DTYPE) list(APPEND QUANTIZATION_VALUE_TEST ${NAME}) list(APPEND QUANTIZATION_VALUE_TEST_WITH_PARAM ${NAME} ${GRANULARITY} ${DTYPE}) @@ -14,8 +20,12 @@ include("test.local.lst" OPTIONAL) unset(TEST_DEPS) get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR) +get_target_property(SCHEMA_BIN_PATH mio_circle BINARY_DIR) + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/gen_h5_explicit_inputs.py" + "${CMAKE_CURRENT_BINARY_DIR}/gen_h5_explicit_inputs.py" COPYONLY) -set(VIRTUALENV "${NNCC_OVERLAY_DIR}/venv_1_13_2") +set(VIRTUALENV "${NNCC_OVERLAY_DIR}/venv_2_3_0") ### ### Generate test.config @@ -35,7 +45,21 @@ add_custom_command( COMMENT "Generate test configuration" ) -list(APPEND TEST_DEPS "${TEST_CONFIG}") +### +### Generate python interface for circle schema +### +set(CIRCLE_SCHEMA_PYTHON_DIR "${CMAKE_CURRENT_BINARY_DIR}/circle") + +add_custom_command( + OUTPUT ${CIRCLE_SCHEMA_PYTHON_DIR} + COMMAND ${CMAKE_COMMAND} -E remove_directory "${CIRCLE_SCHEMA_PYTHON_DIR}" + COMMAND "$" --python + -o "${CMAKE_CURRENT_BINARY_DIR}" "${SCHEMA_BIN_PATH}/schema.fbs" + DEPENDS flatbuffers::flatc + COMMENT "Generate python interface for circle schema" +) + +list(APPEND TEST_DEPS "${TEST_CONFIG}" "${CIRCLE_SCHEMA_PYTHON_DIR}") # This enforces CMake to generate all the dependencies 
during "build" phase add_custom_target(pota_quantization_value_test_deps ALL DEPENDS ${TEST_DEPS}) diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/beta.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/beta.json new file mode 100644 index 0000000..fa2cdae --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/beta.json @@ -0,0 +1,20 @@ +{ + "weights": [ + 1, + 0, + 1, + 1 + ], + "scale": [ + 0.7023000121116638, + 0.3091999888420105, + 0.7552000284194946, + 0.2728999853134155 + ], + "zero_point": [ + 0, + 1, + 0, + 0 + ] +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/gamma.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/gamma.json new file mode 100644 index 0000000..393a44a --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/gamma.json @@ -0,0 +1,20 @@ +{ + "weights": [ + 1, + 0, + 1, + 0 + ], + "scale": [ + 0.012299999594688416, + 0.33239999413490295, + 0.23240000009536743, + 3.3359999656677246 + ], + "zero_point": [ + 0, + 1, + 0, + 1 + ] +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ifm.json new file mode 100644 index 0000000..94c4e0f --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ifm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.003919127397239208, + "zero_point": 0.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ofm.json new file mode 100644 index 0000000..27a1c85 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/quantization/ofm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.051219820976257324, + "zero_point": 104.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ifm.json new file mode 100644 index 0000000..910e855 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ifm.json @@ -0,0 +1,4 @@ +{ + "min": 0.006417479291558266, + "max": 0.9993774032592774 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ofm.json new file mode 100644 index 0000000..190da30 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/channel/uint8/record_minmax/ofm.json @@ -0,0 +1,4 @@ +{ + "min": -5.316554107666015, + "max": 7.744499607086182 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/beta.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/beta.json new file mode 100644 index 0000000..9dcefd5 --- /dev/null +++ 
b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/beta.json @@ -0,0 +1,10 @@ +{ + "weights": [ + 242, + 0, + 255, + 139 + ], + "scale": 0.004174117464572191, + "zero_point": 74.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/gamma.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/gamma.json new file mode 100644 index 0000000..6d85a1e --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/gamma.json @@ -0,0 +1,10 @@ +{ + "weights": [ + 239, + 214, + 255, + 0 + ], + "scale": 0.013993725180625916, + "zero_point": 238.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ifm.json new file mode 100644 index 0000000..df3df56 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ifm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.003914226312190294, + "zero_point": 0.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ofm.json new file mode 100644 index 0000000..098816a --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/quantization/ofm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.04870154336094856, + "zero_point": 122.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ifm.json new file mode 100644 index 0000000..d2e7923 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ifm.json @@ -0,0 +1,4 @@ +{ + "min": 0.011221568882465362, + "max": 0.9981276893615723 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ofm.json new file mode 100644 index 0000000..b4ea586 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/InstanceNorm_001/layer/uint8/record_minmax/ofm.json @@ -0,0 +1,4 @@ +{ + "min": -5.94246238708496, + "max": 6.4764308166503906 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json index 5f6db8d..6f99899 100644 --- a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json @@ -2,12 +2,20 @@ "weights": [ [ [ - 6553, - 19660, - 32767 + 1, + 1, + 1 ] ] ], - "scale": 1.5259254723787308e-05, - "zero_point": 0.0 + "scale": [ + 0.10000000149011612, + 0.30000001192092896, + 0.5 + ], + "zero_point": [ + 0, + 0, + 0 + ] } diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json 
b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json index e75377c..7d1f4c7 100644 --- a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json @@ -1,4 +1,4 @@ { - "scale": 0.0001509107678430155, + "scale": 0.00015214986342471093, "zero_point": 0.0 } diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json index e4a89e2..533c1e3 100644 --- a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json @@ -1,4 +1,4 @@ { - "scale": 0.00015084103506524116, + "scale": 0.00015159364556893706, "zero_point": 0.0 } diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json index a34d48c..edbbff9 100644 --- a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json @@ -1,4 +1,4 @@ { - "min": -4.944893226623535, - "max": 4.942608108520508 + "min": -4.985494499206543, + "max": 4.967269058227539 } diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json index 640397c..954d5ef 100644 --- a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json @@ -1,4 +1,4 @@ { - "min": -2.451441249847412, - "max": 4.942608108520508 + "min": -2.4895002365112306, + "max": 4.967269058227539 } diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/alpha.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/alpha.json new file mode 100644 index 0000000..6f99899 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/alpha.json @@ -0,0 +1,21 @@ +{ + "weights": [ + [ + [ + 1, + 1, + 1 + ] + ] + ], + "scale": [ + 0.10000000149011612, + 0.30000001192092896, + 0.5 + ], + "zero_point": [ + 0, + 0, + 0 + ] +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ifm.json new file mode 100644 index 0000000..d661df3 --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ifm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.03893596678972244, + "zero_point": 128.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ofm.json new file mode 100644 index 0000000..6dfffd5 --- /dev/null +++ 
b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/quantization/ofm.json @@ -0,0 +1,4 @@ +{ + "scale": 0.029139429330825806, + "zero_point": 85.0 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ifm.json new file mode 100644 index 0000000..8de6b3d --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ifm.json @@ -0,0 +1,4 @@ +{ + "min": -4.977406520843505, + "max": 4.951265411376953 +} diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ofm.json new file mode 100644 index 0000000..c88f6ca --- /dev/null +++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/uint8/record_minmax/ofm.json @@ -0,0 +1,4 @@ +{ + "min": -2.4792890548706055, + "max": 4.951265411376953 +} diff --git a/compiler/pota-quantization-value-test/gen_h5_explicit_inputs.py b/compiler/pota-quantization-value-test/gen_h5_explicit_inputs.py index 9863c80..a00cbeb 100755 --- a/compiler/pota-quantization-value-test/gen_h5_explicit_inputs.py +++ b/compiler/pota-quantization-value-test/gen_h5_explicit_inputs.py @@ -1,16 +1,17 @@ #!/usr/bin/env python3 import h5py as h5 import numpy as np -import tensorflow as tf +from circle.Model import Model +from circle.TensorType import TensorType import argparse import glob # -# This script generates a pack of random input data (.h5) expected by the input tflite model +# This script generates a pack of random input data (.h5) expected by the input circle model # # Basic usage: # gen_h5_explicit_inputs.py --model --input --output -# ex: gen_h5_explicit_inputs.py --model Add_000.tflite --input Add_000 --output Add_000.input.h5 +# ex: gen_h5_explicit_inputs.py --model Add_000.circle --input Add_000 --output Add_000.input.h5 # (This will create Add_000.input.h5) # # The input directory should be organized as follows @@ -33,15 +34,30 @@ model = args.model input = args.input output = args.output -# Build TFLite interpreter. 
(to get the information of model input) -interpreter = tf.lite.Interpreter(model) -input_details = interpreter.get_input_details() +with open(model, 'rb') as f: + buf = f.read() + circle_model = Model.GetRootAsModel(buf, 0) + +# Assume one subgraph +assert (circle_model.SubgraphsLength() == 1) +graph = circle_model.Subgraphs(0) +inputs = graph.InputsAsNumpy() # Create h5 file h5_file = h5.File(output, 'w') group = h5_file.create_group("value") group.attrs['desc'] = "Input data for " + model + +def toNumpyType(circle_type): + if circle_type == TensorType.UINT8: + return np.uint8 + if circle_type == TensorType.FLOAT32: + return np.float32 + if circle_type == TensorType.INT16: + return np.int16 + + # Input files records = sorted(glob.glob(input + "/*.txt")) for i, record in enumerate(records): @@ -51,9 +67,10 @@ for i, record in enumerate(records): lines = f.readlines() for j, line in enumerate(lines): data = np.array(line.split(',')) - input_detail = input_details[j] - input_data = np.array( - data.reshape(input_detail["shape"]), input_detail["dtype"]) + input_index = inputs[j] + tensor = graph.Tensors(input_index) + np_type = toNumpyType(tensor.Type()) + input_data = np.array(data.reshape(tensor.ShapeAsNumpy()), np_type) sample.create_dataset(str(j), data=input_data) h5_file.close() diff --git a/compiler/pota-quantization-value-test/test.lst b/compiler/pota-quantization-value-test/test.lst index 15606b8..dd16404 100644 --- a/compiler/pota-quantization-value-test/test.lst +++ b/compiler/pota-quantization-value-test/test.lst @@ -13,6 +13,8 @@ addTest(DepthwiseConv2D_002 layer uint8) addTest(FullyConnected_003 channel uint8) addTest(FullyConnected_003 channel int16) addTest(FullyConnected_003 layer uint8) +addTest(InstanceNorm_001 layer uint8) +addTest(InstanceNorm_001 channel uint8) addTest(Mean_000 layer uint8) addTest(Mean_000 channel int16) addTest(MaxPool2D_000 layer uint8) @@ -20,6 +22,7 @@ addTest(MaxPool2D_000 channel int16) addTest(Mul_001 layer uint8) addTest(Mul_001 channel int16) addTest(PRelu_001 layer uint8) +addTest(PRelu_001 channel uint8) addTest(PRelu_001 channel int16) addTest(ReLU_000 layer uint8) addTest(ReLU_000 channel int16) diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/0.txt new file mode 100644 index 0000000..5e926a2 --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/0.txt @@ -0,0 +1 @@ +0.15500909,0.32379007,0.12717001,0.60674316,0.07691418,0.437071 ,0.3737046 ,0.798342 ,0.65901846,0.40579247,0.15460491,0.80063623,0.591834 ,0.6617658 ,0.5617774 ,0.44884747,0.7996519 ,0.75895494,0.6239346 ,0.56500244,0.8955974 ,0.32503998,0.05756519,0.11889575,0.19635268,0.33958906,0.916527 ,0.16366032,0.51954055,0.2615102 ,0.07677322,0.6970092 ,0.27848312,0.97694606,0.73990864,0.96292055 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/1.txt new file mode 100644 index 0000000..eb5de0c --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/1.txt @@ -0,0 +1 @@ +0.85332185,0.03102963,0.54344934,0.6300742 ,0.3323267 ,0.1701224 ,0.36199054,0.23949413,0.11960976,0.668403 ,0.7907452 ,0.4377144 ,0.87145853,0.75605077,0.37314144,0.3622036 ,0.4321453 ,0.8770253 ,0.10936793,0.0734281 ,0.2922192 ,0.5829591 ,0.5422962 
,0.84274834,0.48475483,0.23154257,0.20037153,0.27911612,0.30018023,0.23753181,0.98804647,0.61455756,0.90376633,0.8255312 ,0.21020697,0.6272272 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/2.txt new file mode 100644 index 0000000..16561ef --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/2.txt @@ -0,0 +1 @@ +0.29736656,0.5712386 ,0.55447775,0.9014779 ,0.6208391 ,0.3413809 ,0.043885 ,0.5474101 ,0.8642339 ,0.05225753,0.36101478,0.15561381,0.776422 ,0.9997885 ,0.35188794,0.23418508,0.0882741 ,0.5797471 ,0.99945694,0.22190607,0.12337059,0.3701574 ,0.65161157,0.9830193 ,0.46270686,0.10077237,0.23681253,0.8734158 ,0.8358533 ,0.08817147,0.3845248 ,0.12799203,0.66830546,0.14838815,0.90201443,0.21123447 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/3.txt new file mode 100644 index 0000000..deba38b --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/3.txt @@ -0,0 +1 @@ +0.92424273,0.35776526,0.0776509 ,0.93697083,0.6559925 ,0.78421926,0.7511033 ,0.71389145,0.52217877,0.41876563,0.3560251 ,0.5862293 ,0.53027606,0.32203177,0.24654935,0.55851364,0.35312092,0.38102064,0.21245371,0.87299466,0.94972914,0.54950166,0.3445233 ,0.98951054,0.37458083,0.3778964 ,0.64035404,0.10410193,0.18511558,0.1942945 ,0.07018933,0.6113747 ,0.38076922,0.08337755,0.98258 ,0.91440874 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/4.txt new file mode 100644 index 0000000..78b783a --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/channel/uint8/4.txt @@ -0,0 +1 @@ +0.3790198 ,0.6347678 ,0.42544237,0.37033263,0.08057033,0.49041638,0.61705315,0.15411597,0.6455052 ,0.6857795 ,0.9613043 ,0.60357374,0.57679754,0.22550431,0.05105425,0.8641173 ,0.65559083,0.18274343,0.8963692 ,0.22369736,0.3133119 ,0.27507883,0.00539197,0.6846556 ,0.5969273 ,0.78488904,0.87746257,0.15459861,0.23133573,0.59048635,0.07172906,0.28935516,0.02084327,0.09926946,0.02687503,0.7306079 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/0.txt new file mode 100644 index 0000000..25b600c --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/0.txt @@ -0,0 +1 @@ +0.641226 ,0.68639857,0.87044334,0.9448475 ,0.21544299,0.5202749 ,0.5077167 ,0.23931624,0.5712026 ,0.4167988 ,0.56711906,0.52392703,0.42762014,0.5277072 ,0.03028643,0.18017273,0.8823869 ,0.5752544 ,0.09368648,0.50277 ,0.784248 ,0.04220072,0.55217946,0.75145644,0.7957966 ,0.6563401 ,0.54975605,0.17231019,0.4219812 ,0.27839735,0.5850074 ,0.24070603,0.00957893,0.3669335 ,0.03722228,0.8705231 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/1.txt new file mode 100644 index 0000000..caadfed --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/1.txt @@ -0,0 +1 @@ +0.76871806,0.65729177,0.946514 ,0.4308198 ,0.65200335,0.5745432 ,0.2990488 
,0.3156028 ,0.3218111 ,0.44709972,0.9411461 ,0.4828708 ,0.5707792 ,0.10645963,0.74497086,0.3563156 ,0.07986172,0.64869064,0.73329425,0.8848129 ,0.3027897 ,0.8753744 ,0.8884493 ,0.3606782 ,0.88617206,0.20232914,0.10251648,0.6366529 ,0.20422891,0.24426484,0.6952833 ,0.21889713,0.11477511,0.40650114,0.9637219 ,0.9751801 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/2.txt new file mode 100644 index 0000000..bc4a494 --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/2.txt @@ -0,0 +1 @@ +0.5773043 ,0.6733178 ,0.22994593,0.32895002,0.74122405,0.6671442 ,0.1899878 ,0.35264668,0.31084946,0.3864719 ,0.7035006 ,0.46563607,0.44263086,0.2414678 ,0.7430625 ,0.72898006,0.9982008 ,0.8989132 ,0.45622516,0.17876478,0.9356994 ,0.85493064,0.73729265,0.9804242 ,0.8735895 ,0.14825071,0.33990774,0.76397645,0.14657325,0.2492199 ,0.43957144,0.20367876,0.43692476,0.28123745,0.24346785,0.21133597 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/3.txt new file mode 100644 index 0000000..18f8666 --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/3.txt @@ -0,0 +1 @@ +0.74837255,0.7530814 ,0.05257462,0.06676125,0.26824346,0.05064487,0.23974492,0.5355457 ,0.97374374,0.38518724,0.3781766 ,0.7047476 ,0.95856845,0.09918232,0.36570287,0.5659468 ,0.8793284 ,0.7967468 ,0.99486005,0.11670698,0.42955273,0.25254622,0.06959745,0.5107888 ,0.88106513,0.3649466 ,0.7039582 ,0.8535825 ,0.3979168 ,0.9560912 ,0.17733434,0.69954944,0.35459924,0.28516313,0.75249106,0.7197228 diff --git a/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/4.txt new file mode 100644 index 0000000..b51c5eb --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/InstanceNorm_001/layer/uint8/4.txt @@ -0,0 +1 @@ +0.73320377,0.33635676,0.05811058,0.7032399 ,0.26380542,0.99637365,0.36622 ,0.47471517,0.5940316 ,0.39782768,0.46486765,0.5167471 ,0.61612487,0.93076104,0.8955697 ,0.5320168 ,0.41166067,0.29174343,0.07476811,0.60023075,0.0961028 ,0.77073896,0.17360727,0.48763612,0.31430086,0.37943754,0.7456216 ,0.16767363,0.9368368 ,0.09397154,0.68992966,0.5829225 ,0.7521187 ,0.06086114,0.13137193,0.22886442 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt index 107491f..081a1e6 100644 --- a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt @@ -1 +1 @@ - 0.5590226 ,-0.2806683 ,-1.6237477 ,-0.9041292 ,-2.2877202 , 3.4275887 , 0.7413508 ,-2.4284103 ,-0.39940628, 2.431437 ,-3.681079 ,-0.24288087, 3.3011584 ,-4.9507365 , 0.63297826, 3.0742207 ,-4.407745 ,-3.1469536 , 0.28014645, 1.7506292 ,-2.2447422 ,-0.5647249 , 4.763762 ,-1.9554822 ,-1.0236452 , 1.4784483 ,-0.15040281, 3.009691 , 4.0685706 ,-4.3577633 , 3.9074588 , 3.3200462 , 0.7937705 ,-4.491444 ,-1.5227276 ,-4.907054 , 3.0078046 ,-3.3134713 ,-4.180262 , 0.42208448,-4.764361 , 1.7373432 ,-2.4944234 , 1.3338212 , 0.5318029 , 2.0201192 , 1.274291 ,-3.891372 +-1.9927613e+00,-1.7386111e+00, 4.0895696e+00, 
3.7818990e+00, 1.9420158e+00, 2.8482721e+00, 1.9165717e+00, 3.0059583e+00, 1.8346788e+00,-1.9055414e-03, 4.9277787e+00,-2.2794118e+00, 4.4005270e+00, 4.9703922e+00,-4.5275192e+00,-4.0446317e-01,-4.9363256e+00, 4.9506269e+00, 5.5874938e-01, 3.9949589e+00,-3.8152415e-01,-4.1024357e-01,-3.8472393e+00, 4.2956004e+00, 4.8097472e+00, 1.7960385e+00, 1.6767026e+00,-2.2773645e+00, 2.6808765e+00,-3.7214172e+00, 4.0978761e+00, 3.6202488e+00,-3.3211513e+00, 3.6200387e+00,-3.6106458e+00,-3.9778764e+00, 3.8779631e+00,-4.8502750e+00,-2.1901150e+00, 3.1800017e+00, 4.6261444e+00, 3.5151103e+00, 2.8659137e-02, 4.5340648e+00, 1.9836371e+00,-2.1751235e+00,-4.6762753e+00,-3.6951694e+00 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt index f95a6c3..f6b31db 100644 --- a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt @@ -1 +1 @@ --2.5172353 , 1.8682998 , 2.6845884 , 1.8813597 ,-4.6693754 ,-3.2414548 ,-3.1801097 ,-1.5670214 , 1.9862102 , 3.857179 ,-3.0402668 ,-1.4183347 ,-2.7983398 ,-4.087585 ,-1.1274861 , 1.8738103 ,-2.563316 ,-2.973781 ,-0.872552 ,-4.4504313 ,-0.9188538 , 4.5734954 , 1.3559026 , 4.943204 ,-3.6803703 , 4.577067 ,-0.6116983 , 4.5055084 , 2.5480487 , 3.7308915 ,-0.3163238 ,-0.00772368, 3.0286303 ,-0.43645218, 0.87748104,-2.6953583 , 0.21743219, 2.431181 ,-1.2284794 , 0.35975334, 0.87034357,-2.5191767 , 4.030477 ,-1.2849646 ,-4.537441 ,-0.8822066 , 4.5059347 ,-0.9273924 +-4.7488093 , 4.805902 ,-0.29828382, 0.57486725,-4.864297 , 1.1832287 ,-1.7611881 ,-2.7058024 , 2.707353 ,-3.9832466 , 3.1243927 ,-4.795229 , 1.9835415 , 3.2291937 , 2.4303932 ,-3.556881 , 4.316894 ,-0.6444627 ,-3.8289468 , 4.012964 , 0.7878584 ,-1.8921386 , 2.779619 ,-3.762597 , 3.4239094 ,-0.9103423 ,-3.9791772 ,-2.5613685 ,-4.4910364 , 0.19411987, 4.6296096 ,-0.6827259 , 3.7645729 , 1.5309091 , 3.5163064 , 3.4726381 , 3.5372822 , 1.7671971 , 1.4374614 , 3.5783768 ,-2.4927518 , 3.9427729 , 2.431568 , 2.6959393 , 3.8100271 ,-2.099064 , 3.3663592 ,-2.0818436 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt index 106889e..acc01cb 100644 --- a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt @@ -1 +1 @@ - 4.523605 ,-2.1303053 , 2.7449381 ,-4.449816 ,-1.4482541 , 4.643309 ,-2.5644886 , 4.3115034 ,-4.7736797 ,-1.9451635 ,-2.1877592 , 2.3639698 ,-1.8480709 ,-4.560132 ,-0.40588248, 4.368528 ,-0.25666243, 1.1258887 , 2.33142 ,-3.8270295 ,-4.337086 ,-0.6709232 , 4.9283085 ,-3.5181348 , 2.225021 ,-0.0831629 , 2.0482597 , 3.161154 ,-0.49435407, 2.9382129 ,-1.248886 ,-3.7053974 , 1.6736145 ,-1.3524985 ,-1.4007242 ,-4.291275 ,-3.391911 , 4.803692 , 1.631321 , 0.13381048,-2.9587808 , 3.9878602 ,-3.3585925 , 4.6802793 ,-1.7605352 , 3.4168313 , 1.2318416 ,-4.40287 + 4.279912 ,-2.2746763 , 4.0609813 , 4.5353827 , 3.624241 ,-3.9593613 , 4.189409 ,-3.9370356 ,-2.7063863 ,-1.9987059 , 4.172294 ,-4.5454354 , 4.362368 , 2.2204642 ,-4.9866576 , 3.31571 , 0.12623785, 4.7834573 ,-1.3521448 ,-1.5408021 ,-4.6578984 ,-2.93307 ,-1.5684534 ,-1.6875995 ,-0.4278419 , 1.1314197 ,-2.9655704 ,-0.48032767,-1.9200082 , 1.3321692 , 0.87586147,-0.1761448 , 3.939337 ,-1.0270193 ,-4.807054 , 
2.8373904 ,-1.1184337 ,-0.8979197 , 2.1442132 ,-2.8509672 ,-3.3741531 , 3.6592414 , 0.7632272 ,-4.11465 , 4.892313 , 4.715815 ,-4.6481915 , 0.24676175 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt index 488c348..0f0b7a9 100644 --- a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt @@ -1 +1 @@ - 1.249105 ,-3.2594535 ,-1.7899538 ,-4.804654 ,-2.0324056 ,-1.9959925 , 3.5215054 , 0.5371311 , 1.9365969 ,-3.130136 ,-2.3590457 ,-4.653209 ,-2.0184708 , 3.5759254 ,-1.3521014 , 1.910826 , 3.8221822 ,-2.8988552 , 0.6571995 , 1.0839036 , 3.5422468 , 2.4680734 , 0.6148754 ,-3.4008195 , 4.558109 , 2.0105803 , 0.58087206, 1.3398736 , 2.770545 , 0.29666626, 4.1851935 , 0.04321287, 2.7680604 , 4.5661645 , 4.0127945 ,-4.8027678 , 4.1711125 ,-0.24452859, 0.4101852 , 1.5963763 ,-2.8356924 , 1.2876563 , 0.90424466, 2.965566 ,-1.9058269 , 4.759825 ,-2.2063546 ,-1.1309439 +-2.0949495 ,-1.1370499 , 4.6457314 ,-2.243915 ,-1.7996464 , 1.2268789 ,-4.938172 ,-3.2802615 , 1.8788282 , 4.4162655 ,-4.8805113 , 3.1269526 , 3.2644348 , 0.89842725,-1.4484432 ,-0.28381723, 3.046261 ,-1.0718596 ,-3.996107 ,-4.9575796 ,-2.2279077 , 1.5326967 , 4.4588428 ,-2.042381 , 4.6604958 , 4.6422915 ,-1.097833 , 3.666126 , 0.4735639 ,-4.480704 ,-4.831033 ,-0.27288163, 4.588138 , 4.5297036 , 4.3675694 ,-1.6098841 ,-3.4147859 , 2.1168516 ,-1.9529305 ,-0.12548867, 3.4388335 ,-1.4071734 , 0.9507897 , 4.8206787 , 1.676873 ,-1.7102181 , 1.7746873 , 0.02711739 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt index a59688e..d23450d 100644 --- a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt @@ -1 +1 @@ --3.0078897 , 1.6800234 , 4.350201 , 0.22538732, 2.9894316 ,-4.234071 , 2.733158 ,-3.8551323 , 3.9647048 , 1.4266169 , 0.78519976,-0.5334222 , 0.6681823 , 2.8409274 , 2.335872 ,-3.757666 ,-3.321705 , 2.9423573 , 1.3080943 , 1.0453726 , 3.222387 , 3.1813147 ,-1.8588669 ,-3.2523947 ,-4.4175825 , 3.7631783 ,-3.4176416 , 1.2141145 , 1.3725096 ,-1.2283872 ,-2.9829195 ,-3.6383085 ,-2.0126016 ,-3.7627625 , 4.916868 , 0.73052526,-0.02047114,-3.9506733 , 2.3569562 ,-4.247723 ,-1.8913685 , 1.7365774 , 4.59158 , 3.654596 ,-4.2133813 ,-4.6193404 ,-1.3968121 ,-3.580963 +-4.707647 ,-4.0921726 , 3.5813692 ,-4.71081 , 3.157816 ,-3.0034213 ,-0.21858999,-1.1736552 ,-1.6042249 ,-3.93102 ,-4.0407577 , 3.7350774 ,-4.9545655 ,-1.5413756 , 0.34996858, 2.0339615 , 0.99290746,-3.9916334 ,-4.149016 ,-3.2332835 , 3.6728513 , 2.4537466 ,-3.103485 ,-0.4829316 , 4.8046784 ,-1.753812 , 4.878712 ,-1.4039769 , 1.6640003 ,-1.2041731 , 0.8046477 , 0.9196048 ,-0.6475092 , 1.1409346 , 2.0324717 ,-0.04227797,-0.5379897 , 3.205104 , 3.3556423 , 4.8447986 ,-1.9695646 ,-2.6304977 ,-3.7261262 ,-4.725599 , 2.1162436 ,-0.5631174 ,-0.5820323 , 0.8398242 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/0.txt new file mode 100644 index 0000000..bcda22c --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/0.txt @@ -0,0 +1 @@ + 0.29413325,-0.5246354 , 
2.5049045 , 4.9534087 , 0.9885207 ,-4.9603324 ,-2.534284 ,-1.2587626 ,-4.6054525 ,-4.0071754 , 3.204513 , 1.9254771 ,-3.0781755 ,-2.225973 , 3.3524523 , 3.817767 , 3.4921055 , 4.3435416 , 3.0849605 ,-1.4030998 ,-1.0506575 ,-0.42979953,-2.2500112 , 3.4057455 , 4.5414543 , 2.9366746 , 4.8639297 ,-0.1028097 , 2.3421814 , 0.6463296 ,-4.906506 ,-0.7544193 ,-4.0089574 , 2.3837643 ,-0.62171113,-3.349577 , 0.63758767,-3.6872568 ,-2.4398334 ,-1.1556609 ,-3.116043 ,-1.9698795 , 0.7246678 , 2.1801088 ,-2.5762403 , 2.5748649 ,-2.8637013 , 2.8755338 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/1.txt new file mode 100644 index 0000000..937e08f --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/1.txt @@ -0,0 +1 @@ +-3.5664022e+00, 3.7696166e+00,-2.0404069e+00,-3.2197843e+00, 2.0149478e-01, 4.1116104e+00, 1.9678035e+00,-7.5975507e-01,-2.1460054e+00, 4.6308274e+00,-1.8927828e+00, 3.0689645e+00,-7.0773923e-01,-6.7477709e-01,-1.6248076e+00, 2.7095401e+00, 2.9545853e+00, 8.5142839e-01,-2.7683893e-01,-2.0586762e+00,-3.5001924e+00,-1.7622359e+00, 2.2262762e+00,-4.0617161e+00,-2.4704919e+00,-3.6333869e+00, 2.3401244e+00,-4.6641917e+00,-4.0812837e-03, 1.1013873e+00, 1.4518824e-01, 2.4135842e+00, 4.1183419e+00, 3.0343807e+00,-3.7195799e-01,-9.7189492e-01,-3.0425618e+00, 4.6822820e+00,-1.7649661e+00, 3.9648254e+00,-3.1084957e+00,-7.3071235e-01,-5.1578474e-01,-3.5188673e+00,-4.7018051e+00,-4.1592669e+00,-3.5443991e-01, 1.3961188e+00 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/2.txt new file mode 100644 index 0000000..fb30491 --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/2.txt @@ -0,0 +1 @@ + 4.2618856 , 0.4364266 , 0.5258691 , 3.5147502 ,-4.025428 , 3.143039 , 1.3707066 , 4.7792606 , 1.1539228 , 3.785161 ,-1.9495047 , 2.7047534 , 0.5673139 ,-0.5191105 ,-2.5284607 , 4.076998 , 2.9433093 ,-2.1924984 , 1.1020935 ,-2.126009 , 0.7586875 , 1.1708144 ,-4.594603 ,-3.252912 ,-3.057344 , 3.8008513 ,-4.9164753 ,-4.560891 , 1.724639 ,-3.0877826 , 0.55354726,-3.969067 , 4.17461 ,-1.901139 ,-4.8903475 , 4.7866077 ,-1.3506653 ,-4.2624874 , 0.8842832 , 4.672003 ,-2.5649548 ,-3.6606123 ,-1.6794366 ,-2.0534387 ,-2.9902222 , 3.078469 , 2.846819 , 1.2788221 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/3.txt new file mode 100644 index 0000000..fb9d40a --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/3.txt @@ -0,0 +1 @@ +-2.6751792 ,-2.5436802 , 0.30533552, 1.0443643 ,-4.4327927 , 2.813772 ,-4.27514 , 2.5894637 , 2.8684394 ,-2.2010357 , 1.5827026 , 0.01609957, 0.38605672,-4.978118 ,-0.30794173, 0.7372266 ,-1.2931277 , 2.8435483 , 2.8204155 , 1.5801594 , 0.853025 , 1.0665054 ,-2.3281817 ,-4.2512784 , 2.379218 , 2.6335719 , 0.17575608,-2.7761426 ,-2.8164017 , 1.8392245 , 2.6495574 , 0.82702005, 3.8548648 ,-3.179834 , 0.25908127, 2.4930098 , 0.71019745,-3.193962 ,-1.1381371 ,-3.5847874 ,-1.3353258 , 2.942422 , 0.11944559,-3.0676606 , 3.534187 , 0.86664987,-1.4781127 , 4.8873277 diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/4.txt 
b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/4.txt new file mode 100644 index 0000000..aeecd56 --- /dev/null +++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/uint8/4.txt @@ -0,0 +1 @@ + 4.2327642 , 4.644095 ,-2.8978996 , 4.39419 , 2.897952 ,-3.330613 ,-3.9131684 ,-1.4672462 ,-3.9219787 , 2.1286428 ,-4.313653 , 2.65426 ,-4.201722 , 2.5390174 ,-3.821772 ,-1.9420135 , 3.3508427 ,-1.2804624 , 4.899826 ,-4.165279 ,-0.38920662, 3.594253 ,-2.367396 , 3.8604352 , 0.40077925, 3.7654843 ,-2.7208197 , 3.4325044 ,-2.921729 , 2.0519714 ,-0.6181836 ,-0.12342291,-4.1059036 ,-3.653849 ,-3.5340316 ,-0.2782715 , 0.32330513, 3.360021 , 2.5673623 , 2.1614027 ,-4.438277 , 3.3010736 , 0.3992392 , 0.82871836,-2.8720777 , 0.29633927, 0.25286415,-4.191315 diff --git a/compiler/pota-quantization-value-test/test_record_minmax.sh b/compiler/pota-quantization-value-test/test_record_minmax.sh index acb7574..fa8f506 100755 --- a/compiler/pota-quantization-value-test/test_record_minmax.sh +++ b/compiler/pota-quantization-value-test/test_record_minmax.sh @@ -9,11 +9,11 @@ # work_dir : build directory of quantization-value-test (ex: build/compiler/quantization-value-test) SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -GEN_SCRIPT_PATH="${SOURCE_PATH}/gen_h5_explicit_inputs.py" COMPARE_SCRIPT_PATH="${SOURCE_PATH}/compare_tensors.py" CONFIG_PATH="$1"; shift BIN_PATH=$(dirname "${CONFIG_PATH}") TEST_INPUT_PATH="${SOURCE_PATH}/test_inputs" +GEN_SCRIPT_PATH="${BIN_PATH}/gen_h5_explicit_inputs.py" WORKDIR="$1"; shift source "${CONFIG_PATH}" @@ -48,7 +48,7 @@ while [ "$1" != "" ]; do # Generate h5 input data source "${VIRTUALENV}/bin/activate" "${VIRTUALENV}/bin/python" "${GEN_SCRIPT_PATH}" \ - --model "${WORKDIR}/${MODELNAME}.tflite" \ + --model "${WORKDIR}/${MODELNAME}.circle" \ --input "${TEST_INPUT_PATH}/${MODELNAME}/${GRANULARITY}/${DTYPE}" \ --output "${TESTCASE_FILE}.input.h5" diff --git a/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgMax.cpp b/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgMax.cpp index b1c92ec..13bf2e5 100644 --- a/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgMax.cpp +++ b/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgMax.cpp @@ -65,13 +65,13 @@ MaxPoolWithArgMaxChef::custom_value(flatbuffers::FlatBufferBuilder &fbb) const flex_buffers->Add(1); flex_buffers->EndVector(start, /*typed=*/true, /*fixed=*/false); auto output_type = operation.max_pool_with_argmax_options().output_type(); - assert(output_type == tflite::TensorType_INT64 || output_type == tflite::TensorType_INT32); + assert(output_type == tflchef::INT64 || output_type == tflchef::INT32); flex_buffers->Int("Targmax", output_type); std::string padding = operation.max_pool_with_argmax_options().padding() ? 
"VALID" : "SAME"; flex_buffers->String("padding", padding); flex_buffers->Bool("include_batch_in_index", operation.max_pool_with_argmax_options().include_batch_in_index()); - flex_buffers->Int("T", tflite::TensorType_FLOAT32); + flex_buffers->Int("T", tflchef::FLOAT32); flex_buffers->EndMap(map_start); flex_buffers->Finish(); diff --git a/compiler/tfldump/src/Dump.cpp b/compiler/tfldump/src/Dump.cpp index 8c8178f..20e1343 100644 --- a/compiler/tfldump/src/Dump.cpp +++ b/compiler/tfldump/src/Dump.cpp @@ -349,6 +349,7 @@ void dump_model(std::ostream &os, const tflite::Model *model) auto opcodes = reader.opcodes(); auto buffers = reader.buffers(); + auto metadata = reader.metadata(); // dump operator_codes os << "Operator Codes: [order] OpCodeName (OpCode Enum)" << std::endl; @@ -382,6 +383,17 @@ void dump_model(std::ostream &os, const tflite::Model *model) } os << std::endl; + // dump metadata + if (metadata != nullptr) + { + os << "metadata : B(index) name" << std::endl; + for (uint32_t i = 0; i < metadata->Length(); ++i) + { + os << "B(" << metadata->Get(i)->buffer() << ") " << metadata->Get(i)->name()->c_str(); + } + os << std::endl; + } + for (uint32_t sg = 0; sg < num_subgraph; ++sg) { reader.select_subgraph(sg); diff --git a/compiler/tfldump/src/OpPrinter.cpp b/compiler/tfldump/src/OpPrinter.cpp index 5d279632..c358480 100644 --- a/compiler/tfldump/src/OpPrinter.cpp +++ b/compiler/tfldump/src/OpPrinter.cpp @@ -694,6 +694,7 @@ OpPrinterRegistry::OpPrinterRegistry() // There is no Option for LOGISTIC // There is no Option for LOG_SOFTMAX _op_map[tflite::BuiltinOperator_MAX_POOL_2D] = make_unique(); + _op_map[tflite::BuiltinOperator_MEAN] = make_unique(); _op_map[tflite::BuiltinOperator_MIRROR_PAD] = make_unique(); _op_map[tflite::BuiltinOperator_MUL] = make_unique(); // There is no Option for NON_MAX_SUPPRESSION_V4 diff --git a/compiler/tfldump/src/Read.cpp b/compiler/tfldump/src/Read.cpp index f9782d9..856cc56 100644 --- a/compiler/tfldump/src/Read.cpp +++ b/compiler/tfldump/src/Read.cpp @@ -81,6 +81,7 @@ Reader::Reader(const tflite::Model *model) _version = model->version(); _subgraphs = model->subgraphs(); _buffers = model->buffers(); + _metadata = model->metadata(); auto opcodes = model->operator_codes(); for (const ::tflite::OperatorCode *opcode : *opcodes) diff --git a/compiler/tfldump/src/Read.h b/compiler/tfldump/src/Read.h index 7af2fa5..f835be1 100644 --- a/compiler/tfldump/src/Read.h +++ b/compiler/tfldump/src/Read.h @@ -52,6 +52,7 @@ private: using TFliteBuffers_t = flatbuffers::Vector>; using TFliteTensors_t = flatbuffers::Vector>; using TFliteOperators_t = flatbuffers::Vector>; + using TFliteMetadata_t = flatbuffers::Vector>; public: Reader(const tflite::Model *model); @@ -67,6 +68,7 @@ public: const TFliteOperators_t *operators() { return _operators; } const std::vector &inputs() const { return _inputs; } const std::vector &outputs() const { return _outputs; } + const TFliteMetadata_t *metadata() const { return _metadata; } uint32_t num_subgraph() const { return _subgraphs->Length(); } @@ -86,6 +88,7 @@ private: const TFliteBuffers_t *_buffers{nullptr}; const TFliteTensors_t *_tensors{nullptr}; const TFliteOperators_t *_operators{nullptr}; + const TFliteMetadata_t *_metadata{nullptr}; uint32_t _subgraph_index; std::string _subgraph_name; diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt index 9055154..595bbfd 100644 --- a/compiler/vconone/CMakeLists.txt +++ b/compiler/vconone/CMakeLists.txt @@ -1,5 +1,5 @@ if (NOT VCONONE_VERSION) - 
set(VCONONE_VERSION 0x00000000000b0001) + set(VCONONE_VERSION 0x00000000000c0001) # NOTE order is [build patch minor major] # if VCONONE_VERSION is set with -D option, it will be cached # you may have to remove cache file if you remove -D option diff --git a/compute/.clang-format b/compute/.clang-format new file mode 120000 index 0000000..0ff66f3 --- /dev/null +++ b/compute/.clang-format @@ -0,0 +1 @@ +../.clang-format.8 \ No newline at end of file diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index d29886a..4a37178 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -255,14 +255,14 @@ private: cl::Device _device; /**< Underlying CL device. */ std::string _kernel_path; /**< Path to the kernels folder. */ mutable std::map - _programs_map; /**< Map with all already loaded program data. */ + _programs_map; /**< Map with all already loaded program data. */ mutable std::map - _built_programs_map; /**< Map with all already built program data. */ + _built_programs_map; /**< Map with all already built program data. */ static const std::map - _kernel_program_map; /**< Map that associates kernel names with programs. */ + _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map - _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. >*/ }; } #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index a614d52..fb689f7 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -54,8 +54,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform EmbeddingLookup operation with opencl kernel -*/ + * @brief Class to perform EmbeddingLookup operation with opencl kernel + */ class CLEmbeddingLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 99cfa61..96f8308 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -55,8 +55,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform HashtableLookup operation with opencl kernel -*/ + * @brief Class to perform HashtableLookup operation with opencl kernel + */ class CLHashtableLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h index 99bb351..963d7b8 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -68,34 +68,37 @@ public: const char *name() const override { return "NEOneHotKernel"; } /** Initialise the kernel's inputs and outputs * - * @param[in] indices Indices tensor. Supported tensor rank: up to 3. 
Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to - * 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same - * as @p on_value - * @param[out] output Destination tensor. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. + * Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) */ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h index 1e69f09..2aaab6b 100644 --- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -72,10 +72,10 @@ namespace shape_calculator * @return the calculated shape */ inline TensorShape compute_transposeconv_upsampled_shape( - const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, - std::pair &out_dims, unsigned int invalid_right, - unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, - unsigned int &pad_top, unsigned int &pad_bottom) + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) { unsigned int sx = info.stride().first; unsigned int sy = info.stride().second; @@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape( unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; unsigned int pady_all_except_invallid = - pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); @@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair & const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); TensorShape out_shape{input_shape}; @@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); TensorShape output_shape{input->tensor_shape()}; output_shape.set(idx_width, input->dimension(idx_width) * block); diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h index 409eaf5..026209f 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -106,22 +106,24 @@ public: CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 
3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this - * is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[out] output Output tensor. + * The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -130,23 +132,24 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * the @p input. 
* @param[in] info Contains padding and policies to be used in the deconvolution, - * this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -154,24 +157,26 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLDirectTransposeConvLayer + * CLDirectTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for input - * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped + * with @ref CLWeightsReshapeKernel. 
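The reflowed parameter lists above map one-to-one onto the configure()/validate() entry points of CLDirectTransposeConvLayer. A minimal usage sketch, assuming the tensors are already created and allocated, the CL scheduler is initialised, and the full signatures (elided by the hunk boundaries) follow the documented parameter order:

#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"

using namespace arm_compute;

// Hypothetical helper, for illustration only: wires up a 2x2-stride transpose convolution.
void run_transpose_conv(ICLTensor &input, ICLTensor &weights, ICLTensor &bias, ICLTensor &output)
{
  PadStrideInfo info(2 /* stride_x */, 2 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */);

  CLDirectTransposeConvLayer layer;
  // invalid_right / invalid_bottom trim the zero columns/rows that upsampling would
  // otherwise leave on the right and bottom edges of the output (see the docs above).
  layer.configure(&input, &weights, &bias, &output, info,
                  0 /* invalid_right */, 0 /* invalid_bottom */);
  layer.run();
}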
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index e65a646..f27e991 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -216,7 +216,7 @@ private: CLConvertFullyConnectedWeights _convert_weights; weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged - _reshape_weights_managed_function; + _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; CLGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h index 289ab16..bdb1686 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: CLFullyConnectedReshapingLayer(std::shared_ptr memory_manager = nullptr) - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, - _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index b01ec42..167554c 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -66,7 +66,7 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input. * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 * @return N/A - */ + */ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); /** diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 5fb102e..5b27d36 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -63,20 +63,22 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same - * as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this - * is described in @ref PadStrideInfo. 
- * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -85,22 +87,22 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. Data types supported: - * QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, - * this is described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -108,22 +110,24 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as - * @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is - * described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. 
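As documented above, validate() mirrors configure() but takes ITensorInfo pointers, so a configuration can be checked before any OpenCL resources are touched. A sketch of the usual pattern, assuming the same parameter order as these comments (the declaration itself lies outside this hunk):

// Validate first, then configure only if the combination of shapes/types is supported.
const Status status = CLTransposeConvLayer::validate(input.info(), weights.info(), bias.info(),
                                                     output.info(), deconv_info,
                                                     0 /* invalid_right */, 0 /* invalid_bottom */);
ARM_COMPUTE_ERROR_THROW_ON(status);

CLTransposeConvLayer layer;
layer.configure(&input, &weights, &bias, &output, deconv_info, 0, 0);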
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h index 18cb61b..e34b4dc 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: NEFullyConnectedReshapingLayer(std::shared_ptr memory_manager = nullptr) - : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), - _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h index b2ea627..1a68f80 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -66,19 +66,20 @@ public: void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
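The NEOneHot parameters above are easier to read with a tiny worked example. A plain C++ sketch of the operator's semantics for the innermost axis (illustration only; the NEON implementation is what this header actually declares):

#include <cstddef>
#include <vector>

// indices {1, 3}, depth 4, on_value 1, off_value 0  ->  {0,1,0,0, 0,0,0,1}
std::vector<float> one_hot(const std::vector<int> &indices, int depth, float on_value,
                           float off_value)
{
  std::vector<float> out(indices.size() * depth, off_value);
  for (std::size_t i = 0; i < indices.size(); ++i)
    if (indices[i] >= 0 && indices[i] < depth) // out-of-range indices keep off_value here
      out[i * depth + indices[i]] = on_value;
  return out;
}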
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 24ff5da..7a08dae 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -110,39 +110,42 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 - * for F16 input. - * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. + * Data type supported: Data types supported: S32 for QASYMM8 and + * QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. 
- * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, + * F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * * @return a status */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 81d0cb7..1a8ff3e 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -54,123 +54,123 @@ using namespace arm_compute; const std::map CLKernelLibraryEx::_kernel_program_map = { - // ARMComputeEx kernels - {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, - {"binary_logical_op", "binary_logical_op.cl"}, - {"cast_bool", "cast.cl"}, - {"embedding_lookup", "embedding_lookup.cl"}, - {"gather_ex", "gather_ex.cl"}, - {"gather_ex_1d", "gather_ex.cl"}, - {"gather_ex_1d_out", "gather_ex.cl"}, - {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, - {"hashtable_lookup", "hashtable_lookup.cl"}, - {"instance_normalization_ex", "instance_normalization_ex.cl"}, - {"multiply_scale_factor", "multiply_scale_factor.cl"}, - {"neg_tensor", "neg_tensor.cl"}, - {"one_hot", "one_hot.cl"}, - {"one_hot_only_on_value", "one_hot.cl"}, - {"quantization_symm8", "quantization_symm8.cl"}, - {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_sum_mean", "reduce_operation.cl"}, - {"topkv2_init", "topkv2.cl"}, - {"topkv2_find_first_negative", "topkv2.cl"}, - {"topkv2_reorder_negatives", "topkv2.cl"}, - {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, - {"topkv2_quicksort", "topkv2_quicksort.cl"}, - {"scale_factor_symm8", "scale_factor.cl"}, + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"one_hot", 
"one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, }; const std::map CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS - { - "arg_min_max_ex.cl", + { + "arg_min_max_ex.cl", #include "./cl_kernels/arg_min_max_ex.clembed" - }, - { - "cast.cl", + }, + { + "cast.cl", #include "./cl_kernels/cast.clembed" - }, - { - "embedding_lookup.cl", + }, + { + "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" - }, - { - "gather_ex.cl", + }, + { + "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" - }, - { - "gemmlowp_ex.cl", + }, + { + "gemmlowp_ex.cl", #include "./cl_kernels/gemmlowp_ex.clembed" - }, - { - "hashtable_lookup.cl", + }, + { + "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" - }, - { - "helpers.h", + }, + { + "helpers.h", #include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", + }, + { + "helpers_asymm.h", #include "./cl_kernels/helpers_asymm.hembed" - }, - { - "instance_normalization_ex.cl", + }, + { + "instance_normalization_ex.cl", #include "./cl_kernels/instance_normalization_ex.clembed" - }, - { - "binary_logical_op.cl", + }, + { + "binary_logical_op.cl", #include "./cl_kernels/binary_logical_op.clembed" - }, - { - "multiply_scale_factor.cl", + }, + { + "multiply_scale_factor.cl", #include "./cl_kernels/multiply_scale_factor.clembed" - }, - { - "neg_tensor.cl", + }, + { + "neg_tensor.cl", #include "./cl_kernels/neg_tensor.clembed" - }, - { - "one_hot.cl", + }, + { + "one_hot.cl", #include "./cl_kernels/one_hot.clembed" - }, - { - "quantization_symm8.cl", + }, + { + "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" - }, - { - "reduce_operation.cl", + }, + { + "reduce_operation.cl", #include "./cl_kernels/reduce_operation.clembed" - }, - { - "scale_factor.cl", + }, + { + "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" - }, - { - "topkv2.cl", + }, + { + "topkv2.cl", #include "./cl_kernels/topkv2.clembed" - }, - { - "topkv2_radixsort.cl", + }, + { + "topkv2_radixsort.cl", #include "./cl_kernels/topkv2_radixsort.clembed" - }, - { - "topkv2_quicksort.cl", + }, + { + "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" - }, + }, #endif /* EMBEDDED_KERNELS */ }; CLKernelLibraryEx::CLKernelLibraryEx() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the // CLKernelLibraryEx is built @@ -337,8 +337,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); ARM_COMPUTE_ERROR_ON_MSG( - err != 0, - "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum 
workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl index 0a014d1..135cacf 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -119,15 +119,15 @@ inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 < in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 < in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -204,15 +204,15 @@ inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 > in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 > in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -296,22 +296,21 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), const uint x_idx = get_global_id(0); const uint y_idx = get_global_id(1); const __global DATA_TYPE *src_in_row = - (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + - y_idx * src_step_y); + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); for (unsigned int y = 0; y < get_local_size(1); ++y) { #if defined(ARG_MAX) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) #else // defined(ARG_MIN) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); 
#else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) @@ -334,12 +333,12 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); #if defined(ARG_MAX) condition_check3 = - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); #else // defined(ARG_MIN) local_results[lid] = select( - local_results[lid], local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); #endif // defined(ARG_MAX) || defined(ARG_MIN) } barrier(CLK_LOCAL_MEM_FENCE); @@ -403,7 +402,7 @@ __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output) { VEC_DATA_TYPE(DATA_TYPE, 16) in = - CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); + CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index e249663..f8b5bbe 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI #if OP_CODE == 1 // LOGICAL AND VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); #elif OP_CODE == 2 // LOGICAL OR VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 92e5dfb..5ebc78d 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION // lookup ids for based on the tensor dimensions int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl index 80ba73d..85fc09d 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \ - defined(COLS_A) + defined(COLS_A) #define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) @@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( , uint dst_cross_plane_pad #endif // REINTERPRET_OUTPUT_AS_3D - ) +) { int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; @@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( - 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; @@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index a4f7dbd..3ace1fd 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; if (lup_id[NUM_DIMS - 1] < 0) { diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index e07a25e..4a3bc13 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -49,7 +49,7 @@ #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) + defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && // defined(cl_arm_integer_dot_product_accumulate_int8) @@ -288,21 +288,21 @@ #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes + uint name##_offset_first_element_in_bytes #define IMAGE_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_offset_first_element_in_bytes #define TENSOR3D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes #define TENSOR4D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ @@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ uint stride_x, uint step_x) { Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, }; vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; return vector; @@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el .stride_x = stride_x, .stride_y = stride_y}; img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index 5f1b3f9..d7f1d08 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -100,16 +100,16 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - 
inline VEC_DATA_TYPE(type, size) \ - quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ - VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = \ + CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) \ - dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ - VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ { \ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ @@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Product of two fixed-point numbers. 
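QUANTIZE_IMPL and DEQUANTIZE_IMPL above expand to OpenCL vector code; a scalar C++ rendering of the same arithmetic makes the round-trip easier to follow (the int8 clamp range here is an assumption for the char instantiation; other types use their own saturation bounds):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t quantize_char(float x, float offset, float scale)
{
  // input / scale + offset, rounded to nearest (RTE by default) and saturated, as in QUANTIZE_IMPL
  const float q = x / scale + offset;
  const int v = static_cast<int>(std::nearbyint(q));
  return static_cast<int8_t>(std::min(127, std::max(-128, v)));
}

float dequantize_char(int8_t q, float offset, float scale)
{
  // (value - offset) * scale, as in DEQUANTIZE_IMPL
  return (static_cast<float>(q) - offset) * scale;
}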
*/ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* Revert COMPMID-907 */ \ - VEC_DATA_TYPE(long, size) \ - mask1 = 1 << 30; \ - VEC_DATA_TYPE(long, size) \ - mask2 = 1 - (1 << 30); \ - VEC_DATA_TYPE(long, size) \ - is_positive_or_zero = ab_64 >= 0; \ - VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ - VEC_DATA_TYPE(long, size) \ - mask = 1ll << 31; \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
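ASYMM_MULT above is the fixed-point "saturating rounding doubling high multiply" used throughout the quantized kernels. A scalar equivalent, for reference only:

#include <climits>
#include <cstdint>

int32_t asymm_mult(int32_t a, int32_t b)
{
  // Only INT_MIN * INT_MIN overflows the Q0.31 result; saturate it to INT_MAX.
  if (a == b && a == INT_MIN)
    return INT_MAX;
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  // Round to nearest when taking the high 32 bits of the doubled product.
  const int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab + nudge) / (1LL << 31));
}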
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define EXP_BARREL_SHIFTER_IMPL(size) \ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ { \ if (k_integer_bits > exponent) \ { \ const int k_shift_amount = k_integer_bits > exponent ? 
k_fractional_bits + exponent : 0; \ return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ } \ \ return result; \ @@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ { \ const int k_fractional_bits = 31 - k_integer_bits; \ VEC_DATA_TYPE(int, size) \ @@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ VEC_DATA_TYPE(int, size) \ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ + a_mod_quarter_minus_one_quarter_scaled, size); \ VEC_DATA_TYPE(int, size) \ remainder = a_mod_quarter_minus_one_quarter - a; \ \ @@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) remainder, size); \ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ \ if (k_integer_bits > 5) \ { \ @@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. 
*/ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 
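The two macros above also have simple scalar counterparts, shown here only to spell out the saturation and rounding behaviour (the rounding right shift is written out as the usual remainder/threshold form; exponents are assumed to stay in (0, 31)):

#include <climits>
#include <cstdint>

int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  const int32_t mask = (1 << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

int32_t saturating_rounding_mult_by_pow2(int32_t x, int exponent)
{
  if (exponent < 0) // negative exponents fall back to a rounding right shift
    return rounding_divide_by_pow2(x, -exponent);
  const int32_t threshold = (1 << (31 - exponent)) - 1;
  if (x > threshold)  return INT_MAX; // saturate instead of shifting into the sign bit
  if (x < -threshold) return INT_MIN;
  return x << exponent;
}

int32_t rounding_half_sum(int32_t a, int32_t b)
{
  const int64_t sum = static_cast<int64_t>(a) + static_cast<int64_t>(b);
  const int64_t sign = sum >= 0 ? 1 : -1; // ties (odd sums) round away from zero
  return static_cast<int32_t>((sum + sign) / 2);
}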
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ { \ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ @@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ - right_shift, size); \ +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ } #define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ multiply_by_quantized_multiplier##size(input, qmul, shift) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl index 0148426..96a2431 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ - defined(DIM_Y) && defined(DIM_Z) + defined(DIM_Y) && defined(DIM_Z) /** This function normalizes the input 2D tensor across the first dimension with respect to mean and * standard deviation of the same dimension. 
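For reference, the normalization this kernel documents reduces, per batch and channel, to the usual instance-norm formula; a scalar sketch, assuming the standard definition with the optional gamma/beta vector inputs declared just below acting as scale and shift:

#include <cmath>
#include <vector>

// y = gamma * (x - mean) / sqrt(var + epsilon) + beta, computed over one channel's spatial values.
void instance_norm_channel(std::vector<float> &x, float gamma, float beta, float epsilon)
{
  float mean = 0.f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(x.size());

  float var = 0.f;
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(x.size());

  // The CL kernel applies a precomputed 'multip' factor; gamma / sqrt(var + eps) is the standard choice.
  const float multip = gamma / std::sqrt(var + epsilon);
  for (float &v : x) v = (v - mean) * multip + beta;
}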
* @@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output) #endif /* IN_PLACE */ #ifdef GAMMA - , + , VECTOR_DECLARATION(gamma) #endif // GAMMA #ifdef BETA - , + , VECTOR_DECLARATION(beta) #endif // BETA - ) +) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); #ifndef IN_PLACE @@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (int i_h = 0; i_h < DIM_Z; ++i_h) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } @@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x < DIM_X; ++x) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl index 3943fc4..abbfbd2 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION (val, 0, (__global DATA_TYPE *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE *)(output.ptr)) = - ((DATA_TYPE)(*((__global int *)(input.ptr)))) * - *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl index c274aba..784a8d6 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -206,16 +206,16 @@ __kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLAR #if AXIS == 0 *(__global 
DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 1 *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 2 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 3 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #endif // AXIS } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index 76fda90..532000e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT // Multiply with a multiplier smaller than 1 out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl index 4ae9adb..c829f26 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl @@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc // Create scale vector const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = - *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); + *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); // Quantize VEC_DATA_TYPE(int, VEC_SIZE) @@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); #else //! 
defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP( - CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / - (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), - int), - MIN_QUANT_VAL, MAX_QUANT_VAL); + CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / + (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), + int), + MIN_QUANT_VAL, MAX_QUANT_VAL); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index 832ac12..d0ef31b 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); for (int i = 1; i < dim; ++i) { indices[axis] = i; @@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION( Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE sum_value = (DATA_TYPE)0; for (int i = 0; i < dim; ++i) { indices[axis] = i; - sum_value += *( - (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + sum_value += + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); } #if OP_CODE == 3 // REDUCE_SUM diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp index 047004d..45307fa 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -63,10 +63,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); @@ -101,13 +102,13 @@ std::tuple validate_and_configure_window(ITensorInfo *input, output_shape.set(axis, 1); DataType output_data_type = 
(prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; auto_init_if_empty(*output, input->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); - Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), - Steps(vector_size)); + Window win = + calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size)); bool window_changed = false; switch (axis) @@ -137,15 +138,15 @@ std::tuple validate_and_configure_window(ITensorInfo *input, } Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() - : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), - _op(ReductionOperation::ARG_IDX_MAX) + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) { } @@ -155,11 +156,11 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, - output->info(), axis, op)); + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); auto win_config = validate_and_configure_window( - input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, - op); + input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, + op); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); _input = input; @@ -213,7 +214,7 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor ARM_COMPUTE_ERROR("Not supported"); } _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( - "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); // Configure kernel window ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); @@ -225,8 +226,8 @@ Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, - output->clone().get(), axis, op))); + input->clone().get(), (prev_output != nullptr) ? 
prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index fbc76f5..ffa2c5a 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -55,7 +55,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); @@ -68,15 +68,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; } } // namespace CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) + : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -111,13 +111,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); const ValidRegion &valid_region = broadcast_pair.second; @@ -130,8 +130,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); output_access.set_valid_region(win, valid_region); @@ -151,7 +151,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); @@ -160,13 +160,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) bool has_collapsed = false; Window collapsed = - can_collapse ? 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); @@ -189,9 +189,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) BorderSize CLBinaryLogicalOpKernel::border_size() const { const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); const unsigned int border = - std::min(num_elems_processed_per_iteration - 1U, replicateSize); + std::min(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize(0, border, 0, 0); } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp index 6e0bcde..3f2ae35 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -103,7 +103,7 @@ void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) // Create kernel const std::string kernel_name = "cast_bool"; _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 67aaf2d..e4c617c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -61,14 +61,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) + : _input(nullptr), _output(nullptr), _lookups(nullptr) { } @@ -77,8 +77,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -108,8 +108,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3bfe3e4..8b58852 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -62,15 +62,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis); + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -86,7 +86,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); std::unique_ptr output_info = input->clone(); output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis)); + input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); @@ -100,7 +100,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLGatherExKernel::CLGatherExKernel() - : 
_input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { } @@ -109,11 +109,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), indices->info(), output->info(), axis)); + validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window auto win_config = - validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); _input = input; @@ -133,7 +133,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice // Create kernel _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); ICLKernel::configure_internal(win_config.second); } @@ -144,7 +144,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis) - .first); + .first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 930e7c9..f0a761b 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -61,8 +61,8 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace @@ -78,8 +78,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); @@ -102,7 +102,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -113,7 +113,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso // Make _lookup_indices tensor _lookup_indices = support::cpp14::make_unique(); _lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); // Set kernel build options @@ -127,8 +127,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -148,7 +148,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) // Set values of hits const int32_t *lookups_buf = - reinterpret_cast(const_cast(_lookups)->buffer()); + reinterpret_cast(const_cast(_lookups)->buffer()); const int32_t *keys_buf = reinterpret_cast(const_cast(_keys)->buffer()); uint8_t *hits_buf = reinterpret_cast(_hits->buffer()); int32_t *lookup_indices_buf = reinterpret_cast(_lookup_indices->buffer()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index 61c14d2..dab6480 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -94,8 +94,8 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe } // namespace CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() - : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), - _run_in_place(false) + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) { } @@ -132,7 
+132,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor // Create kernel _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -147,7 +147,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 6b27c99..1d4b141 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -99,7 +99,7 @@ std::tuple validate_and_configure_window(const ITensorInfo *inpu } // namespace CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -108,7 +108,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -123,9 +123,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen Window win = calculate_max_window(*output->info()); if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -134,11 +134,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); } Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, @@ -147,7 +147,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + 
std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 643c8b1..ee633d4 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -80,9 +80,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) std::set build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + static_cast(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); // Configure window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp index 35d70d6..0b8e7cc 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -65,7 +65,7 @@ inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo * { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast(depth), actual_axis); + indices->tensor_shape(), static_cast(depth), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } return Status{}; @@ -79,7 +79,7 @@ std::pair validate_and_configure_window(ITensorInfo *indices, const uint32_t actual_axis = wrap_around(axis, static_cast(output->num_dimensions())); // Output auto initialization if not yet initialized TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast(depth), actual_axis); + indices->tensor_shape(), static_cast(depth), actual_axis); auto_init_if_empty((*output), output_shape, 1, on_value->data_type()); // Create window Window win = calculate_max_window(*output, Steps()); @@ -88,8 +88,8 @@ std::pair validate_and_configure_window(ITensorInfo *indices, } } // namespace CLOneHotKernel::CLOneHotKernel() - : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), - _is_off_value_memset(false) + : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), + _is_off_value_memset(false) { } void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value, @@ -114,10 +114,10 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor ICLTensor *output, int depth, int axis) { ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); + validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); // Configure kernel window auto win_config = - validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); + validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); if (_is_off_value_memset) { @@ -131,7 +131,7 
@@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size( - data_size_from_type(on_value->info()->data_type()))); + data_size_from_type(on_value->info()->data_type()))); build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth)); build_opts.add_option("-DOUTPUT_DIM_Z=" + @@ -139,7 +139,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Create kernel const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot"; _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); ICLKernel::configure_internal(win_config.second); } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -153,7 +153,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -163,7 +163,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 1a7a18c..b417a71 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -87,9 +87,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } Coordinates coord; @@ -101,7 +101,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr) { } @@ -110,7 +110,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -132,11 +132,11 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); + multi_access_x, + 
"-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); } Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, @@ -145,7 +145,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get()).first); + validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 3fbebf2..3906009 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -145,7 +145,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu // Create kernel _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window Window win = calculate_max_window(*output_info, Steps()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 8d8853c..4a63744 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -94,8 +94,8 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace @@ -115,7 +115,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp // Create kernel _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); + CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -128,7 +128,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp index dfe5d59..c88bef6 100644 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -53,12 +53,12 @@ namespace using namespace arm_compute; template void elementwise_op_templ( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, - OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, - OutputScalarType *)) + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -88,26 +88,26 @@ void elementwise_op_templ( Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = - reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = - *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, - non_broadcast_input_ptr, broadcast_value, - output_ptr, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = - (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, - !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + + int x = + (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -119,24 +119,23 @@ void elementwise_op_templ( Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = - reinterpret_cast(input1.ptr()); - const auto input2_ptr = - reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, - input1_ptr, input2_ptr, output_ptr); - for (; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp index 32d7d62..a8464af 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -103,8 +103,10 @@ template inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) { uint8x16x4_t out = {{ - elementwise_logic_op(a.val[0], b.val[0]), elementwise_logic_op(a.val[1], b.val[1]), - elementwise_logic_op(a.val[2], b.val[2]), elementwise_logic_op(a.val[3], b.val[3]), + elementwise_logic_op(a.val[0], b.val[0]), + elementwise_logic_op(a.val[1], b.val[1]), + elementwise_logic_op(a.val[2], b.val[2]), + elementwise_logic_op(a.val[3], b.val[3]), }}; return out; } @@ -160,8 +162,8 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, } std::function configure_func( - const ITensor *input1, const ITensor *input2, ITensor *output, - std::map map_function) + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -184,8 +186,8 @@ std::function configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { static std::map map_function = { - {"op_U8_U8_U8", &elementwise_logic_op}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op}}; + {"op_U8_U8_U8", 
&elementwise_logic_op}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op}}; return configure_func(input1, input2, output, map_function); } @@ -223,7 +225,7 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); const TensorShape out_shape = - TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -232,8 +234,8 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index 12017e5..f935596 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -129,125 +129,125 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) case DataType::S8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( - texels_u8, vdupq_n_u8(true_val)))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, + vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S16: { /* Up-conversion U8 -> S16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s16(output_ptr + x, texels.val[0]); - vst1q_s16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = 
reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - 
vst1q_f32(output_ptr + x + 12, - vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto in = static_cast(*(input_ptr + x) & true_val); - *(output_ptr + x) = static_cast(in); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast(in); + } + }, + input, output); break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -255,86 +255,87 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) { /* Up-conversion U8 -> F16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::U8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_u8(output_ptr + 
x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::U16: { /* Up-conversion U8 -> U16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), - vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; - - vst1q_u16(output_ptr + x, texels.val[0]); - vst1q_u16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input_ptr + x) & true_val); + } + }, + input, output); break; } default: diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index 091d38c..e3a77c6 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -50,7 +50,7 @@ using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() - : _input(nullptr), _lookups(nullptr), _output(nullptr) + : _input(nullptr), _lookups(nullptr), _output(nullptr) { } @@ -79,8 +79,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); @@ -119,16 +119,17 @@ void 
NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const int32_t lookup = *reinterpret_cast( - _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const int32_t lookup = + *reinterpret_cast(_lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index 93963a5..c9f0799 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -71,7 +71,7 @@ template void validate_indices(const ITensor *indices) } // namespace NEGatherKernelEx::NEGatherKernelEx() - : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} { } @@ -85,36 +85,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn Iterator output_it(_output, window); execute_window_loop( - window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0])))); - break; - case 2: - new_index = - *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0], id[1])))); - break; - case 3: - new_index = *( - reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(0, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = + *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); } template @@ -130,37 +129,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf Iterator output_it(_output, output_window); execute_window_loop( - output_window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank, _axis); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[_axis])))); - break; - case 2: - new_index = 
*(reinterpret_cast( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); - break; - case 3: - new_index = *(reinterpret_cast( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(_axis, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), - _input->info()->dimension(0) * _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *( + reinterpret_cast(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr()); + }, + output_it); } void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, @@ -170,8 +168,8 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); _input = input; _indices = indices; @@ -217,7 +215,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I } // Output auto initialization if not yet initialized TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); // Create window @@ -243,15 +241,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast(input->num_dimensions())); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), axis); + input->tensor_shape(), indices->tensor_shape(), axis); 
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
index 30787c0..52b40e7 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -57,7 +57,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF;
} // namespace
NEHashtableLookupKernel::NEHashtableLookupKernel()
- : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
{
}
@@ -66,7 +66,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
ARM_COMPUTE_ERROR_THROW_ON(
- validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
_lookups = lookups;
_keys = keys;
@@ -92,8 +92,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
@@ -134,8 +134,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
const size_t lookup_dim = _output->info()->num_dimensions() - 1;
const int const_0 = _output->info()->data_type() == DataType::QASYMM8
- ?
_output->info()->quantization_info().uniform().offset + : 0; std::unordered_map key_index_map; for (size_t n = 0; n < _keys->info()->dimension(0); ++n) @@ -174,24 +174,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const auto lookup = lookup_indices.at(id[lookup_dim]); - if (lookup == NOT_HIT) - { - memset(output_it.ptr(), const_0, - _output->info()->dimension(0) * _output->info()->element_size()); - } - else - { - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - } - - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 49adf14..4dc0f55 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -63,7 +63,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma { /** NEON vector tag type. 
*/ using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t; + typename wrapper::traits::neon_bitvector_tag_t; // Clear X/Y dimensions on execution window as we handle the planes manually Window win = window; @@ -73,107 +73,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma constexpr int window_step_x = 16 / sizeof(T); const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); const auto channel_idx = - get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); Iterator input_it(input, win); execute_window_loop( - win, - [&](const Coordinates &id) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast(0.f); - auto sum_squares_h_w = static_cast(0.f); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); - vec_sum_squares_h_w = - wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); - } - - auto vec2_sum_h_w = - wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), - wrapper::vgetlow(vec_sum_squares_h_w)); - for (int i = 0; i < window_step_x / 4; ++i) - { - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - } - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - const auto value = *(input_ptr + x); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - auto gamma_val = 1.0f; - if (gamma != nullptr) - { - gamma_val = *reinterpret_cast(gamma->ptr_to_element({id[channel_idx]})); - } - const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); - auto beta_val = 0.0f; - if (beta != nullptr) - { - beta_val = *reinterpret_cast(beta->ptr_to_element({id[channel_idx]})); - } - const auto vec_beta = wrapper::vdup_n(static_cast(beta_val), ExactTagType{}); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - 
auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - vec_val = wrapper::vloadq(input_ptr + x); - vec_val = wrapper::vadd( - wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); - wrapper::vstore(output_ptr + x, vec_val); - } - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; - } - }, - input_plane_it, output_plane_it); - }, - input_it); + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast(0.f); + auto sum_squares_h_w = static_cast(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr 
+ x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, @@ -199,8 +199,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - gamma->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), "Gamma's size must be the same as size of input's channel"); } @@ -208,8 +208,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - beta->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), "Beta's size must be the same as size of input's channel"); } @@ -234,8 +234,8 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), - _epsilon(1e-12) + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) { } @@ -251,7 +251,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou _epsilon = epsilon; ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); if (_input->info()->data_type() == DataType::F32) { @@ -282,7 +282,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index b92130c..ad47281 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -123,15 +123,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t ret = {{ - vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), - vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), }}; return ret; } } // namespace NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -140,7 +142,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -180,25 +182,25 @@ template void NEMultiplyScaleFactorKernel::multiply(const Window &w Iterator output(_output, win_collapsed); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - auto scale = *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})); - scale *= _multiplier; - - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - store_result(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = input_ptr[x] * scale; - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); } void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0a11eb5..0daff5c 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -101,8 +101,8 @@ bool isOnValue(U index, U depth) } // namespace NEOneHotKernel::NEOneHotKernel() - : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, - 
_output{nullptr}, _func{} + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, + _off_value{nullptr}, _axis{-1}, _output{nullptr}, _func{} { } @@ -117,22 +117,22 @@ void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) Iterator output_it(_output, output_window); const U off_value = *reinterpret_cast(_off_value->buffer()); execute_window_loop( - output_window, - [&](const Coordinates &id) { - std::fill_n(output_it.ptr(), - _output->info()->dimension(0) * _output->info()->element_size(), off_value); - Coordinates indices_id(id); - indices_id.remove(0); - const U new_index = *(reinterpret_cast(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(0, new_index); - std::copy_n(_on_value->buffer(), _output->info()->element_size(), - _output->ptr_to_element(onehot_id)); - } - }, - output_it); + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(), + off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); } template @@ -142,22 +142,22 @@ inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo // Validate that the indices are not negative validate_depth(_depth, _output, _axis); Iterator output_it(_output, window); - execute_window_loop(window, - [&](const Coordinates &id) { - Coordinates indices_id(id); - indices_id.remove(_axis); - const U new_index = - *(reinterpret_cast(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(_axis, new_index); - std::copy_n(static_cast(id[_axis]) == new_index ? _on_value->buffer() - : _off_value->buffer(), - _output->info()->element_size(), output_it.ptr()); - } - }, - output_it); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = *(reinterpret_cast(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, new_index); + std::copy_n(static_cast(id[_axis]) == new_index ? 
_on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); } void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, @@ -215,7 +215,7 @@ Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *d const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(indices, depth, on_value, off_value, output, axis)); + validate_arguments(indices, depth, on_value, off_value, output, axis)); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 5841f1d..2306228 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -107,19 +107,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, const int32x4x4_t rf = {{ #ifdef __aarch64__ - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #else //__aarch64__ - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #endif //__aarch64__ }}; const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); @@ -129,7 +125,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, } // namespace NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() - : _input(nullptr), _output(nullptr), _scale_factor(nullptr) + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) { } @@ -138,7 +134,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), output->info(), scale_factor->info())); + validate_arguments(input->info(), output->info(), scale_factor->info())); _input = input; _output = output; @@ -182,40 +178,40 @@ template void NEQuantizationSymmetricKernel::quantize(const Window const auto dim_x = 
_input->info()->dimension(0); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - const auto start = reinterpret_cast(input.ptr()); - const auto min_max = std::minmax_element(start, start + dim_x); - const auto int8_scale = 127; - auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); - if (range == 0) - { - *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})) = 1; - range = 1; - } - else - { - *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; - } - const auto scale_factor_inv = int8_scale / range; - - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], - vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); - quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); - output_ptr[x] = static_cast(quantized); - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast(quantized); + } + }, + input, output); } void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index 267228e..b02a48e 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -50,8 +50,8 @@ namespace arm_compute { CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), - _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() { } @@ -60,13 +60,13 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != 
ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -76,9 +76,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, - false)); + const TensorInfo expected_output_shape = + output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( + input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -87,9 +87,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) { ti.set_data_type(data_type) - .set_tensor_shape(shape) - .set_num_channels(num_channels) - .set_quantization_info(qinfo); + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, @@ -98,7 +98,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (num_of_stages == 1) { ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); } else { @@ -118,19 +118,19 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT // Validate ReductionOperation only on first kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); // Validate ReductionOperation on intermediate stages for (unsigned int i = 1; i < num_of_stages - 1; ++i) { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], - &sums_vector[i], axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); } // Validate ReductionOperation on the last stage const unsigned int last_stage = num_of_stages - 1; ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( - input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); + input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); return Status{}; @@ -144,16 +144,16 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * _reduction_axis = axis; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( - input->info()->tensor_shape(), axis, false); + input->info()->tensor_shape(), axis, false); DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) - ? 
DataType::S32 - : output->info()->data_type(); + ? DataType::S32 + : output->info()->data_type(); auto_init_if_empty(*output->info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); // Configure reduction operation kernels _reduction_kernels_vector.resize(_num_of_stages); @@ -166,11 +166,11 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); auto_init_if_empty(*_not_reshaped_output.info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _not_reshaped_output.info()->set_tensor_shape(output_shape); _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); } @@ -182,7 +182,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * { shape.set(0, ceil(shape.x() / 128.f)); _results_vector[i].allocator()->init( - input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); } // Apply ReductionOperation only on first kernel diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 3dede05..6359b4b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -53,16 +53,10 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( - std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _flip_axis(), - _is_prepared(false) + std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(), + _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(), + _is_prepared(false) { } @@ -74,7 +68,7 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -86,8 +80,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, invalid_bottom); + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, 
invalid_bottom); const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); @@ -117,19 +111,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } @@ -171,22 +165,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info, + invalid_right, invalid_bottom)); _is_prepared = weights_info.retain_internal_weights(); @@ -195,8 +189,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order // to match output shape const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 0198946..79d0929 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -80,12 +80,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( - std::shared_ptr memory_manager) - : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), - _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), - _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), - _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), - _original_weights(nullptr) + std::shared_ptr memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) { } void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -107,8 +107,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -140,10 +140,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen bool is_fc_after_conv = false; if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -158,28 +158,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Extract scale factor _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); _memory_group.manage(&_scale_factor); _scale_factor_kernel.configure(input, &_scale_factor); // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); // GEMMLowp _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); _memory_group.manage(&_gemmlowp_output); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, fc_info.retain_internal_weights); @@ -209,15 +209,15 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe const GPUTarget gpu_target = CLScheduler::get().target(); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); } // With the Fully Connected layer we can have 4 different cases: @@ -247,33 +247,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = 
&reshaped_weights; } // Validate Scale factor kernel const ITensorInfo &scale_factor = - TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( - CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); // Fully Connected layer after a Fully Connected Layer without batches ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate matrix multiply kernel const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); // Multiply scale ARM_COMPUTE_RETURN_ON_ERROR( - CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 2ff4b96..13d3acb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn int output_multiplier = 0; int output_shift = 0; ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( - multiplier, &output_multiplier, &output_shift)); + multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; @@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I { GEMMLowpOutputStageInfo gemmlowp_output_stage; ARM_COMPUTE_RETURN_ON_ERROR( - construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, - gemm_info)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); } return 
Status{}; @@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), - _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), - _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), - _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), - _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); @@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init(input->info() - ->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(shape_flatten) - .set_data_layout(DataLayout::NCHW)); + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor { _reshape_weights_managed_function.configure(weights); weights_to_use = utils::cast::polymorphic_downcast( - _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { @@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast( - _weights_manager->acquire(weights, &_convert_weights_managed)); + _weights_manager->acquire(weights, &_convert_weights_managed)); } else { @@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_fc_after_conv = true; const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); @@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR( - validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run() if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) { _original_weights = utils::cast::polymorphic_downcast( - _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); } else { diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 157b4d9..ac6982e 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -41,7 +41,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp // reshape auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( - _input->info()->data_layout())); + _input->info()->data_layout())); _cl_reshape.configure(_input, &_cl_buffer); input_to_use = &_cl_buffer; } @@ -57,7 +57,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) @@ -81,7 +81,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel 
type"); } - }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 02ee4ad..c246041 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -46,8 +46,8 @@ using namespace arm_compute; CLReduceOperation::CLReduceOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), - _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() { } @@ -91,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * for (size_t i = 0; i < num_of_kernels; ++i, ++it) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); } if (!keep_dims) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index a502f03..12c0aa8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -134,8 +134,8 @@ void configure_slices(const ICLTensor *input, const std::vector &ou // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->info()->clone(); auto_init_if_empty( - tmp_output_info, - input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); // Update coordinate on axis start_coords.set(split_dim, axis_offset); @@ -153,7 +153,7 @@ void configure_slices(const ICLTensor *input, const std::vector &ou } // namespace CLSplitVEx::CLSplitVEx() - : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 3ac95a8..accd513 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -49,14 +49,14 @@ namespace arm_compute { CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel()*/ + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), 
_glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), + _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 3215d01..0754fd8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -53,7 +53,7 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function() { } @@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); switch (CLTransposeConvLayer::get_deconvolution_method( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate direct convolution layer ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); break; } case DeconvolutionMethod::GEMM: { // Validate gemm-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); break; } default: @@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf } DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( - const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, - ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, - unsigned int invalid_bottom, const WeightsInfo &weights_info) + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index a123439..e212a03 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + 
NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( - std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), - _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), - _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), - _accumulate_biases(false), _is_prepared(false) + std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) { } @@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor bool _is_fc_after_conv; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); // GEMM _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output); // Multiply scale @@ -195,8 +195,8 @@ Status 
NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) @@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( - NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( - &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index cb7557a..a639f29 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -69,14 +69,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( - &input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, 
false, false /* Reshape weights only for the first run */))); } return Status{}; @@ -84,12 +84,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), - _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), - _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), - _accumulate_biases(false), _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), + _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), + _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), + _is_quantized(false), _is_prepared(false) { } @@ -105,9 +105,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, nullptr, output); @@ -129,8 +129,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen ITensor *output) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -138,8 +138,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - shape_flatten)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -169,8 +168,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; @@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei if (_is_quantized) { _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); } // Configure accumulate biases kernel for non quantized asymmetric types @@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_flatten_shape(input))); + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr && !is_quantized) @@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -346,8 +344,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); @@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (is_quantized) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); + &gemmlowp_output, biases, output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index dc6c784..234c783 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,7 +56,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp index 16d74e6..451aa09 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -46,9 +46,9 @@ namespace arm_compute { NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( - std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), - _permute_input(), _permute_output(), _permuted_input(), 
_permuted_output() + std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() { } @@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const float epsilon) { return NEInstanceNormalizationLayerKernelEx::validate( - &input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); } void NEInstanceNormalizationLayerEx::run() diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index cb1a263..c45c335 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -49,8 +49,8 @@ using namespace arm_compute; NEReduceOperation::NEReduceOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -125,7 +125,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index 26a8879..b21717e 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -47,8 +47,8 @@ using namespace arm_compute; NEReduceSum::NEReduceSum(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -122,7 +122,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? 
input : (&_reduced_outs[i - 1]); @@ -135,7 +135,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()) - .set_data_layout(input->info()->data_layout())); + .set_data_layout(input->info()->data_layout())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::SUM); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index aa165cc..5031107 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -51,17 +51,9 @@ namespace arm_compute { NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _conv_f(), - _upsample_f(), - _flip_weights(), - _scaled_output(), - _weights_flipped(), - _flip_axis(), - _original_weights(nullptr), - _input(nullptr), - _info(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), + _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr), + _info(), _is_prepared(false) { } @@ -76,15 +68,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), - weights->dimension(height_idx), info, invalid_right, invalid_bottom); + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); if (bias != nullptr) @@ -117,24 +109,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info( - input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo conv_info(1, 1, 0, 0, 
0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); const unsigned int channel_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, WeightsInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); return Status{}; } @@ -146,21 +138,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, - invalid_right, invalid_bottom); + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); _input = input; _original_weights = weights; @@ -188,8 +180,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, DimensionRoundingType::FLOOR); diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h new file mode 100644 index 0000000..cc6a9db --- /dev/null +++ b/compute/cker/include/cker/CpuBackendThreadpool.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ +#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ + +#include // from @ruy +#include // from @ruy + +namespace nnfw +{ +namespace cker +{ +namespace cpu_backend_threadpool +{ + +using Task = ruy::Task; + +template +void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context) +{ + assert(tasks_count <= ruy_context->max_num_threads()); + ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks); +} + +} // namespace cpu_backend_threadpool +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index e080406..8bf0bee 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -131,7 +131,7 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co const int kWeightsPerUint32 = 4; int8 *shuffled_vectors = reinterpret_cast( - aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); for (int i = 0; i < n_batch; i += 4) { @@ -145,25 +145,25 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co while (unshuffled_vec0_ptr != end_vec0_ptr) { asm volatile( - // This code path requires that (n_cols % 16) == 0 so we can safely - // read in 16-byte chunks from each row. - "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" - "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" - "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" - "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" - - "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" - - : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), - [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), - [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), - [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), - [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) - : - : "v0", "v1", "v2", "v3", "cc", "memory"); + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. 
+ "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr), + [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr), + [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr), + [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr), + [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); } } @@ -204,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); asm volatile( - // Zero out the accumulator registers. - "dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - - "1:\n" // batch_cols_loop - - // Read 16 more bytes from a pair of matrix rows. - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - - // Prefetch two rows ahead. - "prfm pldl1strm, [%[mat_ptr2]]\n" - "prfm pldl1strm, [%[mat_ptr3]]\n" - - // Read from input vectors 4 times; 64 bytes total. - // Each 16-byte register contains parts of 4 vectors; see the - // shuffle logic above. - - // From Benoit, places to look in the future: - // - Move load instructions further from sdot - // - Switch loop use-then-reload - // - Do partial unrolling to use register space better - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - - // Update prefetch pointers. - "add %[mat_ptr2], %[mat_ptr2], #16\n" - "add %[mat_ptr3], %[mat_ptr3], #16\n" - - // Re-use those vectors for the next row as well. - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - - // If we're not done with these rows, continue. - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" // batch_cols_loop - - // Done with the rows, sum the results. - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - // Convert the per-vector sums to floating point. - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Fetch scale factors. - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - - // Multiply scale factors times sums. - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - - // Load previous result values. - // The result position is: - // result[batch * m_rows + row] - // Here that is factored into: - // result_ptr = result + row - // *result_ptr = res[0] - // (uint8*)result_ptr += (m_rows * sizeof(float)) - // *result_ptr = res[1] - // ... 
- // Since we're reading two rows at a time, though, we read both - // result[batch * m_rows + row] - // and - // result[batch * m_rows + row + 1] - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - - // Go back to the starting position (subtract wide_rows * 4). - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - - // Add previous result values. - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - - // Store results. - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) - : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), - [wide_rows] "r"(wide_rows) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "cc", "memory"); + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. + + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... 
+ // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); } } @@ -309,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr } static void DotprodMatrixBatchFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { void *shuffled_vectors_free; const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); @@ -332,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int32_t *batch_offsets_ptr = input_offset + batch; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; - asm volatile("dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - // Load zero points. - "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - // Zero out zero point accumulators. - "dup v14.4s, wzr\n" - "dup v15.4s, wzr\n" - - // Load per channel scales if not null. 
- "cmp %w[is_channel_scale_nullptr], #0\n" - "bne 1f\n" - "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" - "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - "fmul v16.4s, v16.4s, v4.4s\n" - "fmul v17.4s, v17.4s, v4.4s\n" - "b 2f\n" - "1:\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "2:\n" - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 3f\n" - // Accumulate row_sums for zero point calculations. - "saddlp v12.8h, v12.16b\n" - "saddlp v13.8h, v13.16b\n" - "sadalp v14.4s, v12.8h\n" - "sadalp v15.4s, v13.8h\n" - "3:\n" - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 2b\n" - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 4f\n" - // Calculate zero point offsets. - "addv s14, v14.4s\n" - "addv s15, v15.4s\n" - "dup v14.4s, v14.s[0]\n" - "dup v15.4s, v15.s[0]\n" - "b 5f\n" - "4:\n" - "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" - "ld1r {v15.4s}, [%[row_sums_ptr]]\n" - "5:\n" - - "mul v14.4s, v14.4s, v7.4s\n" - "mul v15.4s, v15.4s, v7.4s\n" - "sub v0.4s, v0.4s, v14.4s\n" - "sub v2.4s, v2.4s, v15.4s\n" - - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Multiply scale. - "fmul v0.4s, v16.4s, v0.4s\n" - "fmul v1.4s, v17.4s, v1.4s\n" - - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) - : [mat_ptr0_end] "r"(mat_ptr0_end), - [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), - [channel_scales_ptr] "r"(channel_scales_ptr), - [batch_offsets_ptr] "r"(batch_offsets_ptr), - [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), - [is_row_sums_nullptr] "r"(is_row_sums_nullptr) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + asm volatile( + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. 
+ "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. + "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); } } @@ -458,9 +458,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( // We don't use this kernel when n_batch = 1 because the baseline kernel // is fine for that case. 
inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { const int kWeightsPerUint32 = 4; @@ -475,14 +475,14 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_vectors_free; const int padded_vectors_size = batch_round_up * m_cols; int8_t *padded_vectors = reinterpret_cast( - aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); memset(padded_vectors, 0, padded_vectors_size); void *padded_result_free; const int result_size = n_batch * m_rows * sizeof(float); const int padded_result_size = batch_round_up * m_rows * sizeof(float); float *padded_result = reinterpret_cast( - aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); memcpy(padded_result, result, result_size); memset(reinterpret_cast(padded_result) + result_size, 0, padded_result_size - result_size); @@ -494,7 +494,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_scaling_factors_free; const int padded_scaling_factors_size = batch_round_up * sizeof(float); float *padded_scaling_factors = reinterpret_cast( - aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); assert(static_cast(n_batch * sizeof(float)) <= padded_scaling_factors_size); assert(static_cast(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); @@ -505,7 +505,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_input_offset_free; const int padded_input_offset_size = batch_round_up * sizeof(int32_t); int32_t *padded_input_offset = reinterpret_cast( - aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); assert(static_cast(n_batch * sizeof(int32_t)) <= padded_input_offset_size); assert(static_cast(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); @@ -513,8 +513,8 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( // Call the main kernel. 
DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, - padded_result, per_channel_scale, padded_input_offset, row_sums); + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result, + per_channel_scale, padded_input_offset, row_sums); free(padded_input_offset_free); } @@ -533,13 +533,13 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( } inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) { DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, - /*row_sums=*/nullptr); + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); } #endif // __aarch64__ @@ -736,7 +736,7 @@ inline void NeonSymmetricQuantizeFloats(const float *values, const int size, for (int i = postamble_start; i < size; ++i) { const int32_t quantized_value = - static_cast(std::round(scaling_factor_inv * values[i])); + static_cast(std::round(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } @@ -830,7 +830,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-byte aligned. Otherwise, // performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); // Multiply the low bits (i.e. the lower 8 8bit numbers in the @@ -855,7 +855,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-bytes aligned. // Otherwise, performance may suffer significantly. 
assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); @@ -952,7 +952,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); const float32x4_t result1 = - vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); vst1q_f32(result, result0); vst1q_f32(result + 4 * result_stride, result1); } diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 3b3b27f..2a58a2e 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -138,7 +138,7 @@ inline void PortableSymmetricQuantizeFloats(const float *values, const int size, for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast(std::round(values[i] * scaling_factor_inv)); + static_cast(std::round(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index acb6cac..10f3ecb 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -389,6 +389,11 @@ struct SpaceToDepthParams int32_t block_size; }; +struct LeakyReluParams +{ + float alpha; +}; + enum class Order { kColMajor, @@ -475,9 +480,9 @@ enum class QuantizationFlavor // (only those that need perchannel quantization do). template ::value - ? QuantizationFlavor::kFloatingPoint - : QuantizationFlavor::kIntegerWithUniformMultiplier> + std::is_floating_point::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> struct GemmParams { // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) @@ -504,12 +509,12 @@ struct GemmParams const AccumScalar *bias = nullptr; // min clamp bound of destination values. DstScalar clamp_min = std::is_floating_point::value - ? -std::numeric_limits::infinity() - : std::numeric_limits::lowest(); + ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); // max clamp bound of destination values. DstScalar clamp_max = std::is_floating_point::value - ? std::numeric_limits::infinity() - : std::numeric_limits::max(); + ? std::numeric_limits::infinity() + : std::numeric_limits::max(); }; // Validates self-consistency of GemmParams. diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index 2abb998..f73c015 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -88,8 +88,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 
0 : -shift; return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), - right_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); } inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, @@ -103,7 +103,7 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int left_shift) { return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } inline int NodeOffset(int b, int h, int w, int height, int width) @@ -162,7 +162,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, const F3 fixedpoint_input = F3::FromRaw(input >> 1); const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); // Newton-Raphson iteration // Naive unoptimized starting guess: x = 1 F3 x = F3::One(); @@ -173,7 +173,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); } const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); x = x * fixedpoint_half_sqrt_2; *output_inv_sqrt = x.raw(); if (*output_shift < 0) @@ -429,7 +429,7 @@ template class SequentialTensorWriter { public: SequentialTensorWriter(const T *input_data, T *output_data) - : input_data_(input_data), output_ptr_(output_data) + : input_data_(input_data), output_ptr_(output_data) { } diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h index 49c3421..e3b1099 100644 --- a/compute/cker/include/cker/eigen/EigenSupport.h +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -39,17 +39,17 @@ namespace eigen_support // library. typedef Eigen::TensorMap, Eigen::Aligned> - EigenMatrix; + EigenMatrix; typedef Eigen::TensorMap, Eigen::Aligned> - ConstEigenMatrix; + ConstEigenMatrix; typedef Eigen::TensorMap, Eigen::Aligned> - EigenTensor; + EigenTensor; typedef Eigen::TensorMap, Eigen::Aligned> - ConstEigenTensor; + ConstEigenTensor; // Utility functions we need for the EigenTensor API. template struct MatMulConvFunctor diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h index f9c7063..40cb854 100644 --- a/compute/cker/include/cker/eigen/Utils.h +++ b/compute/cker/include/cker/eigen/Utils.h @@ -36,9 +36,9 @@ namespace cker // Eigen::Map> template using VectorMap = typename std::conditional< - std::is_const::value, - Eigen::Map::type, Eigen::Dynamic, 1>>, - Eigen::Map>>::type; + std::is_const::value, + Eigen::Map::type, Eigen::Dynamic, 1>>, + Eigen::Map>>::type; template VectorMap MapAsVector(Scalar *data, const Shape &shape) { @@ -51,10 +51,10 @@ template VectorMap MapAsVector(Scalar *data, const Sha // above also applies here. 
template using MatrixMap = typename std::conditional< - std::is_const::value, - Eigen::Map::type, Eigen::Dynamic, - Eigen::Dynamic>>, - Eigen::Map>>::type; + std::is_const::value, + Eigen::Map< + const Eigen::Matrix::type, Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map>>::type; template MatrixMap MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h index dc3e255..9d4fd2e 100644 --- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket public: template static auto functionExistsSfinae( - typename std::enable_if< - unpacket_traits::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval().template partialPacket( - std::declval(), - std::declval::mask_t>()))>::value>::type *) - -> std::true_type; + typename std::enable_if< + unpacket_traits::masked_load_available && + std::is_same().template partialPacket( + std::declval(), + std::declval::mask_t>()))>::value>::type *) + -> std::true_type; template static auto functionExistsSfinae(...) -> std::false_type; typedef decltype( - functionExistsSfinae(nullptr)) status; + functionExistsSfinae(nullptr)) status; static constexpr bool value = status::value; }; @@ -71,9 +70,9 @@ public: // [from, to) range. If the mask bit is 1, element will be loaded/stored. template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename std::enable_if::masked_load_available, - typename unpacket_traits::mask_t>::type - mask(int from, int to) + typename std::enable_if::masked_load_available, + typename unpacket_traits::mask_t>::type + mask(int from, int to) { const Index packet_size = internal::unpacket_traits::size; eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h index 92e1614..c931ac5 100644 --- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -62,30 +62,27 @@ template class TensorContractionInputMapper< - Scalar_, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper VectorMapper; typedef SubMapper 
LinearMapper; @@ -95,11 +92,11 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper( - const TensorEvaluator< - const TensorReshapingOp>, - Device> &tensor, - const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) - : m_impl(tensor.impl().impl()) + const TensorEvaluator< + const TensorReshapingOp>, Device> + &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) { Index patch_rows; Index patch_depth; @@ -167,7 +164,7 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) - : m_impl(base_mapper.m_impl) + : m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; @@ -280,11 +277,10 @@ public: private: friend class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; // Load coefficient from a patch specified by the "within patch offset" // (patchId) and the precomputed indices of the first element of the patch. @@ -298,14 +294,14 @@ private: const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex + colOffset * m_in_col_strides; const Index origInputCol = (m_patch_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = (m_patch_row_inflate_strides == 1) - ? inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || (inputRow != origInputRow * m_patch_row_inflate_strides)) @@ -314,7 +310,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -338,7 +334,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -390,7 +386,7 @@ private: // span[0] all the way upto (and including) span[1]. 
const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template partialPacket(inputIndex - span[0], mask(span[0], span[1] + 1)); } @@ -445,10 +441,10 @@ private: // Load partial packets and do bit-wise OR to generate required packet return internal::por( - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], - patchOffsets2Cols[0], colOffsets[0]), - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], - patchOffsets2Cols[1], colOffsets[1])); + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], + patchOffsets2Cols[0], colOffsets[0]), + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], + patchOffsets2Cols[1], colOffsets[1])); } // Helper function to load a packet that is present in a single columns. @@ -477,7 +473,7 @@ private: // no padding const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; return m_impl.template packet(inputIndex); } return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); @@ -490,7 +486,7 @@ private: // load. template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - !TensorEvaluatorHasPartialPacket::value, PacketT>::type + !TensorEvaluatorHasPartialPacket::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits::size; @@ -538,7 +534,7 @@ private: // packets. template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket::value, PacketT>::type + TensorEvaluatorHasPartialPacket::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits::size; @@ -604,7 +600,7 @@ private: // no padding const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template packet(inputIndex); } @@ -627,10 +623,10 @@ private: computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const { const size_t NumInputDims = - array_size::Dimensions>::value; + array_size::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = - (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + (NumInputDims == 3) ? 
patchIndex : (patchIndex - otherIndex * m_num_patches); otherIndex *= m_patchInputStride; colIndex = patch2DIndex / m_fastOutputRows; rowIndex = patch2DIndex - colIndex * m_outputRows; @@ -689,31 +685,28 @@ template class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - ParentMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef Self LinearMapper; @@ -722,16 +715,16 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset + base_mapper.m_depth_offset), - m_col_offset(horiz_offset + base_mapper.m_col_offset), - m_base_mapper(base_mapper.m_base_mapper) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + m_base_mapper(base_mapper.m_base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } @@ -766,7 +759,7 @@ public: { typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; return m_base_mapper.template loadPacketStandard( - i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); } template EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } @@ -781,7 +774,7 @@ public: EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { const Index max_col = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); return std::min(1 + max_col, patchCols()); } @@ -789,8 +782,8 @@ public: EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const { const Index max_row = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / - fastPatchRowStride(); + (m_depth_offset + (peeled_k == 0 ? 
0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); return std::min(1 + max_row, patchRows()); } @@ -862,7 +855,7 @@ public: } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket::value, PacketT>::type + TensorEvaluatorHasPartialPacket::value, PacketT>::type partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; @@ -913,8 +906,8 @@ public: const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) - ? input_row - : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + ? input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); @@ -932,8 +925,8 @@ public: const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) - ? input_col - : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + ? input_col + : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); @@ -1033,23 +1026,20 @@ template struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, - Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits::type Packet; @@ -1159,7 +1149,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1248,22 +1238,20 @@ template struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits::type Packet; @@ -1378,7 +1366,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1472,22 +1460,20 @@ template struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1582,27 +1568,25 @@ struct gemm_pack_rhs< */ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits::Layout == ColMajor, - TensorReshapingOp< - const DSizes::Index, - internal::traits::NumDimensions>, - const TensorContractionOp< - const array::Index>, 1>, - const TensorReshapingOp::Index, 2>, - const Kernel>, - const TensorReshapingOp::Index, 2>, - const TensorImagePatchOp>, - const OutputKernel>>, - TensorReshapingOp< - const DSizes::Index, - internal::traits::NumDimensions>, - const TensorContractionOp< - const array::Index>, 1>, - const TensorReshapingOp::Index, 2>, - const TensorImagePatchOp>, - const TensorReshapingOp::Index, 2>, - const Kernel>, - const OutputKernel>>>::type + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp::Index, 2>, + const Kernel>, + const 
TensorReshapingOp::Index, 2>, + const TensorImagePatchOp>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes::Index, internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp::Index, 2>, + const TensorImagePatchOp>, + const TensorReshapingOp::Index, 2>, + const Kernel>, + const OutputKernel>>>::type SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, const Index row_in_stride = 1, const Index col_in_stride = 1, @@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str typedef typename internal::traits::Index TensorIndex; TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex>> - in(input); + in(input); TensorRef< - Tensor::Scalar, internal::traits::NumDimensions, - internal::traits::Layout, TensorIndex>> - kern(kernel); + Tensor::Scalar, internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + kern(kernel); EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str } if (padding_explicit) { - return choose( - Cond::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, - padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches( - kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + return choose(Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, + col_stride, row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, + padding_bottom, padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, padding_bottom, + padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } else { return choose( - Cond::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, - col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); 
+ Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } } diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index 6149caf..a70e39c 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -73,10 +73,10 @@ void AveragePool(const PoolParams ¶ms, const Shape &input_shape, cons int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h index e33b2fb..980ad48 100644 --- a/compute/cker/include/cker/operation/BatchToSpaceND.h +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -43,7 +43,7 @@ inline void GetIndexRange(int 
spatial_index_dim, int block_shape_dim, int input_ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that // end_index is exclusive). *end_index = - std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); } template @@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1 for (int in_w = in_w_start; in_w < in_w_end; ++in_w) { const int out_w = - in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; assert(out_w >= 0); assert(out_w < output_width); T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index d9917a9..fe5f877 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, // From this point it is assumed contractually that corresponding dimensions // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. const bool swap_inputs = - params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1; @@ -281,8 +281,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::MUL: optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast(input1_data), input2_shape, - const_cast(input2_data), output_shape, output_data); + params, input1_shape, const_cast(input1_data), input2_shape, + const_cast(input2_data), output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: @@ -320,8 +320,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::POW: reference::BroadcastBinaryArithmeticOpSlow( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - GetBinaryArtithmeticFn()); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn()); break; default: assert(false); diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h index d69b38a..24d4cc4 100644 --- a/compute/cker/include/cker/operation/Common.h +++ b/compute/cker/include/cker/operation/Common.h @@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (; i < bias_size; i++) { array_ptr[i] = - ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); } } #else // not NEON @@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (int i = 0; i < bias_size; i++) { array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); + array_data[array_offset 
+ i] + bias_data[i], clamp_min, clamp_max); } } #endif diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h index 47eb603..ac6af84 100644 --- a/compute/cker/include/cker/operation/Comparison.h +++ b/compute/cker/include/cker/operation/Comparison.h @@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data, const Shape &output_shape, bool *output_data) { const int64_t flatsize = // number of data.... - MatchingFlatSize(input1_shape, input2_shape, output_shape); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); @@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams ¶ms, const Shape &input1_ const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } @@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } @@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, for (int c = 0; c < output_shape.Dims(3); ++c) { const int32_t input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32_t input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, } } -#define TFLITE_COMPARISON_OP(name) \ - template \ - inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, bool *output_data) \ - { \ - Comparison(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ - output_data); \ - } \ - template \ - inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - 
ComparisonImpl(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template \ - inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ - const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, \ - bool *output_data) \ - { \ - ComparisonWithScaling(params, input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template \ - inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowImpl(input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template \ - inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlow(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template \ - inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ - const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowWithScaling( \ - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ +#define TFLITE_COMPARISON_OP(name) \ + template \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template \ + inline void name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + ComparisonWithScaling(params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template \ + inline void Broadcast4DSlow##name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling( \ + params, input1_shape, input1_data, input2_shape, 
input2_data, output_shape, output_data); \
  }

 TFLITE_COMPARISON_OP(Equal);
diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h
index 394123e..9aaca00 100644
--- a/compute/cker/include/cker/operation/Concatenation.h
+++ b/compute/cker/include/cker/operation/Concatenation.h
@@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams &params,
     for (int j = 0; j < copy_size; ++j)
     {
       const int32_t value =
-        static_cast(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+        static_cast(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
       output_ptr[j] = static_cast(std::max(std::min(255, value), 0));
     }
   }
diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h
new file mode 100644
index 0000000..e57fef0
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthToSpace.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__
+#define __NNFW_CKER_DEPTH_TO_SPACE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template
+inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data,
+                         const Shape &unextended_output_shape, T *output_data, int32_t block_size)
+{
+  assert(unextended_input_shape.DimensionsCount() <= 4);
+  assert(unextended_output_shape.DimensionsCount() <= 4);
+  const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+  const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+
+  const int output_depth = output_shape.Dims(3);
+  const int batch_size = output_shape.Dims(0);
+
+  // Number of contiguous values that we can copy in one iteration.
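// (For instance, with block_size = 2 and output_depth = 4, so input_depth = 16,
//  the stride below is 8: each memcpy moves two adjacent output pixels' worth of values.)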
+  const int stride = block_size * output_depth;
+
+  for (int batch = 0; batch < batch_size; ++batch)
+  {
+    for (int in_h = 0; in_h < input_height; ++in_h)
+    {
+      const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
+      for (int offset_h = 0; offset_h < block_size; ++offset_h)
+      {
+        const T *src = input_ptr;
+        for (int in_w = 0; in_w < input_width; ++in_w)
+        {
+          memcpy(output_data, src, stride * sizeof(T));
+          output_data += stride;
+          src += input_depth;
+        }
+        input_ptr += stride;
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTH_TO_SPACE_H__
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
index 814a9e0..436ddd8 100644
--- a/compute/cker/include/cker/operation/DepthwiseConv.h
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -22,143 +22,159 @@
 #include "cker/Types.h"
 #include "cker/Utils.h"
 #include "cker/neon/neon_check.h"
+#include "cker/operation/optimized/DepthwiseConvFloat.h"
 #include "cker/operation/optimized/DepthwiseConvUint8.h"
+#include "cker/CpuBackendThreadpool.h"

 namespace nnfw
 {
 namespace cker
 {

-inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
-                          const uint8_t *input_data, const Shape &filter_shape,
-                          const uint8_t *filter_data, const Shape &bias_shape,
-                          const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+// TODO(luwa): add multithread to per-channel depthwise_conv
+// DepthwiseConv can run with multi threads on the dim specified by thread_dim.
+// Each thread processes output elements on dim, thread_dim, in the range of
+// [thread_start, thread_end).
+// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it
+// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :].
+template struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
 {
-  const int depth_multiplier = params.depth_multiplier;
-  const int32_t output_activation_min = params.quantized_activation_min;
-  const int32_t output_activation_max = params.quantized_activation_max;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  assert(dilation_width_factor >= 1);
-  assert(dilation_height_factor >= 1);
-  UNUSED_RELEASE(dilation_width_factor);
-  UNUSED_RELEASE(dilation_height_factor);
-  assert(input_shape.DimensionsCount() == 4);
-  assert(filter_shape.DimensionsCount() == 4);
-  assert(output_shape.DimensionsCount() == 4);
-  assert(output_activation_min <= output_activation_max);
-  UNUSED_RELEASE(output_activation_min);
-  UNUSED_RELEASE(output_activation_max);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_depth = input_shape.Dims(3);
-  assert(output_depth == input_depth * depth_multiplier);
-  assert(bias_shape.FlatSize() == output_depth);
-  UNUSED_RELEASE(input_depth);
-  UNUSED_RELEASE(output_depth);
-  UNUSED_RELEASE(depth_multiplier);
-
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__aarch64__) -// TODO Use below codes - -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int output_shift = params.output_shift; -// -// // Call kernel optimized for depthwise convolutions using 3x3 filters if -// // parameters are supported. -// if (Fast3x3FilterKernelSupported( -// input_shape, filter_shape, stride_width, stride_height, -// dilation_width_factor, dilation_height_factor, pad_width, pad_height, -// depth_multiplier, output_shape, output_shift)) { -// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, -// filter_data, bias_shape, bias_data, output_shape, -// output_data); -// return; -// } -#endif - - optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data); + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), input_shape_(input_shape), input_data_(input_data), + filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape), + bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data), + thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_, + bias_shape_, bias_data_, output_shape_, output_data_, + thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + // const CpuFlags& cpu_flags_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape) +{ + // How many scalar multiplications are needed to make it worth using one + // more thread + static constexpr int kMinMulPerThread = 1 << 13; // 8k + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_muls = output_shape.FlatSize() * filter_height * filter_width; + // Try to avoid real runtime divisions if possible by dividing by a + // compile-time constant. + int thread_count = std::max(1, num_muls / kMinMulPerThread); + return thread_count; +} + +inline bool MultithreadAlongBatches(int thread_count, int batches) +{ + assert(thread_count >= 2); + // If there are fewer batch entries than the number of threads we want to use, + // then better do intra-batch-entry multithreading. 
+ if (batches < thread_count) + { + return false; + } + // If there are at least 2 batch entries to be handed to each thread, then + // it's safe to proceed with batch-wise multithreading: each thread will have + // approximately equal number of batch entries to handle, so the load + // balancing will be reasonable, and the amount to which the load is not + // perfectly balanced will be offset by the inherent advantages of + // batch-wise multithreading (each thread is more efficient thanks to working + // on larger buffers with less boundary-handling overhead). + if (batches >= 2 * thread_count) + { + return true; + } + // In the limit case were there are at least 1 but not much more than 1 + // batch entries per thread, it may be a good idea to do per-batch + // multithreading if the number of batch entries is a multiple of the number + // of threads, so that each thread will have the same number of batch entries + // to process. + return ((batches % thread_count) == 0); } +template inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &filter_shape, - const float *filter_data, const Shape &bias_shape, const float *bias_data, - const Shape &output_shape, float *output_data) + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, ruy::Context *ruy_context) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; assert(input_shape.DimensionsCount() == 4); assert(filter_shape.DimensionsCount() == 4); assert(output_shape.DimensionsCount() == 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); + int thread_count = HowManyConvThreads(output_shape, filter_shape); + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads(); + + thread_count = std::max(1, std::min(thread_count, max_threads)); + // Cap the number of threads to 2 for float path to avoid regression in + // performance (b/132294857). 
+ if (std::is_floating_point::value) + { + thread_count = std::min(thread_count, 2); + } + + const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(bias_shape); - for (int b = 0; b < batches; ++b) + if (thread_count == 1) + { + optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, 0, output_height, + 1); + return; + } + + int thread_dim, thread_dim_size; + if (MultithreadAlongBatches(thread_count, output_batches)) + { + thread_dim = 0; + thread_dim_size = output_batches; + } + else + { + thread_dim = 1; + thread_dim_size = output_height; + } + + std::vector> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). + tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int ic = 0; ic < input_depth; ++ic) - { - for (int m = 0; m < depth_multiplier; m++) - { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) - { - float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; - total += (input_value * filter_value); - } - } - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[oc]; - } - output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } - } - } + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); } } // namespace cker diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h new file mode 100644 index 0000000..6bdd7c6 --- /dev/null +++ b/compute/cker/include/cker/operation/ELU.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ELU_H__ +#define __NNFW_CKER_ELU_H__ + +#include "cker/Shape.h" + +#include + +namespace nnfw +{ +namespace cker +{ + +inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + const float val = input_data[i]; + output_data[i] = val < 0.0 ? std::exp(val) - 1 : val; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELU_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 3d1837f..13fccfd 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -394,8 +394,8 @@ private: for (int label = 0; label < num_labels; ++label) { bool removed = (_output_label_counts[label] == 0); - bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || - _input_label_counts[1][label] == 0; + bool unique = + num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0; _label_types[label] = getDimensionType(removed, unique); } } @@ -483,8 +483,8 @@ private: if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) { throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + - std::to_string(labels->size() - 1) + " but got: " + - std::to_string(inputs[i].shape.DimensionsCount())}; + std::to_string(labels->size() - 1) + + " but got: " + std::to_string(inputs[i].shape.DimensionsCount())}; } int ellipsis_axis = -1; const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; @@ -511,7 +511,7 @@ private: } std::vector::iterator it_input = - std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) { return; @@ -645,11 +645,11 @@ private: // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. const int32_t output_size = - reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; functor::ReduceFunctor::Reduce( - device, output->shaped({output_size}), - input_deduped.shaped({output_size, reshape[kReduce]}), Eigen::array({1}), - Reducer()); + device, output->shaped({output_size}), + input_deduped.shaped({output_size, reshape[kReduce]}), Eigen::array({1}), + Reducer()); } bool shouldSwapFreeAndContract(const Labels &labels, @@ -779,7 +779,7 @@ private: { const int32_t count = label_counts[label]; const int current_axis = - should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); + should_inflate ? 
strided_shape_dims.size() : inflated_shape_dims.size(); const int32_t dim = input.shape.Dims(current_axis); strided_shape_dims.push_back(dim); inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); @@ -879,7 +879,7 @@ private: for (size_t i = 0; i < inputs.size(); ++i) { const int32_t free_axis = - inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); } bool adj_x = swap_free_and_contract[0]; diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h index 9d080d8..0e980f1 100644 --- a/compute/cker/include/cker/operation/Elementwise.h +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -98,6 +98,28 @@ inline void Floor(const Shape &input_shape, const float *input_data, const Shape } } +inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::sqrt(input_data[i]); + } +} + +inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = input_data[i] * input_data[i]; + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index 14daf98..d657acc 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -24,27 +24,12 @@ namespace nnfw { namespace cker { -template -inline void Fill(const Shape &input_shape, int *input_data, const T value_data, - const Shape &output_shape, T output_data) +template inline void Fill(const T value_data, const Shape &output_shape, T output_data) { - int input_size = input_shape.FlatSize(); - int output_size = 1; - for (int i = 0; i < input_size; i++) + int output_size = output_shape.FlatSize(); + for (int i = 0; i < output_size; i++) { - output_size *= input_data[i]; - } - - if (output_size == output_shape.FlatSize()) - { - for (int i = 0; i < output_size; i++) - { - output_data[i] = *value_data; - } - } - else - { - throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output"); + output_data[i] = *value_data; } } diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 9585324..b7d27e8 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -117,7 +117,7 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); const int output_depth = - MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); + MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -229,7 +229,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, const int weights_dims_count = 
weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); @@ -249,7 +249,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, { int idx_1 = w1_indices[pw1]; output_data[b * output_depth + idx_0] += - weights_data[pw1] * input_data[b * accum_depth + idx_1]; + weights_data[pw1] * input_data[b * accum_depth + idx_1]; } } } diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h index 28ae7a3..df397f7 100644 --- a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -70,7 +70,7 @@ inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams ¶ms, const int weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h index d17a579..8a97d84 100644 --- a/compute/cker/include/cker/operation/FusedBatchNorm.h +++ b/compute/cker/include/cker/operation/FusedBatchNorm.h @@ -105,7 +105,7 @@ public: float rest_size_inv = static_cast(1.0f / static_cast(rest_size)); // This adjustment is for Bessel's correction float rest_size_adjust = - static_cast(rest_size) / static_cast(rest_size_minus_one); + static_cast(rest_size) / static_cast(rest_size_minus_one); Eigen::Tensor batch_mean(depth); Eigen::Tensor batch_variance(depth); @@ -117,12 +117,12 @@ public: batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); auto x_scaled = x_centered * scaling_factor; auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast(); + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast(); UNUSED_RELEASE(rest_size_adjust); diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h index a0abf29..211db98 100644 --- a/compute/cker/include/cker/operation/Helper/BCast.h +++ b/compute/cker/include/cker/operation/Helper/BCast.h @@ -22,7 +22,7 @@ * ToDo : This file will be moved into upper folder when integrate with other * custom operations. * And It should merged with EinsumHelper's BCast. 
-**/ + **/ #include "cker/Shape.h" #include "cker/eigen/EigenSupport.h" @@ -393,7 +393,7 @@ public: BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, const bool return_flattened_batch_indices = false) - : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) { } diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index baeafd7..cbebff1 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -168,7 +168,7 @@ public: // Must have lo < hi UniformDistribution(int32_t lo, int32_t hi) - : lo_(lo), range_(static_cast(hi) - static_cast(lo)) + : lo_(lo), range_(static_cast(hi) - static_cast(lo)) { } @@ -207,7 +207,7 @@ public: // Must have lo < hi UniformDistribution(int64_t lo, int64_t hi) - : lo_(lo), range_(static_cast(hi) - static_cast(lo)) + : lo_(lo), range_(static_cast(hi) - static_cast(lo)) { } @@ -291,22 +291,22 @@ public: template class UniformFullIntDistribution - : public UniformFullIntDistribution32 + : public UniformFullIntDistribution32 { }; template class UniformFullIntDistribution - : public UniformFullIntDistribution32 + : public UniformFullIntDistribution32 { }; template class UniformFullIntDistribution - : public UniformFullIntDistribution64 + : public UniformFullIntDistribution64 { }; template class UniformFullIntDistribution - : public UniformFullIntDistribution64 + : public UniformFullIntDistribution64 { }; @@ -324,7 +324,7 @@ public: PHILOX_DEVICE_INLINE explicit SingleSampleAdapter(Generator *gen) - : generator_(gen), used_result_index_(Generator::kResultElementCount) + : generator_(gen), used_result_index_(Generator::kResultElementCount) { } @@ -615,8 +615,8 @@ class TruncatedNormalDistribution public: // The number of elements that will be returned. static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) - ? SingleSampleGenerator::kNativeElementCount / 2 - : 1; + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; // Cost of generation of a single element (in cycles). static constexpr int kElementCost = 90; // Indicate that this distribution may take variable number of samples diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 85d2677..6e9ffbd 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -109,7 +109,7 @@ template struct FillPhiloxRandomTask { const int kGroupSize = Distribution::kResultElementCount; static const int kGeneratorSkipPerOutputGroup = - kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; int64_t offset = 0; diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index e6ac008..ec29a15 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -29,58 +29,58 @@ template str { // Rank- tensor of scalar type T. typedef Eigen::TensorMap, Eigen::Aligned> - Tensor; + Tensor; typedef Eigen::TensorMap, Eigen::Aligned> - ConstTensor; + ConstTensor; // Unaligned Rank- tensor of scalar type T. 
typedef Eigen::TensorMap> UnalignedTensor; typedef Eigen::TensorMap> - UnalignedConstTensor; + UnalignedConstTensor; typedef Eigen::TensorMap, Eigen::Aligned> - Tensor32Bit; + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Scalar; + Scalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstScalar; + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. typedef Eigen::TensorMap, Eigen::RowMajor, IndexType>> - UnalignedScalar; + UnalignedScalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>> - UnalignedConstScalar; + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap, Eigen::Aligned> Flat; typedef Eigen::TensorMap, Eigen::Aligned> - ConstFlat; + ConstFlat; typedef Eigen::TensorMap, Eigen::Aligned> Vec; typedef Eigen::TensorMap, Eigen::Aligned> - ConstVec; + ConstVec; // Unaligned Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap> UnalignedFlat; typedef Eigen::TensorMap> - UnalignedConstFlat; + UnalignedConstFlat; typedef Eigen::TensorMap> UnalignedVec; typedef Eigen::TensorMap> UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap, Eigen::Aligned> Matrix; typedef Eigen::TensorMap, Eigen::Aligned> - ConstMatrix; + ConstMatrix; // Unaligned Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap> UnalignedMatrix; typedef Eigen::TensorMap> - UnalignedConstMatrix; + UnalignedConstMatrix; }; typedef typename TTypes::Tensor32Bit::Index Index32; diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h index 6445e8a..8fa8b03 100644 --- a/compute/cker/include/cker/operation/InstanceNorm.h +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_sh double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; double output_value = input_value * a + b; output_data[Offset(output_shape, batch, height, width, channel)] = - ActivationFunctionWithMinMax((float)output_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h index a0075c3..c1fca91 100644 --- a/compute/cker/include/cker/operation/L2Normalize.h +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uin { int32_t diff = *input_data - input_zero_point; int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); int32_t unclamped_output_val = 128 + rescaled_diff; int32_t output_val = std::min(static_cast(255), std::max(static_cast(0), unclamped_output_val)); diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h index 27beaae..a8f1f8c 100644 --- a/compute/cker/include/cker/operation/LSTM.h +++ b/compute/cker/include/cker/operation/LSTM.h @@ -283,23 +283,23 @@ void 
CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float // contiguous, and we manually loop over the batched outputs. // LINT.IfChange inline void LstmStepFloat( - const float *input_ptr, const float *input_to_input_weights_ptr, - const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, - const float *input_to_output_weights_ptr, const float *aux_input_ptr, - const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, - const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, - const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, - const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, - const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, - const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, - const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, - const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, - const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, - const float *output_gate_bias_ptr, const float *projection_weights_ptr, - const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, - int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, - float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, - float *scratch2, float *scratch3, float *output_ptr) + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, + float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, + float *output_ptr) { // Since we have already checked that weights are all there or none, we can // check the existence of only one to the get the condition. @@ -314,7 +314,7 @@ inline void LstmStepFloat( // Check if inputs are all zeros so we can skip some computations. const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); const bool is_aux_input_all_zeros = - (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); if (!use_cifg) { // Calculate the input gate. 
(If not CIFG.) @@ -336,11 +336,11 @@ inline void LstmStepFloat( forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); // Calculate the cell update gate. CalculateLstmGateFloat( - input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, - output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, - /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, - n_batch, n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, - is_input_all_zeros, is_aux_input_all_zeros); + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); // Update the cell state. UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_gate_scratch, use_cifg, params->cell_clip); diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h new file mode 100644 index 0000000..e12d01b --- /dev/null +++ b/compute/cker/include/cker/operation/LeakyReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_LEKAY_RELU_H__ +#define __NNFW_CKER_LEKAY_RELU_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include + +namespace nnfw +{ +namespace cker +{ + +inline void LeakyReLU(const LeakyReluParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + const float val = input_data[i]; + // Note that alpha might be > 1 or < 0, so we don't use std::max here. + output_data[i] = val > 0 ? 
val : val * params.alpha; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h index 326a44f..eb7bdd9 100644 --- a/compute/cker/include/cker/operation/LogSoftMax.h +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, const Shape &input_shape, for (int c = 0; c < depth; ++c) { output_data[(i * depth + c) * inner_size + j] = - (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; + (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; } } } @@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, float input_scale, const Sha for (int c = 0; c < depth; ++c) { const float log_prob = - scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; + scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; const int32_t prob_quantized = std::rint(log_prob) + params.zero_point; output_data[(i * depth + c) * inner_size] = - static_cast(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + static_cast(std::max(std::min(clamp_max, prob_quantized), clamp_min)); } } } diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h new file mode 100644 index 0000000..e877f5f --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalAnd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_LOGICAL_AND_H__ +#define __NNFW_CKER_LOGICAL_AND_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template +inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val && in2_val; + } + } + } + } +} + +template +inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = input1_data[t] && input2_data[t]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICAL_AND_H__ diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 5674ff3..ef28684 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap if (!(num_lower_diags <= row_num)) throw std::runtime_error( - "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); + "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); if (!(num_upper_diags <= col_num)) throw std::runtime_error( - "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); + "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init @@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap auto input = input_data + (batch * row_num * col_num + row * col_num); const T band_start = - num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); - const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast(col_num), - row + num_upper_diags + 1); + num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); + const T band_end = num_upper_diags < 0 + ? 
col_num + : std::min(static_cast(col_num), row + num_upper_diags + 1); for (T band_idx = band_start; band_idx < band_end; band_idx++) { diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h index ea3fcac..5dc84d3 100644 --- a/compute/cker/include/cker/operation/MaxPool.h +++ b/compute/cker/include/cker/operation/MaxPool.h @@ -67,10 +67,10 @@ void MaxPool(const PoolParams ¶ms, const Shape &input_shape, const fl int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -79,8 +79,8 @@ void MaxPool(const PoolParams ¶ms, const Shape &input_shape, const fl { int out_offset = NodeOffset(b, ph, pw, output_height, output_width); out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); + out_mat.col(out_offset) + .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); } } } @@ -139,8 +139,8 @@ void MaxPool(const PoolParams ¶ms, const Shape &input_shape, const const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h index c0dbc6d..ddc27b4 100644 --- a/compute/cker/include/cker/operation/OneHot.h +++ b/compute/cker/include/cker/operation/OneHot.h @@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax for (int k = 0; k < suffix_dim_size; ++k, ++output_data) { *output_data = - static_cast(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + static_cast(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; } } } diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h index 5c3a773..d6ccc68 100644 --- a/compute/cker/include/cker/operation/Range.h +++ b/compute/cker/include/cker/operation/Range.h @@ -35,8 +35,8 @@ template inline int GetSize(T start, T limit, T delta) } int size = (std::is_integral::value - ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) - : std::ceil(std::abs((limit - start) / delta))); + ? 
((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); return size; } diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index 2b2e8d3..dbf9381 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -50,7 +50,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape { int r_idx = 0; float tmp_data[4] = { - 0, + 0, }; float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); for (; r_idx <= reduce_size - 32; r_idx += 32) @@ -143,7 +143,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; @@ -319,7 +319,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { const U value = - static_cast(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + static_cast(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; output_data[idx] = static_cast(value); } } @@ -329,7 +329,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { float float_mean = - static_cast(temp_sum[idx]) / static_cast(num_elements_in_axis); + static_cast(temp_sum[idx]) / static_cast(num_elements_in_axis); float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, static_cast(std::numeric_limits::max())); result = std::max(result, static_cast(std::numeric_limits::min())); diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h index 2e4fc62..924e850 100644 --- a/compute/cker/include/cker/operation/ReduceMean.h +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = - reducer(output_data[output_offset], input_data[input_offset], normalizer); + reducer(output_data[output_offset], input_data[input_offset], normalizer); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; } @@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return normalizer; @@ -185,8 +185,8 @@ public: } size_t normalizer = - ReduceSumQuantImpl(input_data, input_shape, resolved_axis_data(), num_resolved_axis, - temp_index_data(), reducer, _temp_sum.data()); + ReduceSumQuantImpl(input_data, input_shape, 
resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); if (num_outputs > 0) { float scale = input_scale / output_scale; @@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca sum_reducer); } +template +void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batch = output_shape.Dims(0); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; + } + } + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 7fc1e91..8d9a749 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t // Bottom right corner. output_data[output_offset + output_x_offset + output_y_offset] = - (output + ((x1y0 + x1y1) / 2)) / 2; + (output + ((x1y0 + x1y1) / 2)) / 2; } } @@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei &x1); int32_t input_offset[4] = { - Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; @@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei { const T *input_ptr = &input_data[d]; *output_ptr++ = static_cast( - input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + - input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); } } } @@ -253,16 +253,16 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); float height_scale = (params.align_corners && params.output_height > 1) - ? (static_cast(input_height - 1) / (params.output_height - 1)) - : (static_cast(input_height) / params.output_height); + ? (static_cast(input_height - 1) / (params.output_height - 1)) + : (static_cast(input_height) / params.output_height); float width_scale = (params.align_corners && params.output_width > 1) - ? (static_cast(input_width - 1) / (params.output_width - 1)) - : (static_cast(input_width) / params.output_width); + ? 
(static_cast(input_width - 1) / (params.output_width - 1)) + : (static_cast(input_width) / params.output_width); ResizeBilinearGenericSmallChannel( - batches, input_height, input_width, depth, params.output_height, params.output_width, - height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h index ab2de94..644fe0a 100644 --- a/compute/cker/include/cker/operation/Select.h +++ b/compute/cker/include/cker/operation/Select.h @@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data, const T *input_y_data, const Shape &output_shape, T *output_data) { const int64_t flatsize = - MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; @@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co const int x_index = SubscriptToIndex(desc_x, b, y, x, c); const int y_index = SubscriptToIndex(desc_y, b, y, x, c); output_data[Offset(extended_output_shape, b, y, x, c)] = - input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; } } } diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h index a072cff..ef97fd5 100644 --- a/compute/cker/include/cker/operation/Slice.h +++ b/compute/cker/include/cker/operation/Slice.h @@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape, : start_b + op_params.size[0]; const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) - ? input_shape.Dims(1) - : start_h + op_params.size[size_count - 3]; + ? input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) - ? input_shape.Dims(2) - : start_w + op_params.size[size_count - 2]; + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) - ? input_shape.Dims(3) - : start_d + op_params.size[size_count - 1]; + ? 
input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; for (int in_b = start_b; in_b < stop_b; ++in_b) { diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 0e0f364..620c1f9 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -65,7 +65,7 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const for (int c = 0; c < depth; ++c) { output_data[i * depth + c] = - std::exp((input_data[i * depth + c] - max) * static_cast(params.beta)) / sum; + std::exp((input_data[i * depth + c] - max) * static_cast(params.beta)) / sum; } } } @@ -163,11 +163,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); sum_of_exps = sum_of_exps + gemmlowp::Rescale( - exp_on_negative_values(scaled_diff_f8)); + exp_on_negative_values(scaled_diff_f8)); } } @@ -178,11 +178,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, // no later adjustment will be needed. int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; int32_t shifted_sum_minus_one = - static_cast((static_cast(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast(1) << 31)); + static_cast((static_cast(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast(1) << 31)); FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); + one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); for (int c = 0; c < depth; ++c) { @@ -190,16 +190,16 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); output_data[i * depth + c] = static_cast( - std::max(std::min(unsat_output, static_cast(255)), static_cast(0))); + std::max(std::min(unsat_output, static_cast(255)), static_cast(0))); } else { diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h index feeb358..aff36e2 100644 --- a/compute/cker/include/cker/operation/SpaceToBatchND.h +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unexte else { const T *in = - input_data + Offset(input_shape, input_batch, - (out_h * block_shape_height + shift_h) - padding_top, - (out_w * block_shape_width + shift_w) - padding_left, 0); + input_data + Offset(input_shape, input_batch, + (out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } 
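A minimal standalone sketch of the thread-partitioning heuristics introduced in the DepthwiseConv.h hunk above, assuming the same split formula; the function names mirror the patch (HowManyConvThreads, the [thread_start, thread_end) ranges handed to DepthwiseConvWorkerTask), but this file and the shapes in main() are illustrative only and not part of the patched sources.

// depthwise_conv_split_sketch.cc -- illustration of the threading heuristics above.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Mirrors HowManyConvThreads() from the DepthwiseConv.h hunk: request roughly one
// thread per 8k scalar multiplications of output work.
int HowManyConvThreadsSketch(int output_flat_size, int filter_height, int filter_width)
{
  constexpr int kMinMulPerThread = 1 << 13; // 8k
  const int num_muls = output_flat_size * filter_height * filter_width;
  return std::max(1, num_muls / kMinMulPerThread);
}

// Mirrors the loop that builds DepthwiseConvWorkerTask objects: split
// thread_dim_size elements into thread_count contiguous [start, end) ranges.
std::vector<std::pair<int, int>> SplitRanges(int thread_dim_size, int thread_count)
{
  std::vector<std::pair<int, int>> ranges;
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    const int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    ranges.emplace_back(thread_start, thread_end);
    thread_start = thread_end;
  }
  return ranges;
}

int main()
{
  // Hypothetical 1x32x32x8 output with a 3x3 filter: 8192 * 9 multiplications,
  // i.e. 9 suggested threads before the ruy max_num_threads / float-path caps.
  const int thread_count = HowManyConvThreadsSketch(1 * 32 * 32 * 8, 3, 3);
  std::printf("suggested threads: %d\n", thread_count);

  // Splitting 10 output rows over 3 threads yields [0,3) [3,6) [6,10).
  for (const auto &r : SplitRanges(/*thread_dim_size=*/10, /*thread_count=*/3))
    std::printf("[%d, %d)\n", r.first, r.second);
  return 0;
}

One thread per ~8k multiplications keeps each task large enough to amortize threadpool dispatch, and advancing by (remaining / threads_left) keeps the range sizes within one element of each other, so the load stays balanced whether the split runs along the batch or the height dimension.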
diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index d5952ae..cdd812a 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da GenerateKey(seed_t, &key, &counter); Fill>( - random::PhiloxRandom(counter, key), &output_t); + random::PhiloxRandom(counter, key), &output_t); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h index 1dcdd9b..4243346 100644 --- a/compute/cker/include/cker/operation/Tile.h +++ b/compute/cker/include/cker/operation/Tile.h @@ -55,7 +55,7 @@ std::pair TileOneDimension(const Shape &in_dimensions, const T *in_dat { int stride_size = 0, tiled_stride_size = 0; std::tie(stride_size, tiled_stride_size) = - TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); copy_from_data += stride_size; copy_to_data += tiled_stride_size; total_stride_size += stride_size; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 9d8cd34..62eb432 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp const int total_size = shrunk_input_shape.FlatSize(); const int non_flatten_size = - Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, + Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, - &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); + &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); assert(non_flatten_params.perm[0] != 0); for (int i = 0; i < total_size; i += non_flatten_size) diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h index 7db3a11..d41f860 100644 --- a/compute/cker/include/cker/operation/TransposeConv.h +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_ (out_y < output_height)) { float input_value = - input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, - filter_x, in_channel)]; + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += - input_value * filter_value; + input_value * filter_value; } } } diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index 912b01a..8c1d31b 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -130,12 +130,12 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << 
params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -192,9 +192,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int16x4_t s1_narrowed = vmovn_s32(s1); const int16x4_t s2_narrowed = vmovn_s32(s2); const int16x8_t s = - vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); - const uint8x8_t clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -205,12 +205,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -387,7 +387,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(a1, a2); // vaddq auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -395,7 +395,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, { auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } @@ -441,7 +441,7 @@ inline void BinaryOpScalarBroadcast(int size, const 
BinaryArithmeticOpParam &par auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(broadcast_value_dup, a2); auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -449,13 +449,13 @@ inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &par { auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } using BinaryOpImplFloatFuncs = - std::pair; + std::pair; template inline BinaryOpImplFloatFuncs @@ -514,23 +514,22 @@ inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast(quant8_sum(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } else { BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast(AddElementwiseQuant8), - static_cast(AddScalarBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast(AddElementwiseQuant8), + static_cast(AddScalarBroadcastQuant8)); } } @@ -542,7 +541,7 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function fn = - [](const float &a, const float &b) -> float { return a + b; }; + [](const float &a, const float &b) -> float { return a + b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -550,10 +549,10 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { auto implFuncs = getBinaryOpWithActivationImplFloat(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } } @@ -580,14 +579,14 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat>(params); + 
getBinaryOpWithActivationImplFloat>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } else { const std::function fn = - [](const float &a, const float &b) -> float { return a - b; }; + [](const float &a, const float &b) -> float { return a - b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -599,11 +598,11 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32_t clamped_output = std::min( - params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); return clamped_output; } @@ -652,8 +651,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + const auto clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -663,12 +662,11 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t input1_val = params.input1_offset + input1_data[i]; const int32_t input2_val = params.input2_offset + input2_data[i]; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast(clamped_output); } } @@ -711,22 +709,21 @@ inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast(quant8_mul(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, params.broadcast_category == 
BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast(MulElementwiseQuant8), - static_cast(MulSimpleBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast(MulElementwiseQuant8), + static_cast(MulSimpleBroadcastQuant8)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -738,16 +735,16 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { // TODO: Use GetBinaryArithmeticFn const std::function fn = - [](const float &a, const float &b) -> float { return a * b; }; + [](const float &a, const float &b) -> float { return a * b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } auto implFuncs = getBinaryOpWithActivationImplFloat(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -760,7 +757,7 @@ inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); #else const std::function fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); #endif // __aarch64__ @@ -781,7 +778,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat>(params); + getBinaryOpWithActivationImplFloat>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } @@ -789,7 +786,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh #endif // __aarch64__ { const std::function fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h index 0f62014..26fc443 100644 --- a/compute/cker/include/cker/operation/optimized/Conv.h +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -48,7 +48,7 @@ struct GemmlowpOutputPipeline typedef std::tuple, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; + Pipeline; static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t 
output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max) @@ -106,7 +106,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 const int filter_height = filter_shape.Dims(1); const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = - stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_dilated_im2col) { assert(im2col_data); @@ -141,7 +141,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 // the other calls commented out. This is a partial rollback of cl/196819423. // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); const int gemm_input_cols = - gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); + gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); const int filter_rows = filter_shape.Dims(0); // See b/79927784. // const int filter_cols = FlatSizeSkipDim(filter_shape, 0); @@ -156,17 +156,17 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 assert(bias_shape.FlatSize() == output_rows); UNUSED_RELEASE(bias_shape); gemmlowp::MatrixMap filter_matrix( - filter_data, filter_rows, filter_cols); + filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); + gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap output_matrix(output_data, output_rows, output_cols); const auto &output_pipeline = - GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, - output_pipeline); + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); } } // namespace optimized @@ -202,10 +202,10 @@ public: T *output_data, int output_height, int output_width) { const bool is_1x1_kernel = - (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); const bool is_same_height_width = - (filter_height == input_height && filter_width == input_width && pad_width == 0 && - pad_height == 0); + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); if (is_1x1_kernel || is_same_height_width) { // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h new file mode 100644 index 0000000..d439793 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of float DepthwiseConv + +template +struct FloatDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. 
+ for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. 
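
// The NEON specializations in this file all share the same control pattern:
// an unrolling ladder that consumes the widest chunk that still fits, then
// falls through to narrower chunks and finally to a scalar remainder loop.
// A minimal portable sketch of that pattern, using the hypothetical helpers
// accumulate4/accumulate1 (not taken from this patch):
#include <cstddef>

inline void accumulate1(const float *in, const float *flt, float *acc) { *acc += *in * *flt; }

inline void accumulate4(const float *in, const float *flt, float *acc)
{
  // Stands in for a vld1q_f32 / vmlaq_f32 / vst1q_f32 triple.
  for (int k = 0; k < 4; ++k)
    acc[k] += in[k] * flt[k];
}

inline void accumulate_row(std::size_t n, const float *in, const float *flt, float *acc)
{
  std::size_t i = 0;
  // Widest step first: 8 values per iteration (two 4-wide ops).
  for (; i + 8 <= n; i += 8)
  {
    accumulate4(in + i, flt + i, acc + i);
    accumulate4(in + i + 4, flt + i + 4, acc + i + 4);
  }
  // Then 4 values per iteration.
  for (; i + 4 <= n; i += 4)
    accumulate4(in + i, flt + i, acc + i);
  // Scalar remainder, exactly like the "one at a time" loops in these kernels.
  for (; i < n; ++i)
    accumulate1(in + i, flt + i, acc + i);
}
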
+ for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters + float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3); + local_filter_ptr += 16; + // Load the inputs + float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0); + float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1); + float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2); + float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3); + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + // Multiply-accumulate + acc_0 = vmlaq_f32(acc_0, input_0, filter_0); + acc_1 = vmlaq_f32(acc_1, input_1, filter_1); + acc_2 = vmlaq_f32(acc_2, input_2, filter_2); + acc_3 = vmlaq_f32(acc_3, input_3, filter_3); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. 
+ for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +// Note this implementation is very slow for input_depths < 8 +// (e.g. comparable to reference implementation) see, specializations for +// input_depth=3 below. +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) + { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. 
+ for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x2_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1_f32(filter_ptr + 2 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x2_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate for each input channel there 2 outputs + acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 6; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // NOTE: we only want 3 values, so we read it as two ops where + // the second op just duplicates the lane + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x4_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate all outputs. + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 12; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5); + float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6); + float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5); + float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6); + float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val); + acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val); + acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
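
// Every FloatDepthwiseConvKernel specialization above computes the same
// result with different vector layouts: for each output pixel, each input
// channel is multiplied against depth_multiplier filter values and added into
// acc_buffer at index ic * depth_multiplier + m. A portable reference sketch
// of that contract (illustrative only, not part of the patch):
inline void DepthwiseConvKernelReference(int num_output_pixels, int input_depth,
                                         int depth_multiplier, const float *input_ptr,
                                         int input_ptr_increment, const float *filter_ptr,
                                         float *acc_buffer_ptr)
{
  for (int outp = 0; outp < num_output_pixels; ++outp)
  {
    const float *filter = filter_ptr;
    const float *input = input_ptr;
    for (int ic = 0; ic < input_depth; ++ic)
    {
      const float input_val = *input++;
      for (int m = 0; m < depth_multiplier; ++m)
      {
        // One output channel per (input channel, multiplier) pair.
        *acc_buffer_ptr++ += *filter++ * input_val;
      }
    }
    input_ptr += input_ptr_increment;
  }
}
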
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x2_t filter = vld1_f32(filter_ptr); + float32x4_t filter_x4 = vcombine_f32(filter, filter); + int outp = 0; + + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x2_t input_1 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x2_t input_2 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x4_t input = vcombine_f32(input_1, input_2); + + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter_x4); + + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x2_t input = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmla_f32(acc, input, filter); + + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x4_t filter = vld1q_f32(filter_ptr); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input = vld1q_f32(input_ptr); + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, + const float *input_data, int pad_width, int depth_multiplier, + int filter_width, const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float *acc_buffer) +{ + // Sanity check parameters. 
This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment, + filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
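
// The out_x_loop_start/end arithmetic above is a ceiling division that keeps
// in_x = out_x * stride - pad_width + dilation_factor * filter_x inside
// [0, input_width). A small self-checking sketch of the general formula used
// in the else branch (illustrative only):
constexpr int OutXLoopStart(int pad_width, int dilation_factor, int filter_x, int stride)
{
  return (pad_width - dilation_factor * filter_x + stride - 1) / stride;
}
constexpr int OutXLoopEnd(int pad_width, int input_width, int dilation_factor, int filter_x,
                          int stride)
{
  return (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
}
// stride = 2, pad_width = 1, input_width = 5, dilation = 1:
//   filter_x = 0 -> in_x = 2 * out_x - 1, valid for out_x in [1, 3)
//   filter_x = 2 -> in_x = 2 * out_x + 1, valid for out_x in [0, 2)
static_assert(OutXLoopStart(1, 1, 0, 2) == 1 && OutXLoopEnd(1, 5, 1, 0, 2) == 3, "");
static_assert(OutXLoopStart(1, 1, 2, 2) == 0 && OutXLoopEnd(1, 5, 1, 2, 2) == 2, "");
// The caller then clamps these bounds to [out_x_buffer_start, out_x_buffer_end).
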
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const float *input_data, + int pad_width, int depth_multiplier, int filter_width, + const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + float *acc_buffer) +{ + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const float *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const float input_val = *input_ptr++; + for (int m = 0; m < depth_multiplier; m++) + { + const float filter_val = *filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const float *bias_data, float *acc_buffer) +{ + // TODO(benoitjacob): This might need optimized specializations + // for small output_depth values, if that ever becomes an important + // case (like it was for some quantized DepthwiseConv cases). + for (int i = 0; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +// DepthwiseConv can run with multi threads on the dim specified by thread_dim. +// Each thread processes output elements on dim, thread_dim, in the range of +// [thread_start, thread_end). +// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it +// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :]. 
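
// The comment above describes the threading contract of DepthwiseConvImpl
// below. A sketch of how a caller might split the row axis (thread_dim == 1)
// across workers; the even split and std::thread usage are assumptions of
// this example, not something the patch prescribes:
#include <algorithm>
#include <thread>
#include <vector>

template <typename Fn> // Fn is invoked as fn(thread_start, thread_end)
void RunRowsInParallel(int output_height, int num_threads, Fn fn)
{
  std::vector<std::thread> workers;
  const int rows_per_thread = (output_height + num_threads - 1) / num_threads;
  for (int t = 0; t < num_threads; ++t)
  {
    const int thread_start = t * rows_per_thread;
    const int thread_end = std::min(output_height, thread_start + rows_per_thread);
    if (thread_start >= thread_end)
      break;
    // Each worker computes output rows [thread_start, thread_end).
    workers.emplace_back([=] { fn(thread_start, thread_end); });
  }
  for (auto &w : workers)
    w.join();
}
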
+inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + int thread_start, int thread_end, int thread_dim) +{ + UNUSED_RELEASE(bias_shape); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(thread_dim == 0 || thread_dim == 1); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + static const int kAccBufferMaxSize = 4832; + float acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + + UNUSED_RELEASE(kAccBufferActualSize); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + FloatDepthwiseConvAccumRow; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. + + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. 
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = FloatDepthwiseConvAccumRowGeneric; + } + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + float *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. 
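
// Around this point each out_x_buffer tile is processed the same way: the
// accumulators are seeded with the bias, every valid filter row is
// accumulated, and the result is clamped to the activation range and stored.
// With kAccBufferMaxSize = 4832 and, say, output_depth = 96, one tile covers
// 4832 / 96 = 50 output pixels. A scalar sketch of that per-tile sequence,
// with a hypothetical accumulate_rows callback standing in for the
// row_accum_func loop (illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <functional>
#include <vector>

inline void ProcessTile(int num_output_pixels, int output_depth, const float *bias_data,
                        float activation_min, float activation_max,
                        const std::function<void(float *)> &accumulate_rows, float *output_ptr)
{
  std::vector<float> acc(static_cast<std::size_t>(num_output_pixels) * output_depth);
  // Seed with the bias so it does not have to be added later.
  for (int p = 0; p < num_output_pixels; ++p)
    std::memcpy(&acc[static_cast<std::size_t>(p) * output_depth], bias_data,
                sizeof(float) * output_depth);
  // One call per valid filter_y row in the real loop.
  accumulate_rows(acc.data());
  // Clamp to the activation range and store.
  for (std::size_t i = 0; i < acc.size(); ++i)
    output_ptr[i] = std::max(activation_min, std::min(activation_max, acc[i]));
}
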
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating. Now store to destination. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +// TODO(benoitjacob) optimized code goes here +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) + { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) + { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + for (int k = 0; k < 4; k++) + { + acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) + { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) + { + float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + float acc = acc_buffer[i]; + acc = std::max(output_activation_min, std::min(output_activation_max, acc)); + + *output_ptr++ = acc; + } + } + } + output_ptr += batch_step; + } +} + +} // nnfw +} // cker +} // optimized + +#endif diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h index d383b12..5ca56fd 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -32,6 +32,8 @@ namespace cker { namespace optimized { +namespace depthwise_conv +{ // Implementation of quantized DepthwiseConv @@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel for (int i = 0; i < 2; i++) { filter[i] = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); } // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) @@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel for (int i = 0; i < 2; i++) { acc[0].val[i] = - vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); acc[1].val[i] = - vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel for (int i = 0; i < 2; i++) { acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t 
*input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel // We will do that by register-level table-look-up using VTBL instructions. // Here we prepare the registers containing the table-lookup indices. 
static const uint8_t dup3_indices_array[3][8] = { - {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; uint8x8_t dup3_indices[3]; for (int i = 0; i < 3; i++) { @@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel for (int j = 0; j < 3; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 3; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast(filter_val) * input_val; } local_filter_ptr += 3; @@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel for (int j = 0; j < 2; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); } // Store the accumulators back to acc_buffer. for (int i = 0; i < 2; i++) @@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel for (; ic < input_depth; ic++) { // Load the inputs. 
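The dup3_indices_array above encodes, for depth multiplier 3, how eight input bytes are expanded to twenty-four so that each value lines up with three consecutive filter channels; VTBL then performs that expansion in registers. A portable sketch of the same expansion (Dup3 is a hypothetical helper, not part of the header):

#include <cstdint>
#include <cstdio>

// Portable model of the 8 -> 24 byte expansion that the VTBL index tables encode:
// each input byte is repeated 3 times so it pairs with 3 consecutive filter channels.
void Dup3(const uint8_t in[8], uint8_t out[24])
{
  static const uint8_t dup3_indices[3][8] = {
    {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
  for (int reg = 0; reg < 3; ++reg)
    for (int lane = 0; lane < 8; ++lane)
      out[reg * 8 + lane] = in[dup3_indices[reg][lane]];
}

int main()
{
  const uint8_t in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint8_t out[24];
  Dup3(in, out);
  for (int i = 0; i < 24; ++i)
    std::printf("%d ", out[i]); // 10 10 10 11 11 11 12 12 12 ...
  std::printf("\n");
  return 0;
}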
- const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 2; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast(filter_val) * input_val; } local_filter_ptr += 2; @@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; - const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++ + filter_offset; *acc_buffer_ptr++ += static_cast(filter_val) * input_val; } input_ptr += input_ptr_increment; @@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel { acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel { 
uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast(input_u8 + input_offset); + int16_t input = static_cast(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[4]; for (int i = 0; i < 4; i++) @@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast(input_u8 + input_offset); + int16_t input = static_cast(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast(input_u8 + input_offset); + int16_t input = static_cast(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; // Load the filters, add filter_offset. const uint8x8_t filter_u8 = vld1_u8(filter_ptr); const int16x8_t filter = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); // Handle one output pixel at a time. 
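These specializations lean on vmlal_s16, the widening multiply-accumulate: four int16 lanes from the input and filter are multiplied and the int32 products are added into the accumulator lanes, so 16-bit products cannot overflow before accumulation. A scalar model of that operation (illustration only, not the intrinsic itself):

#include <cstdint>
#include <cstdio>

// Scalar model of vmlal_s16: acc[lane] += (int32)a[lane] * (int32)b[lane] for 4 lanes.
// The widening to 32 bits happens before the addition, so int16 products cannot overflow.
void MlalS16(int32_t acc[4], const int16_t a[4], const int16_t b[4])
{
  for (int lane = 0; lane < 4; ++lane)
    acc[lane] += static_cast<int32_t>(a[lane]) * static_cast<int32_t>(b[lane]);
}

int main()
{
  int32_t acc[4] = {1, 1, 1, 1};
  const int16_t a[4] = {-32768, 300, -7, 0};
  const int16_t b[4] = {2, -300, -7, 12345};
  MlalS16(acc, a, b);
  for (int lane = 0; lane < 4; ++lane)
    std::printf("%d\n", acc[lane]); // -65535, -89999, 50, 1
  return 0;
}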
for (int outp = 0; outp < num_output_pixels; outp++) { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast(input_u8 + input_offset); + int16_t input = static_cast(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[2]; for (int i = 0; i < 2; i++) @@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel input_u16 = vset_lane_u16((reinterpret_cast(input_ptr))[0], input_u16, 1); input_ptr += input_ptr_increment; const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); // Multiply-accumulate. @@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d else { out_x_loop_start_unclampled = - (pad_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; out_x_loop_end_unclampled = - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; } } else @@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d const uint8_t *input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; QuantizedDepthwiseConvKernel::Run( - num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, - input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); filter_base_ptr += output_depth; } } @@ -1690,11 +1692,11 @@ inline void 
QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto const uint8_t *filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; @@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, - uint8_t *output_data) + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) { (void)bias_shape; const int stride_width = params.stride_width; @@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); assert(kAccBufferActualSize <= kAccBufferMaxSize); assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + UNUSED_RELEASE(kAccBufferActualSize); // row_accum_func will point to the core accumulation function to be used @@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ { \ row_accum_func = \ - QuantizedDepthwiseConvAccumRow; \ + QuantizedDepthwiseConvAccumRow; \ } #ifdef USE_NEON @@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); // Now that we have determined row_accum_func, we can start work. 
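The out_x_loop_start / out_x_loop_end expressions above clamp the output-x range to the positions whose sampled input column lands inside the image; the (x + stride - 1) / stride form is an integer ceiling division, valid while the numerator is non-negative. A small standalone sketch of that bound computation (CeilDiv and OutXRange are illustrative names, not part of the header):

#include <algorithm>
#include <cassert>
#include <cstdio>

// Ceiling division for non-negative numerators, as used in the loop-bound math.
inline int CeilDiv(int num, int den)
{
  assert(num >= 0 && den > 0);
  return (num + den - 1) / den;
}

// First/last output x (exclusive end) whose input column
//   in_x = out_x * stride - pad_width + dilation_factor * filter_x
// lies inside [0, input_width), before clamping to the current output buffer.
void OutXRange(int stride, int dilation_factor, int pad_width, int input_width, int filter_x,
               int *out_x_start, int *out_x_end)
{
  *out_x_start = CeilDiv(std::max(0, pad_width - dilation_factor * filter_x), stride);
  *out_x_end = CeilDiv(std::max(0, pad_width + input_width - dilation_factor * filter_x), stride);
}

int main()
{
  int start = 0, end = 0;
  // stride 2, dilation 1, pad 1, input width 7, filter tap 0:
  // in_x = 2 * out_x - 1, valid for out_x in [1, 4).
  OutXRange(2, 1, 1, 7, 0, &start, &end);
  std::printf("[%d, %d)\n", start, end); // [1, 4)
  return 0;
}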
- uint8_t *output_ptr = output_data; - for (int b = 0; b < batches; ++b) + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + uint8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = row_start; out_y < row_end; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; const int filter_y_start = - std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); const int filter_y_end = - std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / - dilation_height_factor); + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = - std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activation that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } - // Finished accumulating int32 values. Now need to convert them to + // Finished accumulating int32_t values. Now need to convert them to // the final 8bit form and store them. 
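DepthwiseConvGeneral now computes only a [thread_start, thread_end) slice, either along the batch axis (thread_dim == 0) or along the output-row axis (thread_dim == 1), with output_ptr_offset and batch_step keeping the writes aligned with that slice. One plausible way a caller could split the row axis across workers is sketched below; EvenSplit is a hypothetical helper, not part of this header:

#include <cstdio>

// Hypothetical helper: split [0, total) into roughly equal [start, end) slices,
// one per thread, matching the thread_start/thread_end/thread_dim convention above.
void EvenSplit(int total, int num_threads, int thread_index, int *start, int *end)
{
  const int base = total / num_threads;
  const int remainder = total % num_threads;
  // The first `remainder` threads take one extra element.
  *start = thread_index * base + (thread_index < remainder ? thread_index : remainder);
  *end = *start + base + (thread_index < remainder ? 1 : 0);
}

int main()
{
  const int output_height = 10, num_threads = 3;
  for (int t = 0; t < num_threads; ++t)
  {
    int start = 0, end = 0;
    EvenSplit(output_height, num_threads, t, &start, &end);
    // Each worker would then call DepthwiseConvGeneral(..., start, end, /*thread_dim=*/1).
    std::printf("thread %d: rows [%d, %d)\n", t, start, end);
  }
  return 0;
}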
const int num_output_values = output_depth * num_output_pixels; int i = 0; @@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape } } } + output_ptr += batch_step; } } +} // namespace depthwise_conv + +// template +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +// TODO Use below codes +// // Dispatch to dot-product 3x3 kernels when supported. +// +// ruy::Context *ruy_context = cpu_backend_context->ruy_context(); +// const bool has_dot_product_instructions = +// ruy_context != nullptr && +// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone; +// if (has_dot_product_instructions) +// { +// using optimized_ops::depthwise_conv::DotProduct3x3KernelType; +// DotProduct3x3KernelType kernel_type = +// optimized_ops::depthwise_conv::CategorizeDotProductKernel( +// input_shape, filter_shape, params); +// if (kernel_type != DotProduct3x3KernelType::kNone) +// { +// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3< +// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data, +// filter_shape, filter_data, +// bias_shape, +// bias_data, output_shape, +// output_data); +// return; +// } +// } +// +// // Dispatch to non-dot-product 3x3 kernels when supported. +// +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
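The asserted parameters above are passed to UNUSED_RELEASE, defined as (void)(a) (see the Shape header added later in this patch), so that when NDEBUG strips the asserts the otherwise-unread locals and parameters do not trigger unused-variable warnings. A minimal illustration of the pattern:

#include <cassert>

#define UNUSED_RELEASE(a) (void)(a)

int DividePositive(int num, int den)
{
  const bool valid = (num >= 0 && den > 0);
  assert(valid);
  // In release builds (NDEBUG) the assert disappears; the cast-to-void
  // keeps `valid` from being reported as an unused variable.
  UNUSED_RELEASE(valid);
  return num / den;
}

int main() { return DividePositive(9, 3) == 3 ? 0 : 1; }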
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width, +// stride_height, dilation_width_factor, +// dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) +// { +// depthwise_conv::DepthwiseConv3x3Filter( +// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, +// output_shape, output_data, thread_start, thread_end, thread_dim); +// return; +// } +#endif + + depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); +} + } // namespace optimized } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h index ae1f9e7..f5edc94 100644 --- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h +++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h @@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, { const int bottom_row_elements = (bottom_padding * kwidth * in_depth); const int bottom_start = - output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); } } @@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T * for (int batch = 0; batch < batches; ++batch) { const T zero_byte = - zero_bytes_len > 1 ? static_cast(zero_bytes[batch]) : static_cast(zero_bytes[0]); + zero_bytes_len > 1 ? 
static_cast(zero_bytes[batch]) : static_cast(zero_bytes[0]); for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h index e8ffd40..1b3020d 100644 --- a/compute/cker/include/cker/operation/reference/BatchMatMul.h +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha { const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - float *out_ptr = - output_data + - ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index f7e3924..93cb21e 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -56,17 +56,16 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < size; i++) { - output_data[i] = - ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), - params.float_activation_min, params.float_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max); } } template inline void BroadcastBinaryArithmeticOpSlowQuant8( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, - const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, - const std::function &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, + const std::function &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -100,10 +99,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + ActivationFunctionWithMinMax( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -143,9 +142,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ 
-154,9 +153,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m template <> inline void BroadcastBinaryArithmeticOpSlow( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, - const Shape &input2_shape, const float *input2_data, const Shape &output_shape, - float *output_data, const std::function &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -171,10 +170,10 @@ inline void BroadcastBinaryArithmeticOpSlow( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.float_activation_min, params.float_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 86e8b51..43a5bf2 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -98,8 +98,8 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float bias_value = bias_data[out_channel]; } output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - ActivationFunctionWithMinMax(total + bias_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); } } } @@ -183,7 +183,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast(acc); + static_cast(acc); } } } diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 7b4ff20..62eeaf6 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -52,7 +52,7 @@ void MakeRuyMatrix(const MatrixParams ¶ms, DataPointer data_ptr, ruy::Matrix *dst, bool use_caching = false) { ruy::Order ruy_order = - params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; + params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. // It does care whether we assign to it a Scalar* or a const Scalar*. 
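The reference kernels above address NHWC tensors through a single flattened index: Offset(shape, b, y, x, c) evaluates to ((b * H + y) * W + x) * C + c, so the innermost channel index is contiguous in memory. A standalone sketch of that row-major indexing (FlatOffset is an illustrative name):

#include <cassert>
#include <cstdio>

// Row-major (NHWC) flattened index, matching Offset(shape, i0, i1, i2, i3):
// ((i0 * dims[1] + i1) * dims[2] + i2) * dims[3] + i3.
inline int FlatOffset(const int dims[4], int i0, int i1, int i2, int i3)
{
  assert(i0 >= 0 && i0 < dims[0] && i1 >= 0 && i1 < dims[1]);
  assert(i2 >= 0 && i2 < dims[2] && i3 >= 0 && i3 < dims[3]);
  return ((i0 * dims[1] + i1) * dims[2] + i2) * dims[3] + i3;
}

int main()
{
  const int dims[4] = {2, 3, 4, 5}; // N, H, W, C
  // Stepping the channel index moves by 1; W by C; H by W*C; N by H*W*C.
  std::printf("%d %d %d %d\n", FlatOffset(dims, 0, 0, 0, 1), FlatOffset(dims, 0, 0, 1, 0),
              FlatOffset(dims, 0, 1, 0, 0), FlatOffset(dims, 1, 0, 0, 0)); // 1 5 20 60
  return 0;
}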
diff --git a/compute/ruy/CMakeLists.txt b/compute/ruy/CMakeLists.txt new file mode 100644 index 0000000..d98ee1c --- /dev/null +++ b/compute/ruy/CMakeLists.txt @@ -0,0 +1,11 @@ +nnfw_find_package(Ruy REQUIRED) + +add_library(nnfw_lib_ruy INTERFACE) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy_instrumentation) +target_compile_definitions(nnfw_lib_ruy INTERFACE USE_RUY_GEMV) +if(PROFILE_RUY) + target_link_libraries(nnfw_lib_ruy INTERFACE ruy_profiler) +endif(PROFILE_RUY) + +target_include_directories(nnfw_lib_ruy INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/compute/ruy/include/ruy/NeonTensorUtils.h b/compute/ruy/include/ruy/NeonTensorUtils.h new file mode 100644 index 0000000..fb8b0a3 --- /dev/null +++ b/compute/ruy/include/ruy/NeonTensorUtils.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_TENSOR_UTILS_H__ +#define __NNFW_RUY_NEON_TENSOR_UTILS_H__ + +#include "ruy/neon/neon_check.h" + +#ifdef USE_NEON + +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw +{ +namespace ruy +{ + +inline bool NeonIsZeroVector(const float *vector, int v_size) +{ + // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot + // use the main vectorized loop, and we need to process sequentially. + // postamble_start shows the start index where this should happen. + const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + const float32x4_t zero_x4_float = vmovq_n_f32(0.0f); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) + { + const float32x4_t i_x4_float = vld1q_f32(vector + v); + uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float); + if (vgetq_lane_u32(cmp_result, 0) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 1) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 2) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 3) == 0) + return false; + } + + // Postamble loop + for (int v = postamble_start; v < v_size; ++v) + { + if (vector[v] != 0.0) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // USE_NEON + +#endif // __NNFW_RUY_NEON_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/PortableTensorUtils.h b/compute/ruy/include/ruy/PortableTensorUtils.h new file mode 100644 index 0000000..2d2c36c --- /dev/null +++ b/compute/ruy/include/ruy/PortableTensorUtils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ +#define __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ + +namespace nnfw +{ +namespace ruy +{ + +inline bool PortableIsZeroVector(const float *vector, int v_size) +{ + for (int i = 0; i < v_size; ++i) + { + if (*vector++ != 0.0f) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/RuySupport.h b/compute/ruy/include/ruy/RuySupport.h new file mode 100644 index 0000000..7086a96 --- /dev/null +++ b/compute/ruy/include/ruy/RuySupport.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_RUY_SUPPORT_H__ +#define __NNFW_RUY_RUY_SUPPORT_H__ + +#include +#include +#include +#include +#include "Types.h" + +namespace nnfw +{ +namespace ruy +{ +namespace ruy_support +{ + +inline ::ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy) +{ + switch (cache_policy) + { + case CachePolicy::kNeverCache: + return ::ruy::CachePolicy::kNeverCache; + case CachePolicy::kCacheIfLargeSpeedup: + return ::ruy::CachePolicy::kCacheIfLargeSpeedup; + case CachePolicy::kAlwaysCache: + return ::ruy::CachePolicy::kAlwaysCache; + default: + assert(false); + return ::ruy::CachePolicy::kNeverCache; + } +} + +template +void MakeRuyMatrix(const MatrixParams ¶ms, DataPointer data_ptr, + ::ruy::Matrix *dst, bool use_caching = false) +{ + ::ruy::Order ruy_order = + params.order == Order::kColMajor ? ::ruy::Order::kColMajor : ::ruy::Order::kRowMajor; + ::ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); + // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. + // It does care whether we assign to it a Scalar* or a const Scalar*. + dst->set_data(data_ptr); + dst->set_zero_point(params.zero_point); + if (use_caching) + { + dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy)); + } +} + +template +void MakeRuyMulParams(const GemmParamsType ¶ms, RuySpecType *ruy_mul_params) +{ + // This validation has already been performed by the Gemm API entry point, + // but it doesn't hurt to test specifically this again here, where it's + // being used. 
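NeonIsZeroVector above rounds the vector length down to a multiple of the NEON lane count with v_size - (v_size & (kFloatWeightsPerNeonLane - 1)) and leaves the tail to a scalar postamble, returning early as soon as any element is nonzero; PortableIsZeroVector is the fallback selected by NEON_OR_PORTABLE. A portable sketch of the same main-loop/postamble split (illustration only, not part of either header):

#include <cstdio>

// Portable model of the main-loop/postamble split used by NeonIsZeroVector.
bool IsZeroVectorSketch(const float *vector, int v_size)
{
  constexpr int kLanes = 4; // kFloatWeightsPerNeonLane
  // Round v_size down to a multiple of kLanes (kLanes is a power of two).
  const int postamble_start = v_size - (v_size & (kLanes - 1));
  for (int v = 0; v < postamble_start; v += kLanes)
  {
    // The NEON version checks all four lanes of vceqq_f32 here.
    for (int lane = 0; lane < kLanes; ++lane)
      if (vector[v + lane] != 0.0f)
        return false;
  }
  for (int v = postamble_start; v < v_size; ++v) // scalar postamble
    if (vector[v] != 0.0f)
      return false;
  return true;
}

int main()
{
  const float a[6] = {0, 0, 0, 0, 0, 0};
  const float b[6] = {0, 0, 0, 0, 0, 1};
  std::printf("%d %d\n", IsZeroVectorSketch(a, 6), IsZeroVectorSketch(b, 6)); // 1 0
  return 0;
}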
+ ValidateGemmParams(params); + + ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint); + ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent); + ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel); + ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel); + ruy_mul_params->set_bias(params.bias); + ruy_mul_params->set_clamp_min(params.clamp_min); + ruy_mul_params->set_clamp_max(params.clamp_max); +} + +} // namespace ruy_support +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_RUY_SUPPORT_H__ diff --git a/compute/ruy/include/ruy/Shape.h b/compute/ruy/include/ruy/Shape.h new file mode 100644 index 0000000..981c5b4 --- /dev/null +++ b/compute/ruy/include/ruy/Shape.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_SHAPE_H__ +#define __NNFW_RUY_SHAPE_H__ + +#include +#include +#include +#include + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace ruy +{ + +class Shape +{ +public: + // Shapes with dimensions up to 5 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 5; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. + Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? 
_dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. + inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + + template inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list init_list) + { + BuildFrom>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + assert(dim >= 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value. + Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer{nullptr}; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +template +int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2, Args... 
args) +{ + assert(shape1.Dims(index1) == shape2.Dims(index2)); + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + return MatchingDim(shape1, index1, args...); +} + +inline Shape GetShape(const std::vector &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int Offset(const Shape &shape, int *index) +{ + return Offset(shape, index[0], index[1], index[2], index[3]); +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +template inline bool checkMatching(const Shape &shape, Ts... check_shapes) +{ + const Shape check_shapes_array[sizeof...(Ts)] = {std::forward(check_shapes)...}; + for (const auto &check_shape : check_shapes_array) + { + // Check matching of shapes except the case of that two shapes can be scalar + if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 || + check_shape.FlatSize() != 1) + { + if (shape.DimensionsCount() != check_shape.DimensionsCount()) + { + return false; + } + for (int i = 0; i < shape.DimensionsCount(); ++i) + { + if (shape.Dims(i) != check_shape.Dims(i)) + { + return false; + } + } + } + } + return true; +} + +struct UNUSED_ALL +{ + template UNUSED_ALL(Args const &...) {} +}; +template inline int MatchingFlatSize(const Shape &shape, Ts... 
check_shapes) +{ + UNUSED_ALL{check_shapes...}; + assert(checkMatching(shape, std::forward(check_shapes)...)); + return shape.FlatSize(); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1); +} + +inline int MatchingElementsSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + const int size_1 = shape.FlatSize(); + const int size_2 = check_shape_0.FlatSize(); + const int size_3 = check_shape_1.FlatSize(); + assert(size_1 == size_2); + assert(size_2 == size_3); + UNUSED_RELEASE(size_2); + UNUSED_RELEASE(size_3); + return size_1; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_SHAPE_H__ diff --git a/compute/ruy/include/ruy/TensorUtils.h b/compute/ruy/include/ruy/TensorUtils.h new file mode 100644 index 0000000..149037c --- /dev/null +++ b/compute/ruy/include/ruy/TensorUtils.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TENSOR_UTILS_H__ +#define __NNFW_RUY_TENSOR_UTILS_H__ + +#include "ruy/PortableTensorUtils.h" +#include "ruy/NeonTensorUtils.h" + +namespace nnfw +{ +namespace ruy +{ + +inline bool IsZeroVector(const float *vector, int v_size) +{ + return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/Types.h b/compute/ruy/include/ruy/Types.h new file mode 100644 index 0000000..b19b597 --- /dev/null +++ b/compute/ruy/include/ruy/Types.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TYPES_H__ +#define __NNFW_RUY_TYPES_H__ + +#include +#include +#include +#include +#include +#include "Shape.h" + +namespace nnfw +{ +namespace ruy +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, + kTanh = 4, + kSigmoid = 6, +}; + +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + bool is_replaced_weights{false}; +}; + +struct FullyConnectedParams +{ + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + float weights_scale; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params - no one use this params, but ruy might use them later. + float float_activation_min; + float float_activation_max; + // Mark the operands as cacheable if they are unchanging, e.g. weights. + bool lhs_cacheable; + bool rhs_cacheable; + // FullyConnectedWeightsFormat weights_format; +}; + +enum class Order +{ + kColMajor, + kRowMajor +}; + +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + +// MatrixParams encapsulates the parameters that Gemm needs about each +// matrix, besides the buffer data pointer. +// Compare to ruy::Matrix, which also encapsulates the data pointer. +// Rationale for leaving the data pointer out of here: doing so +// requires complicated const-correctness mechanics. See +// ruy::ConstCheckingPtr. +template struct MatrixParams +{ + // Storage layout order. For now we only do plain linear non-strided + // layout. It would be easy to support a stride if needed. + Order order = Order::kColMajor; + // Number of rows of the matrix. + int rows = 0; + // Number of columns of the matrix. + int cols = 0; + // The zero_point, i.e. which Scalar value is to be interpreted as zero. + // When Scalar is floating-point, this must be 0. + Scalar zero_point = 0; + // When the data pointed to by this matrix is constant data, so that it is + // valid to assume that equality of pointers implies equality of data, + // a CachePolicy may be used instead of the default kNeverCache, + // which will enable ruy to take advantage of this constancy of the data to + // cache the packing work, which can be a large speedup in matrix*vector + // and other narrow shapes. + CachePolicy cache_policy = CachePolicy::kNeverCache; +}; + +// Enumeration of broad categories of Gemm. 
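A caller describes each GEMM operand with a MatrixParams: storage order, rows, cols, zero point, and a cache policy that stays kNeverCache except for constant data such as weights, where DefaultCachePolicy(true) (defined later in this header) requests caching of the packing work. A usage sketch follows; the local enum and struct definitions are simplified stand-ins so the snippet compiles on its own:

#include <cstdint>
#include <cstdio>

// Simplified stand-ins mirroring the declarations above, so the sketch is self-contained.
enum class Order { kColMajor, kRowMajor };
enum class CachePolicy : std::uint8_t { kNeverCache, kCacheIfLargeSpeedup, kAlwaysCache };
template <typename Scalar> struct MatrixParams
{
  Order order = Order::kColMajor;
  int rows = 0;
  int cols = 0;
  Scalar zero_point = 0;
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};
inline CachePolicy DefaultCachePolicy(bool is_constant_data)
{
  return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache;
}

int main()
{
  // Constant uint8 weights of a 64x128 fully connected layer, zero point 128.
  MatrixParams<std::uint8_t> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.rows = 64;
  lhs_params.cols = 128;
  lhs_params.zero_point = 128;
  lhs_params.cache_policy = DefaultCachePolicy(/*is_constant_data=*/true);
  std::printf("cacheable: %d\n", lhs_params.cache_policy == CachePolicy::kCacheIfLargeSpeedup);
  return 0;
}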
+// +// The primary reason for this to exist is to allow Gemm to compile +// only uniform-quantized or only per-channel-quantized code paths. +// This is unneeded with ruy as the back-end, as this is only a runtime +// difference in ruy, but with gemmlowp these really are separate code +// paths and templatizing in a QuantizationFlavor is necessary to avoid +// compiling unused gemmlowp code. Indeed, TFLite currently uses +// uint8 with uniform quantization and int8 with per-channel quantization, +// and does not use uint8 with per-channel. We want to avoid compiling +// the gemmlowp uint8 per-channel path when gemmlowp is the back-end. +// +// It's possible to drop this in the future if gemmlowp goes away and no +// other then-relevant backend library handles quantized paths in a way that +// requires knowing this at compile-time. +enum class QuantizationFlavor +{ + // Floating-point Gemm: the accumulators are not multiplied by any + // 'multiplier'. + kFloatingPoint, + // Quantized Gemm using a single multiplier for all accumulators. + kIntegerWithUniformMultiplier, + // Quantized Gemm using a separate multipliers for accumulators of each + // row of the destination matrix. This is what is called 'per-channel' + // in GemmParams. Here we use the more specific 'per-row' terminology + // to allow for the possibility of 'per-column' in the future, and to + // allow for that to be a separate code path in some back-end such as + // gemmlowp. + kIntegerWithPerRowMultiplier +}; + +// Additional parameters that Gemm needs, beyond what falls into +// the MatrixParams that it takes. Compare to ruy::Spec. +// +// Decoupling AccumScalar from DstScalar (rather than deducing it from that) +// is useful future-proofing. Think of a float16 path using float32 accum. +// +// QuantizationFlavor is passed here even though it's technically not used +// in this class. This is so that we retain the ability in the future to +// specialize this class for quantization flavor, and this allows for +// Gemm to be templatized in quantization_flavor via the GemmParams that it +// takes, allowing for automatic template parameter deduction to take place, +// so that most call sites don't need to specify a QuantizationFlavor +// (only those that need perchannel quantization do). +template ::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> +struct GemmParams +{ + // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) + // of the multiplier by which accumulators are multiplied before being casted + // to the destination type. + AccumScalar multiplier_fixedpoint = 0; + // Only for non-floating-point cases. The exponent part of the aforementioned + // multiplier. + int multiplier_exponent = 0; + // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_fixedpoint. + const AccumScalar *multiplier_fixedpoint_perchannel = nullptr; + // Per-channel variant of multiplier_exponent. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_exponent. + // + // Either none or both of multiplier_exponent_perchannel and + // multiplier_fixedpoint_perchannel must be nullptr. 
+  const int *multiplier_exponent_perchannel = nullptr;
+  // The bias vector data, if not null.
+  const AccumScalar *bias = nullptr;
+  // min clamp bound of destination values.
+  DstScalar clamp_min = std::is_floating_point<DstScalar>::value
+                          ? -std::numeric_limits<DstScalar>::infinity()
+                          : std::numeric_limits<DstScalar>::lowest();
+  // max clamp bound of destination values.
+  DstScalar clamp_max = std::is_floating_point<DstScalar>::value
+                          ? std::numeric_limits<DstScalar>::infinity()
+                          : std::numeric_limits<DstScalar>::max();
+};
+
+// Validates self-consistency of GemmParams.
+template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor>
+void ValidateGemmParams(const GemmParams<AccumScalar, DstScalar, quantization_flavor> &params)
+{
+  // Guard consistency of the quantized multiplier fields.
+  if (quantization_flavor == QuantizationFlavor::kFloatingPoint)
+  {
+    assert(!params.multiplier_fixedpoint);
+    assert(!params.multiplier_exponent);
+    assert(!params.multiplier_fixedpoint_perchannel);
+    assert(!params.multiplier_exponent_perchannel);
+  }
+  else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier &&
+           !std::is_same<DstScalar, int32_t>::value)
+  {
+    assert(params.multiplier_fixedpoint);
+    // Nothing to check about multiplier_exponent
+    assert(!params.multiplier_fixedpoint_perchannel);
+    assert(!params.multiplier_exponent_perchannel);
+  }
+  else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier &&
+           !std::is_same<DstScalar, int32_t>::value)
+  {
+    assert(!params.multiplier_fixedpoint);
+    assert(!params.multiplier_exponent);
+    assert(params.multiplier_fixedpoint_perchannel);
+    assert(params.multiplier_exponent_perchannel);
+  }
+  else
+  {
+    // For the get raw accumulator case, we should make sure none of the
+    // quantization params are set.
+    assert(!params.multiplier_fixedpoint);
+    assert(!params.multiplier_exponent);
+    assert(!params.multiplier_fixedpoint_perchannel);
+    assert(!params.multiplier_exponent_perchannel);
+  }
+  UNUSED_RELEASE(params);
+}
+
+inline CachePolicy DefaultCachePolicy(bool is_constant_data)
+{
+  return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache;
+}
+
+} // namespace ruy
+} // namespace nnfw
+
+#endif // __NNFW_RUY_TYPES_H__
diff --git a/compute/ruy/include/ruy/Utils.h b/compute/ruy/include/ruy/Utils.h
new file mode 100644
index 0000000..50205ab
--- /dev/null
+++ b/compute/ruy/include/ruy/Utils.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
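For orientation, a minimal usage sketch for the parameter structs defined in Types.h above; the shapes, the fused-ReLU clamp, and the helper function name are illustrative assumptions, not code from this patch:

```cpp
#include <limits>
#include "ruy/Types.h"

// Fill MatrixParams/GemmParams for a float GEMM with constant weights.
inline void ExampleFloatGemmParams()
{
  nnfw::ruy::MatrixParams<float> lhs;
  lhs.order = nnfw::ruy::Order::kRowMajor;
  lhs.rows = 16; // e.g. output channels
  lhs.cols = 64; // e.g. accumulation depth
  lhs.cache_policy = nnfw::ruy::DefaultCachePolicy(/*is_constant_data=*/true);

  nnfw::ruy::GemmParams<float, float> gemm;
  gemm.clamp_min = 0.0f; // fused ReLU lower bound
  gemm.clamp_max = std::numeric_limits<float>::infinity();
  // Float flavor: no quantized multipliers may be set.
  nnfw::ruy::ValidateGemmParams(gemm);
}
```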
+ */ + +#ifndef __NNFW_RUY_UTILS_H__ +#define __NNFW_RUY_UTILS_H__ + +#include "Types.h" +#include "Shape.h" + +#include + +namespace nnfw +{ +namespace ruy +{ +template +inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, int b, int kheight, + int kwidth, int stride_width, int stride_height, + int pad_width, int pad_height, int in_width, int in_height, + int in_depth, int single_buffer_length, int buffer_id, + const T *in_data, T *conv_buffer_data, uint8_t zero_byte) +{ + assert(input_shape.DimensionsCount() == 4); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. + const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) + { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. 
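+  // For a patch that hangs over an edge, the column written below looks like:
+  //   [ top_padding rows of zeroes ]
+  //   [ left_padding*in_depth zeroes | copied input pixels | right_padding*in_depth zeroes ]
+  //   [ bottom_padding rows of zeroes ]
+  // where every row of the column holds kwidth * in_depth values.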
+ if ((left_padding == 0) && (right_padding == 0)) + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + else + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + if (left_padding > 0) + { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + if (right_padding > 0) + { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) + { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); + } +} + +// Supports per-batch zero_byte for per-batch asymmetric quantized inputs. +template +void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const Shape &output_shape, T *im2col_data, + const int32_t *zero_bytes, const int zero_bytes_len) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // For dilated convolution, the input pixels are not contiguous therefore we + // can't use the same optimizations as Im2Col(). Though note this code would + // work fine for the non-dilated case too (though likely a bit slower). + assert(dilation_width_factor != 1 || dilation_height_factor != 1); + assert(im2col_data); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 0); + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const Shape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const Shape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Loop through the output rows (B x H x W) + for (int batch = 0; batch < batches; ++batch) + { + const T zero_byte = + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        // Each im2col row is an output pixel. Arrange the input data in this
+        // row in an order we can conveniently multiply with the filter data.
+        int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        // Loop through all the pixels of the filter (Kh x Kw)
+        for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+        {
+          const int in_y = in_y_origin + dilation_height_factor * filter_y;
+          if ((in_y >= 0) && (in_y < input_height))
+          {
+            // Filter row is within the input data.
+            // Loop through all the filter pixels in this row.
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
+              T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
+              if ((in_x >= 0) && (in_x < input_width))
+              {
+                // Filter pixel is within the input, copy the input data.
+                T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0);
+                memcpy(dst, src, input_depth * sizeof(T));
+              }
+              else
+              {
+                // Filter pixel is outside the input, zero it out.
+                memset(dst, zero_byte, input_depth * sizeof(T));
+              }
+            }
+          }
+          else
+          {
+            // Filter row is outside the input, zero out the entire filter row.
+            int col_offset = Offset(col_shape, 0, filter_y, 0, 0);
+            T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
+            memset(dst, zero_byte, filter_width * input_depth * sizeof(T));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void DilatedIm2col(const ConvParams &params, uint8_t zero_byte, const Shape &input_shape,
+                   const T *input_data, const Shape &filter_shape, const Shape &output_shape,
+                   T *im2col_data)
+{
+  const int32_t zero_point = static_cast<int32_t>(zero_byte);
+  DilatedIm2col(params, input_shape, input_data, filter_shape, output_shape, im2col_data,
+                &zero_point, 1);
+}
+
+template <typename T>
+void Im2col(const ConvParams &params, int kheight, int kwidth, uint8_t zero_byte,
+            const Shape &input_shape, const T *input_data, const Shape &output_shape,
+            T *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int output_depth = output_shape.Dims(3);
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+
+  int buffer_id = 0;
+  // Loop over the output nodes.
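+  // The loop below writes one column per output position, so the buffer must
+  // hold batches * output_height * output_width columns of
+  // kheight * kwidth * input_depth (== output_depth here) elements each.
+  // For example, a 1x112x112 output with a 3x3 kernel over 32 input channels
+  // needs 1 * 112 * 112 * 3 * 3 * 32 elements (about 14.5 MB as float).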
+ for (int b = 0; b < batches; ++b) + { + for (int h = 0; h < output_height; ++h) + { + for (int w = 0; w < output_width; ++w) + { + ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width, + stride_height, pad_width, pad_height, input_width, + input_height, input_depth, output_depth, buffer_id, input_data, + output_data, zero_byte); + ++buffer_id; + } + } + } +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_UTILS_H__ diff --git a/compute/ruy/include/ruy/neon/neon_check.h b/compute/ruy/include/ruy/neon/neon_check.h new file mode 100644 index 0000000..08394f2 --- /dev/null +++ b/compute/ruy/include/ruy/neon/neon_check.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_CHECK_H__ +#define __NNFW_RUY_NEON_CHECK_H__ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include +#endif + +// Disable X86_NEON +// #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON +#if 0 +#define USE_NEON +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wsequence-point" +#include "NEON_2_SSE.h" +#pragma GCC diagnostic pop +#endif + +// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is +// defined, PortableSomeFunc(args) otherwise. +#ifdef USE_NEON +// Always use Neon code +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) + +#else +// No NEON available: Use Portable code +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) + +#endif // defined(USE_NEON) + +#endif // __NNFW_RUY_NEON_CHECK_H__ diff --git a/compute/ruy/include/ruy/operation/Conv.h b/compute/ruy/include/ruy/operation/Conv.h new file mode 100644 index 0000000..2b9c8c3 --- /dev/null +++ b/compute/ruy/include/ruy/operation/Conv.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
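As a usage sketch for the NEON_OR_PORTABLE macro from neon_check.h above; the Scale kernels are hypothetical stand-ins for a real portable/NEON implementation pair, not functions added by this patch:

```cpp
#include <cstddef>
#include "ruy/neon/neon_check.h"

// Hypothetical portable and NEON variants of one kernel.
inline void PortableScale(const float *in, float *out, std::size_t n, float s)
{
  for (std::size_t i = 0; i < n; ++i)
    out[i] = in[i] * s;
}

inline void NeonScale(const float *in, float *out, std::size_t n, float s)
{
  // A real version would use NEON intrinsics; reuse the portable loop so the
  // sketch stays self-contained.
  PortableScale(in, out, n, s);
}

inline void Scale(const float *in, float *out, std::size_t n, float s)
{
  // Expands to NeonScale(...) when USE_NEON is defined, PortableScale(...) otherwise.
  NEON_OR_PORTABLE(Scale, in, out, n, s);
}
```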
+ */
+
+#ifndef __NNFW_RUY_CONV_H__
+#define __NNFW_RUY_CONV_H__
+
+#include "ruy/Types.h"
+#include "ruy/Shape.h"
+#include "ruy/Utils.h"
+#include "ruy/RuySupport.h"
+
+#include
+#include
+#include
+#include
+
+namespace nnfw
+{
+namespace ruy
+{
+
+class Conv
+{
+public:
+  Conv() : _im2col_shape(4), _need_im2col(false), _prepared(false) {}
+
+  void prepare(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape,
+               uint32_t stride_width, uint32_t stride_height, uint32_t dilation_width_factor,
+               uint32_t dilation_height_factor)
+  {
+    if (!_prepared)
+    {
+      IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height,
+                       dilation_width_factor, dilation_height_factor);
+      _prepared = true;
+    }
+  }
+
+  void operator()(const ConvParams &params, const Shape &input_shape, const float *input_data,
+                  const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+                  const float *bias_data, const Shape &output_shape, float *output_data,
+                  ::ruy::Context *ruy_context)
+  {
+    if (!_prepared)
+    {
+      // This means that input or output are dynamic or filter is not constant
+      IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width,
+                       params.stride_height, params.dilation_width_factor,
+                       params.dilation_height_factor);
+      _prepared = true;
+    }
+
+    int im2col_size = _need_im2col ? _im2col_shape.FlatSize() : 0;
+
+    // Use heap if size is larger than 8MB
+    if (im2col_size > 2 * 1024 * 1024)
+    {
+      std::unique_ptr<float[]> im2col_data = std::make_unique<float[]>(im2col_size);
+      ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+                output_shape, output_data, _im2col_shape, im2col_data.get(), ruy_context);
+    }
+    else if (im2col_size > 0)
+    {
+      float im2col_data[im2col_size];
+      ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+                output_shape, output_data, _im2col_shape, im2col_data, ruy_context);
+    }
+    else
+    {
+      ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+                output_shape, output_data, _im2col_shape, nullptr, ruy_context);
+    }
+  }
+
+private:
+  void ConvFloat(const ConvParams &params, const Shape &input_shape, const float *input_data,
+                 const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+                 const float *bias_data, const Shape &output_shape, float *output_data,
+                 const Shape &im2col_shape, float *im2col_data, ::ruy::Context *ruy_context)
+  {
+    UNUSED_RELEASE(bias_shape);
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const float output_activation_min = params.float_activation_min;
+    const float output_activation_max = params.float_activation_max;
+    assert(input_shape.DimensionsCount() == 4);
+    assert(filter_shape.DimensionsCount() == 4);
+    assert(output_shape.DimensionsCount() == 4);
+
+    // NB: the float 0.0f value is represented by all zero bytes.
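+    // In GEMM terms, the code below computes
+    //   output[n x m] = filter[n x k] * im2col[k x m]
+    // with n = output channels, k = filter_height * filter_width * input_depth,
+    // and m = batch * output_height * output_width.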
+    const uint8_t float_zero_byte = 0x00;
+    const float *gemm_input_data = nullptr;
+    const Shape *gemm_input_shape = nullptr;
+    const int filter_width = filter_shape.Dims(2);
+    const int filter_height = filter_shape.Dims(1);
+    const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+    const bool need_im2col =
+      stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
+    if (need_dilated_im2col)
+    {
+      DilatedIm2col(params, float_zero_byte, input_shape, input_data, filter_shape, output_shape,
+                    im2col_data);
+      gemm_input_data = im2col_data;
+      gemm_input_shape = &im2col_shape;
+    }
+    else if (need_im2col)
+    {
+      assert(im2col_data);
+      Im2col(params, filter_height, filter_width, float_zero_byte, input_shape, input_data,
+             im2col_shape, im2col_data);
+      gemm_input_data = im2col_data;
+      gemm_input_shape = &im2col_shape;
+    }
+    else
+    {
+      // TODO(aselle): We need to make sure to not send im2col if it is not
+      // needed.
+      assert(!im2col_data);
+      gemm_input_data = input_data;
+      gemm_input_shape = &input_shape;
+    }
+
+    const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+    int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+    int n = output_shape.Dims(3);
+    int k = gemm_input_shape->Dims(gemm_input_dims - 1);
+
+    // When an optimized CBLAS implementation is not available, fall back
+    // to using cpu_backend_gemm.
+    MatrixParams<float> lhs_params;
+    lhs_params.order = Order::kRowMajor;
+    lhs_params.rows = n;
+    lhs_params.cols = k;
+    MatrixParams<float> rhs_params;
+    rhs_params.order = Order::kColMajor;
+    rhs_params.rows = k;
+    rhs_params.cols = m;
+    MatrixParams<float> dst_params;
+    dst_params.order = Order::kColMajor;
+    dst_params.rows = n;
+    dst_params.cols = m;
+    GemmParams<float, float> gemm_params;
+    gemm_params.bias = bias_data;
+    gemm_params.clamp_min = output_activation_min;
+    gemm_params.clamp_max = output_activation_max;
+
+    // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy
+    ::ruy::Matrix<float> ruy_lhs;
+    ::ruy::Matrix<float> ruy_rhs;
+    ::ruy::Matrix<float> ruy_dst;
+    // Note that cache is always enabled for input and weight tensors
+    ruy_support::MakeRuyMatrix(lhs_params, filter_data, &ruy_lhs, true);
+    ruy_support::MakeRuyMatrix(rhs_params, gemm_input_data, &ruy_rhs, true);
+    ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst);
+
+    ::ruy::BasicSpec<float, float> ruy_mul_params;
+    ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);
+
+    ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
+  }
+
+  void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape,
+                        const Shape &output_shape, uint32_t stride_width, uint32_t stride_height,
+                        uint32_t dilation_width_factor, uint32_t dilation_height_factor)
+  {
+    const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+    const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 ||
+                                         kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1;
+
+    _need_im2col = need_dilated_im2col || need_non_dilated_im2col;
+
+    if (_need_im2col)
+    {
+      _im2col_shape.SetDim(0, output_shape.Dims(0));
+      _im2col_shape.SetDim(1, output_shape.Dims(1));
+      _im2col_shape.SetDim(2, output_shape.Dims(2));
+      _im2col_shape.SetDim(3, input_shape.Dims(3) * kernel_shape.Dims(1) * kernel_shape.Dims(2));
+    }
+  }
+
+private:
+  Shape _im2col_shape;
+  bool _need_im2col;
+  bool _prepared;
+};
+} // namespace ruy
+} // namespace nnfw
+
+#endif // __NNFW_RUY_CONV_H__
diff --git a/compute/ruy/include/ruy/operation/FullyConnected.h
b/compute/ruy/include/ruy/operation/FullyConnected.h new file mode 100644 index 0000000..59facdb --- /dev/null +++ b/compute/ruy/include/ruy/operation/FullyConnected.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_FULLY_CONNECTED_H__ +#define __NNFW_RUY_FULLY_CONNECTED_H__ + +#include "ruy/Shape.h" +#include "ruy/Types.h" +#include "ruy/Utils.h" +#include "ruy/RuySupport.h" + +#include +#include + +namespace nnfw +{ +namespace ruy +{ + +inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *optional_bias_data, const Shape &output_shape, + float *output_data, ::ruy::Context *ruy_context) +{ + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); + MatrixParams rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + rhs_params.cache_policy = DefaultCachePolicy(params.rhs_cacheable); + assert(input_shape.FlatSize() == (rhs_params.rows * rhs_params.cols)); + MatrixParams lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + lhs_params.cache_policy = DefaultCachePolicy(params.lhs_cacheable); + MatrixParams dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + GemmParams gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + + // Below code was copied from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ::ruy::Matrix ruy_lhs; + ::ruy::Matrix ruy_rhs; + ::ruy::Matrix ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, weights_data, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input_data, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst); + + ::ruy::BasicSpec ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_FULLY_CONNECTED_H__ diff --git a/compute/test/cker/Range.cc b/compute/test/cker/Range.cc index 55f4fcf..e5fe480 100644 --- a/compute/test/cker/Range.cc +++ b/compute/test/cker/Range.cc @@ -48,9 +48,7 @@ TEST(CKer_Operation, Range) const float start = 3; const float limit = 1; const float delta = -0.5; - std::vector expected = { - 3, 2.5, 2, 1.5, 
- }; + std::vector expected = {3, 2.5, 2, 1.5}; std::vector actual(expected.size()); nnfw::cker::Range(&start, &limit, &delta, actual.data()); diff --git a/docs/conf.py b/docs/conf.py index 1185bcf..68b7d06 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ copyright = '2020, Samsung Research & contributors' author = 'Samsung Research & contributors' # The full version, including alpha/beta/rc tags -release = '1.11.1' +release = '1.12.0' # -- General configuration --------------------------------------------------- diff --git a/docs/howto/how-to-add-a-new-operation.md b/docs/howto/how-to-add-a-new-operation.md index 8ea7014..241ba6c 100644 --- a/docs/howto/how-to-add-a-new-operation.md +++ b/docs/howto/how-to-add-a-new-operation.md @@ -6,4 +6,4 @@ ## Runtime -- [How to introduce a new operatoin into runtime](how-to-introduce-a-new-operation-into-runtime.md) +- [How to introduce a new operation into runtime](how-to-introduce-a-new-operation-into-runtime.md) diff --git a/docs/howto/how-to-introduce-a-new-operation-into-runtime.md b/docs/howto/how-to-introduce-a-new-operation-into-runtime.md index f8fc020..9ab4987 100644 --- a/docs/howto/how-to-introduce-a-new-operation-into-runtime.md +++ b/docs/howto/how-to-introduce-a-new-operation-into-runtime.md @@ -24,7 +24,6 @@ onert support the operation. - [acl_cl](#acl_cl-1) - [acl_neon](#acl_neon-1) - [cpu](#cpu-1) - - [TensorRegister (in some cases)](#tensorregister-in-some-cases) - [ConstantInitializer (in some cases)](#constantinitializer-in-some-cases) - [cpu](#cpu-2) - [Samples (to be updated)](#samples-to-be-updated) @@ -420,51 +419,28 @@ void visit(const ir::operation::Select &) override; ```cpp void KernelGenerator::visit(const ir::operation::Select &node) { - const auto output_index{node.getOutputs().at(ir::operation::Select::Output::OUTPUT)}; - const auto cond_index{node.getInputs().at(ir::operation::Select::Input::COND)}; - const auto input1_index{node.getInputs().at(ir::operation::Select::Input::INPUT1)}; - const auto input2_index{node.getInputs().at(ir::operation::Select::Input::INPUT2)}; - - const auto output_backend_descr = ::onert::backend::cpu::kernel::getTensorDescriptor( - _ctx.at(output_index), _current_op_seq_layout); - const auto cond_backend_descr = ::onert::backend::cpu::kernel::getTensorDescriptor( - _ctx.at(cond_index), _current_op_seq_layout); - const auto input1_backend_descr = ::onert::backend::cpu::kernel::getTensorDescriptor( - _ctx.at(input1_index), _current_op_seq_layout); - const auto input2_backend_descr = ::onert::backend::cpu::kernel::getTensorDescriptor( - _ctx.at(input2_index), _current_op_seq_layout); + const auto output_index{node.getOutputs().at(0)}; + const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)}; + const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; + const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto cond_alloc = _tensor_builder->at(cond_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); - auto input2_alloc = _tensor_builder->at(input2_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto condition_tensor = _tensor_reg->getPortableTensor(condition_index); + auto true_tensor = _tensor_reg->getPortableTensor(true_index); + auto false_tensor = _tensor_reg->getPortableTensor(false_index); - auto fn = 
std::make_unique<::onert::backend::cpu::kernel::SelectLayer>(); + auto fn = std::make_unique(); - fn->configure(cond_alloc->buffer(), cond_backend_descr, input1_alloc->buffer(), - input1_backend_descr, input2_alloc->buffer(), input2_backend_descr, - output_alloc->buffer(), output_backend_descr); + fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); - _execution_builder->append(std::move(fn)); + _return_fn = std::move(fn); } ``` -### TensorRegister (in some cases) - -This component registers tensors. Most tensors will be automatically registered internally. There -are some exceptions, however, where additional implementations are required. It is the case when a -tensor is treated unusually in its backend. - -The kernel of some operation has weights in `HWIO` as layout(data format) in case of that input's -layout is `NHWC`. And, for `NCHW`, weights is `OIHW`. But TFLite model has weigths, `OHWI` for -`NHWC` and `OIHW` for `NCHW`. Therefore, to register the appropriate tensor on the backend, you have -to implement it additionally. - ### ConstantInitializer (in some cases) This component registers function initializing constant tensors and initialize constant tensor -layer. This is similar to TensorRegister. Most tensors will be automatically registered internally. -And there are some exceptions. +layer. Most tensors will be automatically registered internally. And there are some exceptions. #### cpu diff --git a/docs/howto/how-to-use-specific-backend.md b/docs/howto/how-to-use-specific-backend.md new file mode 100644 index 0000000..32e1b83 --- /dev/null +++ b/docs/howto/how-to-use-specific-backend.md @@ -0,0 +1,40 @@ +# How to Use Specific Backend during Inference + +ONE runtime has many ways to use specific backend during inference + +## Using NNFW API + +### [nnfw_set_available_backends](https://github.com/Samsung/ONE/blob/c46ddc04abdb58323fbd38389e6927f003bfaea1/runtime/onert/api/include/nnfw.h#L458) +- Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu"). +- For each backend string, `libbackend_{backend}.so` will be dynamically loaded during nnfw_prepare. +- Among the multiple backends, the 1st element is used as the default backend. + +### [nnfw_set_op_backend](https://github.com/Samsung/ONE/blob/c46ddc04abdb58323fbd38389e6927f003bfaea1/runtime/onert/api/include/nnfw.h#L476) +- The backend for op has higher priority than available backends specified by nnfw_set_available_backends. + +## Using Environment Variable + +### 1. BACKENDS +- Same as `nnfw_set_available_backends` +- Example +```bash +BACKENDS=cpu ./Product/out/bin/nnpackage_run ... +``` + +### 2. OP_BACKEND_[OP_TYPE] +- Same as `nnfw_set_op_backend` +- Set backend for specific operator type +- Example + - Execute `Conv2D` operator on ruy backend and others on cpu backend +```bash +OP_BACKEND_Conv2D=ruy BACKENDS="cpu;ruy" ./Product/out/bin/nnpackage_run ... +``` + +### 3. OP_BACKEND_MAP +- Set backend for specific operator by its index +- Format : `=;=...` +- Example + - Execute `operator 10` on `acl_cl` backend and others on `acl_neon` backend +```bash +OP_BACKEND_MAP="10=acl_cl" BACKENDS="acl_neon;acl_cl" ./Product/out/bin/nnpackage_run ... 
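+# The same selection can also be made programmatically with the NNFW API
+# functions linked above (arguments shown are illustrative):
+#   nnfw_set_available_backends(session, "acl_neon;acl_cl");
+#   nnfw_set_op_backend(session, "Conv2D", "acl_cl");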
+``` diff --git a/docs/howto/index.rst b/docs/howto/index.rst index c84902a..faeedbf 100644 --- a/docs/howto/index.rst +++ b/docs/howto/index.rst @@ -10,19 +10,22 @@ How To :maxdepth: 2 :caption: Contents: - ./how-to-add-a-new-operation.md ./how-to-build-compiler.md ./how-to-build-package.md ./how-to-build-runtime.md ./how-to-build-runtime-tizen-gbs-rpi4.md ./how-to-build-runtime-using-prebuilt-docker-image.md - ./how-to-cross-build-runtime-for-arm.md ./how-to-cross-build-runtime-for-aarch64.md ./how-to-cross-build-runtime-for-android.md - ./how-to-contribute.md - ./how-to-make-an-application-with-runtime.md - ./how-to-remote-debugging-with-visual-studio-code.md + ./how-to-cross-build-runtime-for-arm.md ./how-to-run-package.md + ./how-to-make-an-application-with-runtime.md ./how-to-use-api.md - ./how-to-use-nnfw-api.md ./how-to-use-nnapi-binding.md + ./how-to-use-nnfw-api.md + ./how-to-use-specific-backend.md + ./how-to-contribute.md + ./how-to-remote-debugging-with-visual-studio-code.md + ./how-to-add-a-new-operation.md + ./how-to-introduce-a-new-operation-into-compiler.md + ./how-to-introduce-a-new-operation-into-runtime.md diff --git a/docs/release/1.10/index.rst b/docs/release/1.10/index.rst new file mode 100644 index 0000000..bc415fb --- /dev/null +++ b/docs/release/1.10/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.10.0.md diff --git a/docs/release/1.11/index.rst b/docs/release/1.11/index.rst new file mode 100644 index 0000000..2e4544a --- /dev/null +++ b/docs/release/1.11/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.11.0.md diff --git a/docs/release/1.11/release-note-1.11.1.md b/docs/release/1.11/release-note-1.11.1.md deleted file mode 100644 index 9efedf6..0000000 --- a/docs/release/1.11/release-note-1.11.1.md +++ /dev/null @@ -1,7 +0,0 @@ -# Release Note 1.11.1 - -## ONE Runtime - -### Hot Fixes - -- Fix segfault due to the wrong BCQGather DynamicShapeInferer's behavior diff --git a/docs/release/1.12/index.rst b/docs/release/1.12/index.rst new file mode 100644 index 0000000..68b4c73 --- /dev/null +++ b/docs/release/1.12/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.12.0.md diff --git a/docs/release/1.12/release-note-1.12.0.md b/docs/release/1.12/release-note-1.12.0.md new file mode 100644 index 0000000..1f13bc4 --- /dev/null +++ b/docs/release/1.12/release-note-1.12.0.md @@ -0,0 +1,28 @@ +# Release Note 1.12.0 + +## ONE Compiler + +### Compiler Frontend + +- Add optimization pass: ReplaceMulAddWithDepthwiseConvPass, SubstitutePackToReshape, RemoveRedundantTranspose, ShuffleWeightTo16x1Float32Pass +- Add quantization for InstanceNorm. +- Fix bug of `one-import-bcq` command for `--v1`, `--v2` arguments. 
+- Fix FuseBCQPass to work with inter-subgraphs in the model file and minor BCQ related optimizations. + +## ONE Runtime + +### Runtime backend operation supports more operations and types + +- CPU backend + - Concat: int8 + - DepthToSpace: float, uint8, int8 + - LeakyRelu: float +- ACL-CL backend + - ArgMin: float, uint8, int8 +- ACL-NEON backend + - ArgMax: int8 + - ArgMin: float, uint8, int8 + +### nnpackage defines configuration file + +- Allow users to set configuration variable via conf file. For more information, See [nnpackage spec](../../../nnpackage/spec) diff --git a/docs/release/1.5/index.rst b/docs/release/1.5/index.rst new file mode 100644 index 0000000..0764bf2 --- /dev/null +++ b/docs/release/1.5/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.5.0.md diff --git a/docs/release/1.6/index.rst b/docs/release/1.6/index.rst new file mode 100644 index 0000000..79389cf --- /dev/null +++ b/docs/release/1.6/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.6.0.md diff --git a/docs/release/1.7/index.rst b/docs/release/1.7/index.rst new file mode 100644 index 0000000..65a839f --- /dev/null +++ b/docs/release/1.7/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.7.0.md diff --git a/docs/release/1.7/release-note-1.7.0.md b/docs/release/1.7/release-note-1.7.0.md new file mode 100644 index 0000000..c1a4f50 --- /dev/null +++ b/docs/release/1.7/release-note-1.7.0.md @@ -0,0 +1,46 @@ +## Feature Highlights + +- **ONE** Compiler + - Compiler supports more operations + - New command line interface for user interface consistancy +- **ONE** Runtime + - Runtime CPU backend supports more operations + - Runtime CPU backend supports more quant8 operations + - API changes + - New optimization + +## ONE Compiler + +### Compiler supports more operations + +- MatrixDiag, MatrixSetDiag, ReverseSequence, ReverseV2, SegmentSum, SelectV2, SparseToDense, Where + +### New command line interface for user interface consistancy + +- one-import: imports conventional model files to circle + - one-import-tf: imports TensorFlow model to circle + - one-import-tflite: imports TensorFlow lite model to circle +- one-optimize: circle optimize command +- one-quantize: circle quantize command + - supports float32 to uint8, layer wise (for Conv series) +- one-pack: package command +- one-prepare-venv: prepares python virtual environment for importing TensorFlow model +- one-codegen: backend(if available) code generator + +## ONE Runtime + +### Runtime CPU backend supports more operations + +- LogSoftmax, SpaceToBatchND + +### Runtime CPU backend supports more quant8 operations + +- Logistic, Mul, Tanh, SpaceToBatchND, Transpose, Sub, Max, Min, Less, Greater, GreaterEqual, LessEqual, Equal, NotEqual + +### API changes + +- Introduce basic asynchronous execution API + +### New optimization + +- Remove dynamic tensor overhead from static models diff --git a/docs/release/1.8/index.rst b/docs/release/1.8/index.rst new file mode 100644 index 0000000..4dc1d5b --- /dev/null +++ b/docs/release/1.8/index.rst @@ -0,0 +1,13 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.8.0.md diff --git a/docs/release/1.9/index.rst b/docs/release/1.9/index.rst new file mode 100644 index 0000000..d77012c --- /dev/null +++ b/docs/release/1.9/index.rst @@ -0,0 +1,14 @@ +.. ONE documentation master file, created by + sphinx-quickstart on Thu May 14 18:13:12 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +1.0 +=== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + ./release-note-1.9.0.md + ./release-note-1.9.1.md diff --git a/docs/release/index.rst b/docs/release/index.rst index bb542bc..1a5a780 100644 --- a/docs/release/index.rst +++ b/docs/release/index.rst @@ -15,3 +15,11 @@ Release ./1.2/index ./1.3/index ./1.4/index + ./1.5/index + ./1.6/index + ./1.7/index + ./1.8/index + ./1.9/index + ./1.10/index + ./1.11/index + ./1.12/index diff --git a/docs/runtime/index.rst b/docs/runtime/index.rst index d44f822..e80dfc8 100644 --- a/docs/runtime/index.rst +++ b/docs/runtime/index.rst @@ -12,8 +12,9 @@ Runtime ./api.md ./core.md - ./compute.md + ./controlflow-operations.md ./executors.md - ./backend-api.md ./heterogeneous-execution.md - ./controlflow-operations.md + ./backend-api.md + ./compute.md + ./supported-operations-backend.md diff --git a/docs/runtime/supported-operations-backend.md b/docs/runtime/supported-operations-backend.md index bcc6355..04ece97 100644 --- a/docs/runtime/supported-operations-backend.md +++ b/docs/runtime/supported-operations-backend.md @@ -1,6 +1,6 @@ # Supported Operations and backend -As of 2020-11-10 +As of 2020-12-07 ### Raw-data format (float32, int32, boolean, etc) @@ -10,7 +10,7 @@ Abs | O | O | O Add | O | O | O AddN | O | | ArgMax | O | O | O -ArgMin | O | | +ArgMin | O | O | O AvgPool2D | O | O | O BatchMatmul | O | | BatchToSpaceND | O | O | O @@ -19,7 +19,7 @@ Concat | O | O | O Conv2D | O | O | O Cos | O | | Custom | O | | -DepthToSpace | | O | O +DepthToSpace | O | O | O DepthwiseConv2D | O | O | O Div | O | O | O EmbeddingLookup | | O | O @@ -37,7 +37,7 @@ If | O | | InstanceNormalize | | O | O L2Normalization | O | O | O L2Pool | | O | O -LeakyRelu | | O | O +LeakyRelu | O | O | O Less | O | O | O LessEqual | O | O | O LocalResponseNormalize | | O | O @@ -89,6 +89,7 @@ SpaceToDepth | O | O | O Split | O | O | O SplitV | O | | Sqrt | O | O | O +Square | O | | | SquaredDifference | O | O | O Squeeze | O | O | O StridedSlice | O | O | O @@ -110,14 +111,14 @@ Operation | CPU | ACL-CL | ACL-NEON -- | -- | -- | -- Add | O | O | O ArgMax | O | O | O -ArgMin | O | | +ArgMin | O | O | O AvgPool2D | O | O | O BatchToSpaceND | O | O | O Cast | O | O | Concat | O | O | O Conv2D | O | O | O Custom | O | | -DepthToSpace | | O | O +DepthToSpace | O | O | O DepthwiseConv2D | O | O | O Dequantize | O | O | O EmbeddingLookup | | O | O @@ -170,6 +171,12 @@ Unpack(Unstack) | | O | O ### Quantization format (int8) +Operation | CPU | ACL-CL | ACL-NEON +-- | -- | -- | -- +ArgMax | O | O | O +ArgMin | O | O | O +Concat | O | | +DepthToSpace | O | | Dequantize | O | | Rank | O | | Shape | O | | diff --git a/infra/cmake/packages/Fp16SourceConfig.cmake b/infra/cmake/packages/Fp16SourceConfig.cmake new file mode 100644 index 0000000..3623fd2 --- /dev/null +++ b/infra/cmake/packages/Fp16SourceConfig.cmake @@ -0,0 +1,21 @@ +function(_Fp16Source_import) + if(NOT ${DOWNLOAD_FP16}) + set(Fp16Source_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT ${DOWNLOAD_FP16}) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + # fp16 commit in xnnpack 8b283aa30a31 + envoption(FP16_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FP16/archive/3c54eacb74f6f5e39077300c5564156c424d77ba.tar.gz) + ExternalSource_Download(FP16 + DIRNAME FP16 + URL ${FP16_URL}) + + set(Fp16Source_DIR ${FP16_SOURCE_DIR} PARENT_SCOPE) + set(Fp16Source_FOUND TRUE PARENT_SCOPE) +endfunction(_Fp16Source_import) + +_Fp16Source_import() diff --git 
a/infra/cmake/packages/FxdivSourceConfig.cmake b/infra/cmake/packages/FxdivSourceConfig.cmake new file mode 100644 index 0000000..4427bf2 --- /dev/null +++ b/infra/cmake/packages/FxdivSourceConfig.cmake @@ -0,0 +1,21 @@ +function(_FxdivSource_import) + if(NOT ${DOWNLOAD_FXDIV}) + set(FxdivSource_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT ${DOWNLOAD_FXDIV}) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + # fxdiv commit in xnnpack 8b283aa30a31 + envoption(FXDIV_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FXdiv/archive/f8c5354679ec2597792bc70a9e06eff50c508b9a.tar.gz) + ExternalSource_Download(FXDIV + DIRNAME FXDIV + URL ${FXDIV_URL}) + + set(FxdivSource_DIR ${FXDIV_SOURCE_DIR} PARENT_SCOPE) + set(FxdivSource_FOUND TRUE PARENT_SCOPE) +endfunction(_FxdivSource_import) + +_FxdivSource_import() diff --git a/infra/cmake/packages/PsimdSourceConfig.cmake b/infra/cmake/packages/PsimdSourceConfig.cmake new file mode 100644 index 0000000..1da5cdc --- /dev/null +++ b/infra/cmake/packages/PsimdSourceConfig.cmake @@ -0,0 +1,21 @@ +function(_PsimdSource_import) + if(NOT ${DOWNLOAD_PSIMD}) + set(PsimdSource_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT ${DOWNLOAD_PSIMD}) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + # psimd commit in xnnpack 8b283aa30a31 + envoption(PSIMD_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.tar.gz) + ExternalSource_Download(PSIMD + DIRNAME PSIMD + URL ${PSIMD_URL}) + + set(PsimdSource_DIR ${PSIMD_SOURCE_DIR} PARENT_SCOPE) + set(PsimdSource_FOUND TRUE PARENT_SCOPE) +endfunction(_PsimdSource_import) + +_PsimdSource_import() diff --git a/infra/cmake/packages/PthreadpoolSourceConfig.cmake b/infra/cmake/packages/PthreadpoolSourceConfig.cmake new file mode 100644 index 0000000..4e1910a --- /dev/null +++ b/infra/cmake/packages/PthreadpoolSourceConfig.cmake @@ -0,0 +1,21 @@ +function(_PthreadpoolSource_import) + if(NOT ${DOWNLOAD_PTHREADPOOL}) + set(PthreadpoolSource_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT ${DOWNLOAD_PTHREADPOOL}) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + # pthreadpool commit in xnnpack 8b283aa30a31 + envoption(PTHREADPOOL_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/pthreadpool/archive/029c88620802e1361ccf41d1970bd5b07fd6b7bb.tar.gz) + ExternalSource_Download(PTHREADPOOL + DIRNAME PTHREADPOOL + URL ${PTHREADPOOL_URL}) + + set(PthreadpoolSource_DIR ${PTHREADPOOL_SOURCE_DIR} PARENT_SCOPE) + set(PthreadpoolSource_FOUND TRUE PARENT_SCOPE) +endfunction(_PthreadpoolSource_import) + +_PthreadpoolSource_import() diff --git a/infra/cmake/packages/XnnpackSourceConfig.cmake b/infra/cmake/packages/XnnpackSourceConfig.cmake new file mode 100644 index 0000000..36a9204 --- /dev/null +++ b/infra/cmake/packages/XnnpackSourceConfig.cmake @@ -0,0 +1,21 @@ +function(_XnnpackSource_import) + if(NOT ${DOWNLOAD_XNNPACK}) + set(XnnpackSource_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT ${DOWNLOAD_XNNPACK}) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + # xnnpack commit in tflite v2.3 + envoption(XNNPACK_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d.tar.gz) + ExternalSource_Download(XNNPACK + DIRNAME XNNPACK + URL ${XNNPACK_URL}) + + 
set(XnnpackSource_DIR ${XNNPACK_SOURCE_DIR} PARENT_SCOPE) + set(XnnpackSource_FOUND TRUE PARENT_SCOPE) +endfunction(_XnnpackSource_import) + +_XnnpackSource_import() diff --git a/infra/command/format b/infra/command/format index 7f37e06..c57e6dc 100644 --- a/infra/command/format +++ b/infra/command/format @@ -132,8 +132,6 @@ function check_cpp_files() { fi CLANG_FORMAT_CANDIDATES+=("clang-format-3.9") - CLANG_FORMAT_CANDIDATES+=("clang-format") - for CLANG_FORMAT_CANDIDATE in ${CLANG_FORMAT_CANDIDATES[@]}; do if command_exists ${CLANG_FORMAT_CANDIDATE} ; then CLANG_FORMAT="${CLANG_FORMAT_CANDIDATE}" @@ -142,14 +140,29 @@ function check_cpp_files() { done if [[ -z ${CLANG_FORMAT} ]]; then - echo "[ERROR] clang-format is unavailable" + echo "[ERROR] clang-format-3.9 is unavailable" echo - echo "Please install clang-format before running format check" + echo " Please install clang-format-3.9 before running format check" exit 1 fi + # Migration to clang-format-8 + # TODO Remove this after migration to clang-format-8 + CLANG_FORMAT_8="clang-format-8" + if ! command_exists $CLANG_FORMAT_8_CANDIDATE; then + echo "[ERROR] clang-format-8 is unavailable" + echo + echo " Please install clang-format-8 before running format check" + echo " (or use latest docker image if you are using docker for format check)" + exit 1 + fi + for DIR_CLANG_FORMAT_8 in $(git ls-files -co --exclude-standard '*/.clang-format'); do + DIRECTORIES_USE_CLANG_FORMAT_8+=($(dirname "${DIR_CLANG_FORMAT_8}")) + done + # Check c++ files FILES_TO_CHECK_CPP=() + FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8=() for f in ${FILES_TO_CHECK[@]}; do # Manually ignore style checking if [[ ${f} == +(*/NeuralNetworks.h|*/NeuralNetworksExtensions.h) ]]; then @@ -158,13 +171,28 @@ function check_cpp_files() { # File extension to check if [[ ${f} == +(*.h|*.hpp|*.cpp|*.cc|*.c|*.cl) ]]; then - FILES_TO_CHECK_CPP+=("${f}") + + # Check clang-format-8 target files first + # TODO Remove this after migration to clang-format-8 + FOUND_CLANG_8=0 + for USE_CLANG_FORMAT_8 in ${DIRECTORIES_USE_CLANG_FORMAT_8[@]}; do + if [[ $f = $USE_CLANG_FORMAT_8* ]]; then + FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8+=("$f") + FOUND_CLANG_8=1 + break + fi + done + + if [[ $FOUND_CLANG_8 -ne 1 ]]; then + FILES_TO_CHECK_CPP+=("${f}") + fi fi done # Skip by '.FORMATDENY' file for s in ${DIRECTORIES_NOT_TO_BE_TESTED[@]}; do FILES_TO_CHECK_CPP=(${FILES_TO_CHECK_CPP[*]/$s*/}) + FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8=(${FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8[*]/$s*/}) done if [[ ${#FILES_TO_CHECK_CPP} -ne 0 ]]; then @@ -174,6 +202,16 @@ function check_cpp_files() { INVALID_EXIT=${EXIT_CODE} fi fi + + # Check by clang-format-8 + # TODO Remove this after migration to clang-format-8 + if [[ ${#FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8} -ne 0 ]]; then + ${CLANG_FORMAT_8} -i ${FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8[@]} + EXIT_CODE=$? 
+ if [[ ${EXIT_CODE} -ne 0 ]]; then + INVALID_EXIT=${EXIT_CODE} + fi + fi } function check_python_files() { diff --git a/infra/docker/bionic/Dockerfile b/infra/docker/bionic/Dockerfile index 6a5f64a..15a91d7 100644 --- a/infra/docker/bionic/Dockerfile +++ b/infra/docker/bionic/Dockerfile @@ -30,7 +30,9 @@ RUN apt-get update && apt-get -qqy install libboost-all-dev libgflags-dev libgoo RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget zip unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive \ + apt-get -qqy install doxygen graphviz wget zip unzip clang-format-3.9 clang-format-8 python3 python3-pip python3-venv hdf5-tools pylint curl RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy @@ -49,9 +51,15 @@ RUN update-alternatives --install /usr/bin/aarch64-linux-gnu-gcc aarch64-linux-g --slave /usr/bin/aarch64-linux-gnu-g++ aarch64-linux-gnu-g++ /usr/bin/aarch64-linux-gnu-g++-8 \ --slave /usr/bin/aarch64-linux-gnu-gcov aarch64-linux-gnu-gcov /usr/bin/aarch64-linux-gnu-gcov-8 -# Install lcov 1.13-4 for gcc-8 support (installed lcov 1.13-3 can't support gcc-8) -RUN wget http://launchpadlibrarian.net/370213541/lcov_1.13-4_all.deb -RUN dpkg -i lcov_1.13-4_all.deb +# Install lcov 1.14-2 for gcc-8 support +# Default version lcov 1.13-3 can't support gcc-8 +# lcov 1.13-4 with gcc-8 have bug: reports no coverage for class declaration +WORKDIR /root/lcov +RUN wget http://archive.ubuntu.com/ubuntu/pool/universe/l/lcov/lcov_1.14-2_all.deb +RUN apt-get update && apt-get -qqy install libperlio-gzip-perl libjson-perl +RUN dpkg -i lcov_1.14-2_all.deb +WORKDIR /root +RUN rm -rf /root/lcov # Build and install google test static libraries WORKDIR /root/gtest diff --git a/infra/docker/focal/Dockerfile b/infra/docker/focal/Dockerfile index 7f5a1b9..cccf304 100644 --- a/infra/docker/focal/Dockerfile +++ b/infra/docker/focal/Dockerfile @@ -29,7 +29,9 @@ RUN apt-get update && apt-get -qqy install libboost-all-dev libgflags-dev libgoo RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools (except clang-format-3.9) -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget zip unzip python3 python3-pip python3-venv hdf5-tools pylint +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive \ + apt-get -qqy install doxygen graphviz wget zip unzip clang-format-8 python3 python3-pip python3-venv hdf5-tools pylint curl RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy diff --git a/infra/docker/xenial/Dockerfile b/infra/docker/xenial/Dockerfile index 052cc4f..ae3c464 100644 --- a/infra/docker/xenial/Dockerfile +++ b/infra/docker/xenial/Dockerfile @@ -19,7 +19,8 @@ RUN apt-get update && apt-get -qqy install libboost-all-dev libgflags-dev libgoo RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools -RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN apt-get update && \ + apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 clang-format-8 python3 python3-pip python3-venv hdf5-tools pylint curl RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy diff --git a/infra/nnfw/cmake/CfgOptionFlags.cmake b/infra/nnfw/cmake/CfgOptionFlags.cmake index 
450aa21..f6ad0ca 100644 --- a/infra/nnfw/cmake/CfgOptionFlags.cmake +++ b/infra/nnfw/cmake/CfgOptionFlags.cmake @@ -15,12 +15,7 @@ option(ENABLE_COVERAGE "Build for coverage test" OFF) option(BUILD_EXT_MULTITHREAD "Build external build using multi thread" ON) option(BUILD_ONERT "Build onert" ON) option(BUILD_LOGGING "Build logging runtime" ON) -CMAKE_DEPENDENT_OPTION(BUILD_RUNTIME_NNAPI_TEST "Build Runtime NN API Generated Test" - # Set BUILD_RUNTIME_NNAPI_TEST as ON - # if CMAKE_COMPILER_IS_GNUCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.2 - ON "CMAKE_COMPILER_IS_GNUCC;NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.2" - # Otherwise set BUILD_RUNTIME_NNAPI_TEST as OFF - OFF) +option(BUILD_RUNTIME_NNAPI_TEST "Build Runtime NN API Generated Test" ON) option(BUILD_RUNTIME_NNFW_API_TEST "Build Runtime NNFW API Tests" ON) option(BUILD_TFLITE_RUN "Build tflite-run" ON) option(BUILD_TFLITE_VANILLA_RUN "Build tflite-vanilla-run" OFF) @@ -53,7 +48,6 @@ option(BUILD_MLAPSE "Build mlapse benchmark toolkit" OFF) # option(BUILD_KBENCHMARK "Build kernel benchmark tool" OFF) option(BUILD_OPENCL_TOOL "Build OpenCL tool" OFF) -option(BUILD_NNAPI_QUICKCHECK "Build NN API Quickcheck tools" OFF) option(BUILD_TFLITE_ACCURACY "Build tflite accuracy tool" OFF) # # Default external libraries source download and build configuration @@ -78,6 +72,17 @@ option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" ON) option(BUILD_RUY "Build ruy library from the downloaded source" ON) option(BUILD_CPUINFO "Build cpuinfo library from the downloaded source" ON) option(PROFILE_RUY "Enable ruy library profiling" OFF) +option(DOWNLOAD_XNNPACK "Download xnnpack source" ON) +option(BUILD_XNNPACK "Build xnnpack library from the downloaded source" ON) +option(DOWNLOAD_PTHREADPOOL "Download pthreadpool source" ON) +option(BUILD_PTHREADPOOL "Build pthreadpool library from the source" ON) +option(DOWNLOAD_PSIMD "Download psimd source" ON) +option(BUILD_PSIMD "Build psimd library from the source" ON) +option(DOWNLOAD_FP16 "Download fp16 source" ON) +option(BUILD_FP16 "Build fp16 library from the source" ON) +option(DOWNLOAD_FXDIV "Download fxdiv source" ON) +option(BUILD_FXDIV "Build fxdiv library from the source" ON) + # ## Default sample build configuration diff --git a/infra/nnfw/cmake/options/options_aarch64-android.cmake b/infra/nnfw/cmake/options/options_aarch64-android.cmake index d8eceef..9332f52 100644 --- a/infra/nnfw/cmake/options/options_aarch64-android.cmake +++ b/infra/nnfw/cmake/options/options_aarch64-android.cmake @@ -3,15 +3,10 @@ # NOTE BUILD_ANDROID_TFLITE(JNI lib) is disabled due to BuiltinOpResolver issue. 
# tensorflow-lite does not build BuiltinOpResolver but JNI lib need it # Related Issue : #1403 -option(BUILD_ANDROID_TFLITE "Enable android support for TensorFlow Lite" ON) +option(BUILD_ANDROID_TFLITE "Enable android support for TensorFlow Lite" OFF) option(BUILD_ANDROID_BENCHMARK_APP "Enable Android Benchmark App" ON) option(DOWNLOAD_NEON2SSE "Download NEON2SSE library source" OFF) # Need boost library option(DOWNLOAD_BOOST "Download boost source" ON) option(BUILD_BOOST "Build boost source" ON) -option(BUILD_RUNTIME_NNAPI_TEST "Build Runtime NN API Generated Test" OFF) -option(BUILD_NNAPI_TEST "Build nnapi_test" OFF) -option(BUILD_NNPACKAGE_RUN "Build nnpackge_run" ON) -option(BUILD_TFLITE_RUN "Build tflite-run" ON) -option(BUILD_TFLITE_LOADER_TEST_TOOL "Build tflite loader testing tool" OFF) option(BUILD_LOGGING "Build logging runtime" OFF) diff --git a/infra/nnfw/cmake/options/options_x86_64-darwin.cmake b/infra/nnfw/cmake/options/options_x86_64-darwin.cmake index 97642e6..135cfbf 100644 --- a/infra/nnfw/cmake/options/options_x86_64-darwin.cmake +++ b/infra/nnfw/cmake/options/options_x86_64-darwin.cmake @@ -3,3 +3,4 @@ # option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" OFF) option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" OFF) +option(BUILD_XNNPACK "Build XNNPACK" OFF) diff --git a/infra/nnfw/cmake/options/options_x86_64-linux.cmake b/infra/nnfw/cmake/options/options_x86_64-linux.cmake index 97642e6..135cfbf 100644 --- a/infra/nnfw/cmake/options/options_x86_64-linux.cmake +++ b/infra/nnfw/cmake/options/options_x86_64-linux.cmake @@ -3,3 +3,4 @@ # option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" OFF) option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" OFF) +option(BUILD_XNNPACK "Build XNNPACK" OFF) diff --git a/infra/nnfw/cmake/options/options_x86_64-tizen.cmake b/infra/nnfw/cmake/options/options_x86_64-tizen.cmake index bf8b280..1e83e4e 100644 --- a/infra/nnfw/cmake/options/options_x86_64-tizen.cmake +++ b/infra/nnfw/cmake/options/options_x86_64-tizen.cmake @@ -8,3 +8,5 @@ option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" OFF) option(BUILD_LOGGING "Build logging runtime" OFF) option(GENERATE_RUNTIME_NNAPI_TESTS "Generate NNAPI operation gtest" OFF) option(ENVVAR_ONERT_CONFIG "Use environment variable for onert configuration" OFF) + +option(BUILD_XNNPACK "Build XNNPACK" OFF) diff --git a/infra/nnfw/cmake/packages/CpuInfoConfig.cmake b/infra/nnfw/cmake/packages/CpuInfoConfig.cmake index 408cf85..99ee795 100644 --- a/infra/nnfw/cmake/packages/CpuInfoConfig.cmake +++ b/infra/nnfw/cmake/packages/CpuInfoConfig.cmake @@ -20,6 +20,8 @@ function(_CpuInfo_Build) set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "Build cpuinfo micro-benchmarks") add_extdirectory("${CpuInfoSource_DIR}" cpuinfo EXCLUDE_FROM_ALL) set_target_properties(cpuinfo PROPERTIES POSITION_INDEPENDENT_CODE ON) + # Suppress warnings generated by clog + set_target_properties(clog PROPERTIES COMPILE_FLAGS "-Wno-unused-result") set(CpuInfoSource_DIR ${CpuInfoSource_DIR} PARENT_SCOPE) set(CpuInfo_FOUND TRUE PARENT_SCOPE) endfunction(_CpuInfo_Build) diff --git a/infra/nnfw/cmake/packages/Fp16Config.cmake b/infra/nnfw/cmake/packages/Fp16Config.cmake new file mode 100644 index 0000000..6c31613 --- /dev/null +++ b/infra/nnfw/cmake/packages/Fp16Config.cmake @@ -0,0 +1,30 @@ +function(_Fp16_Build) + nnas_find_package(Fp16Source QUIET) + + # NOTE This line prevents multiple definitions of target + if(TARGET fp16) + set(Fp16Source_DIR ${Fp16Source_DIR} PARENT_SCOPE) + set(Fp16_FOUND 
TRUE PARENT_SCOPE) + return() + endif(TARGET fp16) + + if(NOT Fp16Source_FOUND) + message(STATUS "FP16: Source not found") + set(Fp16_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT Fp16Source_FOUND) + + set(FP16_BUILD_TESTS OFF CACHE BOOL "Build FP16 unit tests") + set(FP16_BUILD_BENCHMARKS OFF CACHE BOOL "Build FP16 micro-benchmarks") + nnas_find_package(PsimdSource) + set(PSIMD_SOURCE_DIR ${PsimdSource_DIR} CACHE STRING "String to disable download PSIMD on fp16") + add_extdirectory("${Fp16Source_DIR}" FP16 EXCLUDE_FROM_ALL) + set(Fp16Source_DIR ${Fp16Source_DIR} PARENT_SCOPE) + set(Fp16_FOUND TRUE PARENT_SCOPE) +endfunction(_Fp16_Build) + +if(BUILD_FP16) + _Fp16_Build() +else() + set(Fp16_FOUND FALSE) +endif() diff --git a/infra/nnfw/cmake/packages/FxdivConfig.cmake b/infra/nnfw/cmake/packages/FxdivConfig.cmake new file mode 100644 index 0000000..6f268ae --- /dev/null +++ b/infra/nnfw/cmake/packages/FxdivConfig.cmake @@ -0,0 +1,29 @@ +function(_Fxdiv_Build) + nnas_find_package(FxdivSource QUIET) + + # NOTE This line prevents multiple definitions of target + if(TARGET fxdiv) + set(FxdivSource_DIR ${FxdivSource_DIR} PARENT_SCOPE) + set(Fxdiv_FOUND TRUE PARENT_SCOPE) + return() + endif(TARGET fxdiv) + + if(NOT FxdivSource_FOUND) + message(STATUS "FXDIV: Source not found") + set(Fxdiv_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT FxdivSource_FOUND) + + set(FXDIV_BUILD_TESTS OFF CACHE BOOL "Build FXdiv unit tests") + set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "Build FXdiv micro-benchmarks") + + add_extdirectory("${FxdivSource_DIR}" FXDIV EXCLUDE_FROM_ALL) + set(FxdivSource_DIR ${FxdivSource_DIR} PARENT_SCOPE) + set(Fxdiv_FOUND TRUE PARENT_SCOPE) +endfunction(_Fxdiv_Build) + +if(BUILD_FXDIV) + _Fxdiv_Build() +else() + set(Fxdiv_FOUND FALSE) +endif() diff --git a/infra/nnfw/cmake/packages/PsimdConfig.cmake b/infra/nnfw/cmake/packages/PsimdConfig.cmake new file mode 100644 index 0000000..a3587b6 --- /dev/null +++ b/infra/nnfw/cmake/packages/PsimdConfig.cmake @@ -0,0 +1,26 @@ +function(_Psimd_Build) + nnas_find_package(PsimdSource QUIET) + + # NOTE This line prevents multiple definitions of target + if(TARGET psimd) + set(PsimdSource_DIR ${PsimdSource_DIR} PARENT_SCOPE) + set(Psimd_FOUND TRUE PARENT_SCOPE) + return() + endif(TARGET psimd) + + if(NOT PsimdSource_FOUND) + message(STATUS "PSIMD: Source not found") + set(Psimd_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT PsimdSource_FOUND) + + add_extdirectory("${PsimdSource_DIR}" PSIMD EXCLUDE_FROM_ALL) + set(PsimdSource_DIR ${PsimdSource_DIR} PARENT_SCOPE) + set(Psimd_FOUND TRUE PARENT_SCOPE) +endfunction(_Psimd_Build) + +if(BUILD_PSIMD) + _Psimd_Build() +else() + set(Psimd_FOUND FALSE) +endif() diff --git a/infra/nnfw/cmake/packages/PthreadpoolConfig.cmake b/infra/nnfw/cmake/packages/PthreadpoolConfig.cmake new file mode 100644 index 0000000..6283826 --- /dev/null +++ b/infra/nnfw/cmake/packages/PthreadpoolConfig.cmake @@ -0,0 +1,35 @@ +function(_Pthreadpool_Build) + nnas_find_package(PthreadpoolSource QUIET) + + # NOTE This line prevents multiple definitions of target + if(TARGET pthreadpool) + set(PthreadpoolSource_DIR ${PthreadpoolSource_DIR} PARENT_SCOPE) + set(Pthreadpool_FOUND TRUE PARENT_SCOPE) + return() + endif(TARGET pthreadpool) + + if(NOT PthreadpoolSource_FOUND) + message(STATUS "PTHREADPOOL: Source not found") + set(Pthreadpool_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT PthreadpoolSource_FOUND) + + SET(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "Build pthreadpool unit tests") + SET(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE 
BOOL "Build pthreadpool micro-benchmarks") + + nnas_find_package(FxdivSource) + set(FXDIV_SOURCE_DIR ${FxdivSource_DIR} CACHE STRING "String to disable download FXDIV") + + add_extdirectory("${PthreadpoolSource_DIR}" PTHREADPOOL EXCLUDE_FROM_ALL) + set_target_properties(pthreadpool PROPERTIES POSITION_INDEPENDENT_CODE ON) + # Suppress warnings generated by pthreadpool + set_target_properties(pthreadpool PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") + set(PthreadpoolSource_DIR ${PthreadpoolSource_DIR} PARENT_SCOPE) + set(Pthreadpool_FOUND TRUE PARENT_SCOPE) +endfunction(_Pthreadpool_Build) + +if(BUILD_PTHREADPOOL) + _Pthreadpool_Build() +else() + set(Pthreadpool_FOUND FALSE) +endif() diff --git a/infra/nnfw/cmake/packages/XnnpackConfig.cmake b/infra/nnfw/cmake/packages/XnnpackConfig.cmake new file mode 100644 index 0000000..191a28f --- /dev/null +++ b/infra/nnfw/cmake/packages/XnnpackConfig.cmake @@ -0,0 +1,38 @@ +function(_Xnnpack_Build) + nnas_find_package(XnnpackSource QUIET) + nnfw_find_package(Fxdiv QUIET) + nnfw_find_package(CpuInfo QUIET) + nnfw_find_package(Pthreadpool QUIET) + nnfw_find_package(Psimd QUIET) + nnfw_find_package(Fp16 QUIET) + + # NOTE This line prevents multiple definitions of cpuinfo target + if(TARGET XNNPACK) + set(XnnpackSource_DIR ${XnnpackSource_DIR} PARENT_SCOPE) + set(Xnnpack_FOUND TRUE PARENT_SCOPE) + return() + endif(TARGET XNNPACK) + + if(NOT XnnpackSource_FOUND) + message(STATUS "XNNPACK: Source not found") + set(Xnnpack_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT XnnpackSource_FOUND) + + set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "Build XNNPACK unit tests") + set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "Build XNNPACK benchmarks") + set(XNNPACK_USE_SYSTEM_LIBS ON CACHE BOOL "Use system-provided dependency libraries") + + add_extdirectory("${XnnpackSource_DIR}" XNNPACK EXCLUDE_FROM_ALL) + set_target_properties(XNNPACK PROPERTIES POSITION_INDEPENDENT_CODE ON) + # Suppress warnings generated by xnnpack + set_target_properties(XNNPACK PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") + set(XnnpackSource_DIR ${XnnpackSource_DIR} PARENT_SCOPE) + set(Xnnpack_FOUND TRUE PARENT_SCOPE) +endfunction(_Xnnpack_Build) + +if(BUILD_XNNPACK) + _Xnnpack_Build() +else(BUILD_XNNPACK) + set(Xnnpack_FOUND FALSE) +endif(BUILD_XNNPACK) diff --git a/infra/nnfw/command/build b/infra/nnfw/command/build index b0301d2..4a3601e 100644 --- a/infra/nnfw/command/build +++ b/infra/nnfw/command/build @@ -8,4 +8,4 @@ if [[ ! -d "${BUILD_PATH}" ]]; then fi cd ${BUILD_PATH} -make "$@" +cmake --build . 
-- "$@" diff --git a/infra/scripts/build_android_runtime_release.sh b/infra/scripts/build_android_runtime_release.sh index fe933c6..c9a3b1b 100755 --- a/infra/scripts/build_android_runtime_release.sh +++ b/infra/scripts/build_android_runtime_release.sh @@ -18,4 +18,5 @@ fi export TARGET_OS=android export CROSS_BUILD=1 -make -f Makefile.template +export BUILD_TYPE=release +make -f Makefile.template install diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh index 011d14c..607526b 100755 --- a/infra/scripts/docker_build_cross_aarch64_runtime.sh +++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh @@ -22,6 +22,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh index 551fb57..07b5ca4 100755 --- a/infra/scripts/docker_build_cross_arm_runtime.sh +++ b/infra/scripts/docker_build_cross_arm_runtime.sh @@ -22,6 +22,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh index 876f318..8d04438 100755 --- a/infra/scripts/docker_build_cross_arm_runtime_release.sh +++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh @@ -22,6 +22,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh index f42251b..e03ea75 100755 --- a/infra/scripts/docker_build_cross_coverage.sh +++ b/infra/scripts/docker_build_cross_coverage.sh @@ -22,6 +22,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh index 5fd49a4..e65feb5 100755 --- a/infra/scripts/docker_build_nncc.sh +++ b/infra/scripts/docker_build_nncc.sh @@ -35,6 +35,8 @@ if [ -d $ONNXRUNTIME_PREFIX ]; then fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_test_x64.sh b/infra/scripts/docker_build_test_x64.sh index 16fcf3f..0d2395b 100755 --- a/infra/scripts/docker_build_test_x64.sh +++ b/infra/scripts/docker_build_test_x64.sh @@ -14,6 +14,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_build_tizen_cross.sh 
b/infra/scripts/docker_build_tizen_cross.sh index ee0f183..9a8378f 100755 --- a/infra/scripts/docker_build_tizen_cross.sh +++ b/infra/scripts/docker_build_tizen_cross.sh @@ -22,6 +22,8 @@ else fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh index 55adaa1..ef6212a 100755 --- a/infra/scripts/docker_collect_nnpkg_resources.sh +++ b/infra/scripts/docker_collect_nnpkg_resources.sh @@ -40,6 +40,8 @@ if [ -d $ONNXRUNTIME_PREFIX ]; then fi # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/docker_coverage_report.sh b/infra/scripts/docker_coverage_report.sh index 677462d..f0de1de 100755 --- a/infra/scripts/docker_coverage_report.sh +++ b/infra/scripts/docker_coverage_report.sh @@ -8,6 +8,8 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # docker image name +# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial" +# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic" if [[ -z $DOCKER_IMAGE_NAME ]]; then echo "It will use default docker image name" fi diff --git a/infra/scripts/test_ubuntu_runtime_mixed.sh b/infra/scripts/test_ubuntu_runtime_mixed.sh index 40f59eb..6eab90c 100755 --- a/infra/scripts/test_ubuntu_runtime_mixed.sh +++ b/infra/scripts/test_ubuntu_runtime_mixed.sh @@ -58,5 +58,6 @@ export OP_BACKEND_Conv2D="cpu" export OP_BACKEND_MaxPool2D="acl_cl" export OP_BACKEND_AvgPool2D="acl_neon" export ACL_LAYOUT="NCHW" +export RUY_THREADS=4 NNAPIGTest "acl_cl;acl_neon;cpu" "Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}.union" "report/mixed" TFLiteModelVerification "acl_cl;acl_neon;cpu" "${TESTLIST_PREFIX}.intersect.txt" "report/mixed" diff --git a/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST b/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST index 1d96cce..3ed12f9 100644 --- a/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST +++ b/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST @@ -1,7 +1,8 @@ { "major-version" : "1", - "minor-version" : "0", + "minor-version" : "1", "patch-version" : "0", + "configs" : [ "config.cfg" ], "models" : [ "add.tflite" ], "model-types" : [ "tflite" ] } diff --git a/nnpackage/examples/one_op_in_tflite/metadata/config.cfg b/nnpackage/examples/one_op_in_tflite/metadata/config.cfg new file mode 100644 index 0000000..776fa70 --- /dev/null +++ b/nnpackage/examples/one_op_in_tflite/metadata/config.cfg @@ -0,0 +1 @@ +BACKENDS="cpu" diff --git a/nnpackage/spec/10_packaging_and_manifest.md b/nnpackage/spec/10_packaging_and_manifest.md index d4e6ec8..4dc3de8 100644 --- a/nnpackage/spec/10_packaging_and_manifest.md +++ b/nnpackage/spec/10_packaging_and_manifest.md @@ -18,11 +18,13 @@ For `model` and `custom_op`, see [20_model_and_operators.md](20_model_and_operat nnpackage ├── custom_op ├── metadata -│   └── MANIFEST +│   ├── MANIFEST +│   └── config.cfg └── mymodel.model ``` - `mymodel.model` is a model file that has computation graph and weights. +- `config.cfg` is a configuration file that has parameters to configure onert. 
- `metadata` is a directory that contains all metadata including `MANIFEST`. - `MANIFEST` is a collection of attributes about this package. - `custom_op` is a directory that contains implementation objects. @@ -61,6 +63,11 @@ For detail, see [semantic versioning 2.0.0](https://semver.org/) `patch-version` is the patch version of `nnpackage`. +#### configs + +`configs` is an array of configuration file names placed in `metadata` folder. This can be empty or +attribute itself can be omitted. As of now we only support only one item. + #### models `models` is an array of path to model files, which is relative path from top level directory of this package. @@ -84,9 +91,25 @@ Here is an example of `MANIFEST`. ``` { "major-version" : "1", - "minor-version" : "0", + "minor-version" : "1", "patch-version" : "0", + "configs" : [ "model.cfg" ], "models" : [ "mymodel.model", "yourmodel.model" ], "model-types" : [ "tflite", "circle" ] } ``` + +## 5. Configuration file + +Configuration file is a human readable plain text file having one `key=value` in each line. +- `#` is used as comment and will be ignored afterwards. +- all leading and trailing white spaces will be ignored in both `key` and `value`. + +For example +``` +BACKENDS=cpu +# leading/trailing space is ignored + EXCUTOR=Linear # some comment +``` + +Refer `runtime/onert/core/include/util/Config.lst` file for more information of `key`. diff --git a/packaging/FP16.tar.gz b/packaging/FP16.tar.gz new file mode 100644 index 0000000..ebd2764 Binary files /dev/null and b/packaging/FP16.tar.gz differ diff --git a/packaging/FXDIV.tar.gz b/packaging/FXDIV.tar.gz new file mode 100644 index 0000000..7c1b825 Binary files /dev/null and b/packaging/FXDIV.tar.gz differ diff --git a/packaging/PSIMD.tar.gz b/packaging/PSIMD.tar.gz new file mode 100644 index 0000000..3ae8924 Binary files /dev/null and b/packaging/PSIMD.tar.gz differ diff --git a/packaging/PTHREADPOOL.tar.gz b/packaging/PTHREADPOOL.tar.gz new file mode 100644 index 0000000..6cf42c0 Binary files /dev/null and b/packaging/PTHREADPOOL.tar.gz differ diff --git a/packaging/XNNPACK.tar.gz b/packaging/XNNPACK.tar.gz new file mode 100644 index 0000000..d770c2c Binary files /dev/null and b/packaging/XNNPACK.tar.gz differ diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec index 18150f3..028d88b 100644 --- a/packaging/nnfw.spec +++ b/packaging/nnfw.spec @@ -1,6 +1,6 @@ Name: nnfw Summary: nnfw -Version: 1.11.1 +Version: 1.12.0 Release: 1 Group: Development License: Apache-2.0 and MIT and BSD-2-Clause @@ -13,6 +13,11 @@ Source1003: eigen.tar.gz Source1004: gemmlowp.tar.gz Source1005: ruy.tar.gz Source1006: cpuinfo.tar.gz +Source1007: XNNPACK.tar.gz +Source1008: FXDIV.tar.gz +Source1009: PTHREADPOOL.tar.gz +Source1010: PSIMD.tar.gz +Source1011: FP16.tar.gz Source2001: nnfw.pc.in Source2002: nnfw-plugin.pc.in @@ -116,6 +121,11 @@ tar -xf %{SOURCE1003} -C ./externals tar -xf %{SOURCE1004} -C ./externals tar -xf %{SOURCE1005} -C ./externals tar -xf %{SOURCE1006} -C ./externals +tar -xf %{SOURCE1007} -C ./externals +tar -xf %{SOURCE1008} -C ./externals +tar -xf %{SOURCE1009} -C ./externals +tar -xf %{SOURCE1010} -C ./externals +tar -xf %{SOURCE1011} -C ./externals %build %ifarch arm armv7l aarch64 x86_64 diff --git a/res/CircleRecipes/InstanceNorm_001/test.recipe b/res/CircleRecipes/InstanceNorm_001/test.recipe new file mode 100644 index 0000000..ec647c3 --- /dev/null +++ b/res/CircleRecipes/InstanceNorm_001/test.recipe @@ -0,0 +1,47 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 
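[Editor's note on the configuration-file format added in the packaging spec above] To make the `key=value` rules from "5. Configuration file" concrete, the sketch below reads a `config.cfg` into a key/value map: everything from `#` to the end of a line is dropped as a comment, leading and trailing whitespace is trimmed from both key and value, and lines without `=` are skipped. This is a hypothetical illustration of the documented format only, not onert's actual loader; the supported keys are the ones listed in the `runtime/onert/core/include/util/Config.lst` file referenced above.

```
// Minimal sketch of the documented config.cfg rules (assumption: not onert's real parser).
#include <fstream>
#include <map>
#include <string>

static std::string trim(const std::string &s)
{
  const auto b = s.find_first_not_of(" \t\r");
  if (b == std::string::npos)
    return "";
  const auto e = s.find_last_not_of(" \t\r");
  return s.substr(b, e - b + 1);
}

std::map<std::string, std::string> load_config(const std::string &path)
{
  std::map<std::string, std::string> config;
  std::ifstream in(path);
  std::string line;
  while (std::getline(in, line))
  {
    line = line.substr(0, line.find('#')); // '#' starts a comment; ignore the rest of the line
    const auto eq = line.find('=');
    if (eq == std::string::npos)
      continue; // not a key=value line
    const auto key = trim(line.substr(0, eq));
    const auto value = trim(line.substr(eq + 1));
    if (!key.empty())
      config[key] = value; // e.g. config["BACKENDS"] == "cpu" for the example package above
  }
  return config;
}
```

With the example `config.cfg` shown earlier (`BACKENDS="cpu"` or `BACKENDS=cpu`), this would yield a single entry keyed by `BACKENDS`; since the MANIFEST's `configs` array currently supports only one item, a runtime would typically load exactly one such file per package.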
dim: 4 } +} +operand { + name: "gamma" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "0.0123" + arg: "-0.3324" + arg: "0.2324" + arg: "-3.3360" + } +} +operand { + name: "beta" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "0.7023" + arg: "-0.3092" + arg: "0.7552" + arg: "0.2729" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } +} +operation { + type: "InstanceNorm" + input: "ifm" + input: "gamma" + input: "beta" + output: "ofm" + instance_norm_options { + epsilon: 0.001 + activation: NONE + } +} +input: "ifm" +output: "ofm" diff --git a/res/CircleRecipes/InstanceNorm_001/test.reverse b/res/CircleRecipes/InstanceNorm_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Mean_U8_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/Mean_U8_dynamic_000/test.recipe new file mode 100644 index 0000000..bed2563 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Mean_U8_dynamic_000/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 8 dim: 8 dim: 4 } + quant { min: -128 max: 127 scale: 1 zero_point: 128 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 2 } + filler { tag: "explicit" arg: "1" arg: "2" } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 1 dim: 1 dim: 4 } + quant { min: -256 max: 254 scale: 2 zero_point: 128 } + shape_signature { dim: -1 dim: 1 dim: 1 dim: 4 } +} +operation { + type: "Mean" + mean_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Mean_U8_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/Mean_U8_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Mean_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/Mean_dynamic_000/test.recipe new file mode 100644 index 0000000..a098c62 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Mean_dynamic_000/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 4 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "-1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 1 } +} +operation { + type: "Mean" + mean_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Mean_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/Mean_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Mean_dynamic_001/test.recipe b/res/TensorFlowLiteRecipes/Mean_dynamic_001/test.recipe new file mode 100644 index 0000000..bd1a462 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Mean_dynamic_001/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 4 } + shape_signature { dim: -1 dim: 4 } +} +operation { + type: "Mean" + mean_options { + keep_dims: false + } + input: "ifm" + input: 
"reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Mean_dynamic_001/test.reverse b/res/TensorFlowLiteRecipes/Mean_dynamic_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReLU6_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReLU6_dynamic_000/test.recipe new file mode 100644 index 0000000..e6dee0e --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReLU6_dynamic_000/test.recipe @@ -0,0 +1,19 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operation { + type: "ReLU6" + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReLU6_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReLU6_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReLUN1To1_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReLUN1To1_dynamic_000/test.recipe new file mode 100644 index 0000000..21c237f --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReLUN1To1_dynamic_000/test.recipe @@ -0,0 +1,19 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operation { + type: "ReLUN1To1" + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReLUN1To1_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReLUN1To1_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReLU_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReLU_dynamic_000/test.recipe new file mode 100644 index 0000000..fa4293e --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReLU_dynamic_000/test.recipe @@ -0,0 +1,19 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + shape_signature { dim: -1 dim: 3 dim: 3 dim: 2 } +} +operation { + type: "ReLU" + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReLU_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReLU_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_000/test.recipe new file mode 100644 index 0000000..427bd05 --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_000/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: BOOL + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 3 } + filler { + tag: "explicit" + arg: "0" arg: "1" arg: "2" + } +} +operand { + name: "ofm" + type: BOOL + shape { } +} +operation { + type: "ReduceAny" + reduce_any_options { + keep_dims: false + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_000/test.reverse new file 
mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_001/test.recipe b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_001/test.recipe new file mode 100644 index 0000000..9c3a5e8 --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_001/test.recipe @@ -0,0 +1,32 @@ +operand { + name: "ifm" + type: BOOL + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { + tag: "explicit" + arg: "1" + } +} +operand { + name: "ofm" + type: BOOL + shape { dim: 1 dim: 4 } + shape_signature { dim: -1 dim: 4 } +} +operation { + type: "ReduceAny" + reduce_any_options { + keep_dims: false + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_001/test.reverse b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_002/test.recipe b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_002/test.recipe new file mode 100644 index 0000000..109a3cb --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_002/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: BOOL + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 3 } + filler { + tag: "explicit" + arg: "0" arg: "1" arg: "2" + } +} +operand { + name: "ofm" + type: BOOL + shape { dim: 1 dim: 1 dim: 1 } +} +operation { + type: "ReduceAny" + reduce_any_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_002/test.reverse b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_002/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_003/test.recipe b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_003/test.recipe new file mode 100644 index 0000000..1355f2b --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_003/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: BOOL + shape { dim: 2 dim: 1 dim: 4 } + shape_signature { dim: 2 dim: -1 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { + tag: "explicit" + arg: "1" + } +} +operand { + name: "ofm" + type: BOOL + shape { dim: 2 dim: 1 dim: 4 } +} +operation { + type: "ReduceAny" + reduce_any_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceAny_dynamic_003/test.reverse b/res/TensorFlowLiteRecipes/ReduceAny_dynamic_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceMax_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReduceMax_dynamic_000/test.recipe new file mode 100644 index 0000000..01669be --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceMax_dynamic_000/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 4 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 4 } +} +operand { + name: "axis" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "-1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 1 } +} 
+operation { + type: "ReduceMax" + reduce_max_options { + keep_dims: true + } + input: "ifm" + input: "axis" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceMax_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReduceMax_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceMin_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReduceMin_dynamic_000/test.recipe new file mode 100644 index 0000000..50603ba --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceMin_dynamic_000/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 4 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 4 } +} +operand { + name: "axis" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "-1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 1 } +} +operation { + type: "ReduceMin" + reduce_min_options { + keep_dims: true + } + input: "ifm" + input: "axis" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceMin_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReduceMin_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_000/test.recipe new file mode 100644 index 0000000..e81db67 --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_000/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 3 } + filler { + tag: "explicit" + arg: "0" arg: "1" arg: "2" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { } +} +operation { + type: "ReduceProd" + reduce_prod_options { + keep_dims: false + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_001/test.recipe b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_001/test.recipe new file mode 100644 index 0000000..f2811b3 --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_001/test.recipe @@ -0,0 +1,32 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { + tag: "explicit" + arg: "1" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 4 } + shape_signature { dim: -1 dim: 4 } +} +operation { + type: "ReduceProd" + reduce_prod_options { + keep_dims: false + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_001/test.reverse b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_002/test.recipe b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_002/test.recipe new file mode 100644 index 0000000..c1e14c5 --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_002/test.recipe @@ -0,0 +1,31 @@ +operand { + name: 
"ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 3 } + filler { + tag: "explicit" + arg: "0" arg: "1" arg: "2" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 } +} +operation { + type: "ReduceProd" + reduce_prod_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_002/test.reverse b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_002/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_003/test.recipe b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_003/test.recipe new file mode 100644 index 0000000..4e4633f --- /dev/null +++ b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_003/test.recipe @@ -0,0 +1,31 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 2 dim: 1 dim: 4 } + shape_signature { dim: 2 dim: -1 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { + tag: "explicit" + arg: "1" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 2 dim: 1 dim: 4 } +} +operation { + type: "ReduceProd" + reduce_prod_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/ReduceProd_dynamic_003/test.reverse b/res/TensorFlowLiteRecipes/ReduceProd_dynamic_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Sum_dynamic_000/test.recipe b/res/TensorFlowLiteRecipes/Sum_dynamic_000/test.recipe new file mode 100644 index 0000000..99b089e --- /dev/null +++ b/res/TensorFlowLiteRecipes/Sum_dynamic_000/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 4 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "-1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + shape_signature { dim: -1 dim: 8 dim: 8 dim: 1 } +} +operation { + type: "Sum" + sum_options { + keep_dims: true + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Sum_dynamic_000/test.reverse b/res/TensorFlowLiteRecipes/Sum_dynamic_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Sum_dynamic_001/test.recipe b/res/TensorFlowLiteRecipes/Sum_dynamic_001/test.recipe new file mode 100644 index 0000000..46fac49 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Sum_dynamic_001/test.recipe @@ -0,0 +1,29 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 4 } + shape_signature { dim: -1 dim: 3 dim: 4 } +} +operand { + name: "reduction_indices" + type: INT32 + shape { dim: 1 } + filler { tag: "explicit" arg: "1" } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 4 } + shape_signature { dim: -1 dim: 4 } +} +operation { + type: "Sum" + sum_options { + keep_dims: false + } + input: "ifm" + input: "reduction_indices" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/runtime/contrib/.clang-format b/runtime/contrib/.clang-format new file mode 120000 index 0000000..f761fe4 --- /dev/null +++ b/runtime/contrib/.clang-format @@ -0,0 +1 @@ +../../.clang-format.8 \ 
No newline at end of file diff --git a/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_log.h b/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_log.h index 69dfcc7..3d71f89 100644 --- a/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_log.h +++ b/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_log.h @@ -47,12 +47,11 @@ extern "C" { } \ } while (0) #else // __TIZEN__ -#define LEVEL_TO_STR(level) \ - (((level) == ERROR) \ - ? "ERROR" \ - : ((level) == WARNING) \ - ? "WARNING" \ - : ((level) == INFO) ? "INFO" : ((level) == DEBUG) ? "DEBUG" : "DEFAULT") +#define LEVEL_TO_STR(level) \ + (((level) == ERROR) ? "ERROR" \ + : ((level) == WARNING) \ + ? "WARNING" \ + : ((level) == INFO) ? "INFO" : ((level) == DEBUG) ? "DEBUG" : "DEFAULT") #define TFLITE_NATIVE_LOG(log_level, format, args...) \ do \ { \ diff --git a/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_nativewrapper.h b/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_nativewrapper.h index b099ba9..2fb98cc 100644 --- a/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_nativewrapper.h +++ b/runtime/contrib/TFLiteSharp/TFLiteNative/include/tflite_nativewrapper.h @@ -26,7 +26,8 @@ extern "C" { #endif /*__cplusplus*/ -typedef enum { +typedef enum +{ /** 32-bit signed integer. */ INT32 = 1, diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk index 7d9f565..c00c7d3 100644 --- a/runtime/contrib/android/api/Prebuilt.mk +++ b/runtime/contrib/android/api/Prebuilt.mk @@ -21,14 +21,6 @@ LOCAL_SRC_FILES := \ $(ONERT_PREBUILT_LIB_DIR)/libtflite_loader.so include $(PREBUILT_SHARED_LIBRARY) -# libtensorflowlite_jni -include $(CLEAR_VARS) -LOCAL_MODULE := tensorflowlite_jni -PREBUILT_LIB += tensorflowlite_jni -LOCAL_SRC_FILES := \ - $(ONERT_PREBUILT_LIB_DIR)/libtensorflowlite_jni.so -include $(PREBUILT_SHARED_LIBRARY) - # libnnfw include $(CLEAR_VARS) LOCAL_MODULE := nnfw-dev diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle index d383b2d..6bb7a56 100644 --- a/runtime/contrib/android/api/build.gradle +++ b/runtime/contrib/android/api/build.gradle @@ -8,7 +8,7 @@ android { minSdkVersion 26 targetSdkVersion 29 versionCode 1 - versionName "1.11.1" + versionName "1.12.0" externalNativeBuild { ndkBuild { diff --git a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp index 209264d..72e73be 100644 --- a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp +++ b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp @@ -52,7 +52,7 @@ JNIEXPORT void JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeCloseSe } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeLoadModelFromFile( - JNIEnv *env, jobject, jlong handle, jstring jnnpkg_path) + JNIEnv *env, jobject, jlong handle, jstring jnnpkg_path) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -103,7 +103,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeRun } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetInput( - JNIEnv *env, jobject, jlong handle, jint jindex, jint jtype, jobject jbuf, jint jbufsize) + JNIEnv *env, jobject, jlong handle, jint jindex, jint jtype, jobject jbuf, jint jbufsize) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -129,7 +129,7 @@ JNIEXPORT jboolean JNICALL 
Java_com_samsung_onert_NativeSessionWrapper_nativeSet } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetOutput( - JNIEnv *env, jobject, jlong handle, jint jindex, jint jtype, jobject jbuf, jint jbufsize) + JNIEnv *env, jobject, jlong handle, jint jindex, jint jtype, jobject jbuf, jint jbufsize) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -156,7 +156,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetInputLayout( - JNIEnv *, jobject, jlong handle, jint jindex, jint jlayout) + JNIEnv *, jobject, jlong handle, jint jindex, jint jlayout) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -178,7 +178,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetOutputLayout( - JNIEnv *, jobject, jlong handle, jint jindex, jint jlayout) + JNIEnv *, jobject, jlong handle, jint jindex, jint jlayout) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -234,7 +234,7 @@ JNIEXPORT jint JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetOutp } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetAvailableBackends( - JNIEnv *env, jobject, jlong handle, jstring jbackends) + JNIEnv *env, jobject, jlong handle, jstring jbackends) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -255,7 +255,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetInputTensorInfo( - JNIEnv *env, jobject, jlong handle, jint jindex, jobject jinfo) + JNIEnv *env, jobject, jlong handle, jint jindex, jobject jinfo) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; @@ -277,7 +277,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGet } JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetOutputTensorInfo( - JNIEnv *env, jobject, jlong handle, jint jindex, jobject jinfo) + JNIEnv *env, jobject, jlong handle, jint jindex, jobject jinfo) { if (jni_helper::verifyHandle(handle) == JNI_FALSE) return JNI_FALSE; diff --git a/runtime/contrib/android/api/src/main/native/onert-native-api.h b/runtime/contrib/android/api/src/main/native/onert-native-api.h index 13768d4..7997530 100644 --- a/runtime/contrib/android/api/src/main/native/onert-native-api.h +++ b/runtime/contrib/android/api/src/main/native/onert-native-api.h @@ -46,7 +46,7 @@ JNIEXPORT void JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeCloseSe * Signature: (JLjava/lang/String;)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeLoadModelFromFile( - JNIEnv *, jobject, jlong, jstring); + JNIEnv *, jobject, jlong, jstring); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -71,7 +71,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeRun * Signature: (JIILjava/nio/ByteBuffer;I)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetInput( - JNIEnv *, jobject, jlong, jint, jint, jobject, jint); + JNIEnv *, jobject, jlong, jint, jint, jobject, jint); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -79,7 +79,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet * Signature: 
(JIILjava/nio/ByteBuffer;I)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetOutput( - JNIEnv *, jobject, jlong, jint, jint, jobject, jint); + JNIEnv *, jobject, jlong, jint, jint, jobject, jint); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -87,7 +87,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet * Signature: (JII)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetInputLayout( - JNIEnv *, jobject, jlong, jint, jint); + JNIEnv *, jobject, jlong, jint, jint); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -95,7 +95,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet * Signature: (JII)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetOutputLayout( - JNIEnv *, jobject, jlong, jint, jint); + JNIEnv *, jobject, jlong, jint, jint); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -121,7 +121,7 @@ JNIEXPORT jint JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetOutp * Signature: (JILcom/samsung/onert/NativeSessionWrapper/InternalTensorInfo;)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetInputTensorInfo( - JNIEnv *, jobject, jlong, jint, jobject); + JNIEnv *, jobject, jlong, jint, jobject); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -129,7 +129,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGet * Signature: (JILcom/samsung/onert/NativeSessionWrapper/InternalTensorInfo;)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGetOutputTensorInfo( - JNIEnv *, jobject, jlong, jint, jobject); + JNIEnv *, jobject, jlong, jint, jobject); /* * Class: com_samsung_onert_NativeSessionWrapper @@ -137,7 +137,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeGet * Signature: (JLjava/lang/String;)Z */ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSetAvailableBackends( - JNIEnv *, jobject, jlong, jstring); + JNIEnv *, jobject, jlong, jstring); #ifdef __cplusplus } diff --git a/runtime/contrib/android_benchmark_app/cpp/ndk_main.cpp b/runtime/contrib/android_benchmark_app/cpp/ndk_main.cpp index 4b0e439..8df179a 100644 --- a/runtime/contrib/android_benchmark_app/cpp/ndk_main.cpp +++ b/runtime/contrib/android_benchmark_app/cpp/ndk_main.cpp @@ -173,7 +173,7 @@ inline void runBenchmark(JNIEnv *env, jobject thisObj, Activity &act) } JNIEXPORT void JNICALL Java_com_ndk_tflbench_MainActivity_runInterpreterBenchmark( - JNIEnv *env, jobject thisObj, jobject model_buffer) + JNIEnv *env, jobject thisObj, jobject model_buffer) { setTitle(env, thisObj, "Running Interpreter Benchmark"); diff --git a/runtime/contrib/android_tflite/builtin_ops_jni.cc b/runtime/contrib/android_tflite/builtin_ops_jni.cc index 5770701..597f11a 100644 --- a/runtime/contrib/android_tflite/builtin_ops_jni.cc +++ b/runtime/contrib/android_tflite/builtin_ops_jni.cc @@ -24,7 +24,7 @@ namespace tflite std::unique_ptr CreateOpResolver() { return std::unique_ptr<::nnfw::tflite::BuiltinOpResolver>( - new ::nnfw::tflite::BuiltinOpResolver()); + new ::nnfw::tflite::BuiltinOpResolver()); } } // namespace tflite diff --git a/runtime/contrib/heap_trace/src/cl_create_buffer_stub.cc b/runtime/contrib/heap_trace/src/cl_create_buffer_stub.cc index d9d2700..2affbe0 100644 --- a/runtime/contrib/heap_trace/src/cl_create_buffer_stub.cc +++ b/runtime/contrib/heap_trace/src/cl_create_buffer_stub.cc 
@@ -31,8 +31,8 @@ cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void static auto isOriginalFunctionCallSuccessful = [](cl_mem result) -> bool { return result; }; static auto originalFunction = - findFunctionByName( - "clCreateBuffer"); + findFunctionByName( + "clCreateBuffer"); cl_mem result = originalFunction(context, flags, size, host_ptr, errcode_ret); if (isOriginalFunctionCallSuccessful(result) && !Trace::Guard{}.isActive()) { diff --git a/runtime/contrib/heap_trace/src/memory_pool_for_symbol_searcher_internals.h b/runtime/contrib/heap_trace/src/memory_pool_for_symbol_searcher_internals.h index 89797ad..3186c7f 100644 --- a/runtime/contrib/heap_trace/src/memory_pool_for_symbol_searcher_internals.h +++ b/runtime/contrib/heap_trace/src/memory_pool_for_symbol_searcher_internals.h @@ -60,7 +60,7 @@ private: { uint8_t *ptr_to_the_free_space_after_allocation = _ptr_to_free_space_start + size; size_t size_of_reserved_space_after_allocation = - ptr_to_the_free_space_after_allocation - _buffer; + ptr_to_the_free_space_after_allocation - _buffer; if (size_of_reserved_space_after_allocation >= MAX_SIZE) { return false; diff --git a/runtime/contrib/heap_trace/src/trace.cc b/runtime/contrib/heap_trace/src/trace.cc index 020aeb9..39a0c46 100644 --- a/runtime/contrib/heap_trace/src/trace.cc +++ b/runtime/contrib/heap_trace/src/trace.cc @@ -72,7 +72,7 @@ void Trace::logAllocationEvent(cl_mem memory_ptr, size_t size_of_allocated_space if (found_memory_space_description == _memory_in_use_on_gpu.end()) { _memory_in_use_on_gpu.insert( - std::make_pair(memory_ptr, MemoryTraits(1, size_of_allocated_space_in_bytes))); + std::make_pair(memory_ptr, MemoryTraits(1, size_of_allocated_space_in_bytes))); _total_allocated_bytes_on_gpu += size_of_allocated_space_in_bytes; if (_peak_heap_usage_on_gpu < _total_allocated_bytes_on_gpu - _total_deallocated_bytes_on_gpu) { diff --git a/runtime/contrib/heap_trace/src/trace.h b/runtime/contrib/heap_trace/src/trace.h index 647c51d..33e67e5 100644 --- a/runtime/contrib/heap_trace/src/trace.h +++ b/runtime/contrib/heap_trace/src/trace.h @@ -31,7 +31,7 @@ class Trace size_t size; MemoryTraits(size_t init_counter_value, size_t size_of_allocated_memory) - : ref_counter(init_counter_value), size(size_of_allocated_memory) + : ref_counter(init_counter_value), size(size_of_allocated_memory) { } }; diff --git a/runtime/contrib/heap_trace/tests/src/cl_release_mem_object_interception_test.cc b/runtime/contrib/heap_trace/tests/src/cl_release_mem_object_interception_test.cc index 49b8fd9..a5700b2 100644 --- a/runtime/contrib/heap_trace/tests/src/cl_release_mem_object_interception_test.cc +++ b/runtime/contrib/heap_trace/tests/src/cl_release_mem_object_interception_test.cc @@ -94,9 +94,9 @@ TEST_F(ClReleaseMemObjectStub, must_log_deallocation_event_only_if_reference_cou clReleaseMemObject(mem); GlobalTrace.reset(); ASSERT_STREQ( - getContentOfFile("./cl_release_mem_object_interception_test.log").c_str(), - "On CPU - Peak heap usage: 0 B, Total allocated: 0 B, Total deallocated: 0 B\nOn " - "GPU - Peak mem usage: 1024 B, Total allocated: 1024 B, Total deallocated: 1024 B\n"); + getContentOfFile("./cl_release_mem_object_interception_test.log").c_str(), + "On CPU - Peak heap usage: 0 B, Total allocated: 0 B, Total deallocated: 0 B\nOn " + "GPU - Peak mem usage: 1024 B, Total allocated: 1024 B, Total deallocated: 1024 B\n"); } TEST_F(ClReleaseMemObjectStub, must_not_log_deallocation_event_if_original_function_failed) diff --git 
a/runtime/contrib/heap_trace/tests/src/malloc_interception_test.cc b/runtime/contrib/heap_trace/tests/src/malloc_interception_test.cc index ea3eb82..182f52c 100644 --- a/runtime/contrib/heap_trace/tests/src/malloc_interception_test.cc +++ b/runtime/contrib/heap_trace/tests/src/malloc_interception_test.cc @@ -87,8 +87,8 @@ TEST_F(MallocStub, should_allocate_memory_from_pool_for_symbol_searcher_internal } TEST_F( - MallocStub, - should_not_influence_on_trace_results_even_if_orignal_function_return_any_not_null_ptr_when_incoming_size_is_zero) + MallocStub, + should_not_influence_on_trace_results_even_if_orignal_function_return_any_not_null_ptr_when_incoming_size_is_zero) { void *p = malloc(0); free(p); diff --git a/runtime/contrib/heap_trace/tests/src/realloc_interception_test.cc b/runtime/contrib/heap_trace/tests/src/realloc_interception_test.cc index 59660fa..e81c5dc 100644 --- a/runtime/contrib/heap_trace/tests/src/realloc_interception_test.cc +++ b/runtime/contrib/heap_trace/tests/src/realloc_interception_test.cc @@ -86,16 +86,16 @@ TEST_F(ReallocStub, should_work_as_malloc_when_incoming_ptr_is_equal_to_nullptr) ASSERT_TRUE(p); ASSERT_STREQ( - getContentOfFile("./realloc_interception_test.log").c_str(), - "On CPU - Peak heap usage: 1024 B, Total allocated: 1024 B, Total deallocated: 0 B\nOn " - "GPU - Peak mem usage: 0 B, Total allocated: 0 B, Total deallocated: 0 B\n"); + getContentOfFile("./realloc_interception_test.log").c_str(), + "On CPU - Peak heap usage: 1024 B, Total allocated: 1024 B, Total deallocated: 0 B\nOn " + "GPU - Peak mem usage: 0 B, Total allocated: 0 B, Total deallocated: 0 B\n"); free(p); } TEST_F( - ReallocStub, - should_not_influence_on_trace_results_even_if_orignal_function_return_any_not_null_ptr_when_incoming_size_is_zero_and_ptr_is_null) + ReallocStub, + should_not_influence_on_trace_results_even_if_orignal_function_return_any_not_null_ptr_when_incoming_size_is_zero_and_ptr_is_null) { void *p = realloc(nullptr, 0); free(p); diff --git a/runtime/contrib/heap_trace/tests/src/symbol_searcher_test.cc b/runtime/contrib/heap_trace/tests/src/symbol_searcher_test.cc index 59fdeed..9ed9331 100644 --- a/runtime/contrib/heap_trace/tests/src/symbol_searcher_test.cc +++ b/runtime/contrib/heap_trace/tests/src/symbol_searcher_test.cc @@ -70,7 +70,7 @@ TEST_F(SymbolSearcher, fs::path pathToTestSample2 = exePath() / "libtest_sample2.so"; void *test_sample2_handle = dlopen(pathToTestSample2.c_str(), RTLD_NOW); void *func_addr_in_test_sample2 = - dlsym(test_sample2_handle, "funcWhichCallFuncDefinedInTestSample3"); + dlsym(test_sample2_handle, "funcWhichCallFuncDefinedInTestSample3"); ASSERT_TRUE(test_sample2_handle); ASSERT_TRUE((void *)funcDefinedInTestSample3_ButWrappedInTestSample1 != diff --git a/runtime/contrib/heap_trace/tests/src/trace_test.cc b/runtime/contrib/heap_trace/tests/src/trace_test.cc index 1cf4c53..4f359bb 100644 --- a/runtime/contrib/heap_trace/tests/src/trace_test.cc +++ b/runtime/contrib/heap_trace/tests/src/trace_test.cc @@ -114,15 +114,15 @@ TEST_F(Trace, should_work_correctly_in_multithreaded_environment) GlobalTrace.reset(); string thisShouldBeInLogFile = - "Total allocated: " + - to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + - " B, Total deallocated: " + - to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + " B\n"; + "Total allocated: " + + to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + + " B, Total deallocated: " + + to_string(numberOfThreads / 
2 * numberOfEmulations * numberOfBytesPerOneEmulation) + " B\n"; string andThisToo = - "Total allocated: " + - to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + - " B, Total deallocated: " + - to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + " B\n"; + "Total allocated: " + + to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + + " B, Total deallocated: " + + to_string(numberOfThreads / 2 * numberOfEmulations * numberOfBytesPerOneEmulation) + " B\n"; ASSERT_TRUE(getContentOfFile("./trace_test.log").find(thisShouldBeInLogFile) != string::npos); ASSERT_TRUE(getContentOfFile("./trace_test.log").find(andThisToo) != string::npos); } diff --git a/runtime/contrib/labs/jniacl/src/jniacl_main.cc b/runtime/contrib/labs/jniacl/src/jniacl_main.cc index 01b9289..1a34aa7 100644 --- a/runtime/contrib/labs/jniacl/src/jniacl_main.cc +++ b/runtime/contrib/labs/jniacl/src/jniacl_main.cc @@ -36,12 +36,13 @@ Java_com_samsung_testaclexec_ActivityMain_RunACLJNI(JNIEnv *env, jobject) TargetHint target_hint = TargetHint::OPENCL; bool autoinc = true; - graph << target_hint << Tensor(TensorInfo(TensorShape(3U, 3U, 1U, 1U), 1, DataType::F32), - std::unique_ptr(new InputAccessor(autoinc))) + graph << target_hint + << Tensor(TensorInfo(TensorShape(3U, 3U, 1U, 1U), 1, DataType::F32), + std::unique_ptr(new InputAccessor(autoinc))) << arm_compute::graph::ConvolutionLayer( - 3U, 3U, 1U, std::unique_ptr(new WeightAccessor(autoinc)), - std::unique_ptr(new BiasAccessor()), - arm_compute::PadStrideInfo(1, 1, 0, 0)) + 3U, 3U, 1U, std::unique_ptr(new WeightAccessor(autoinc)), + std::unique_ptr(new BiasAccessor()), + arm_compute::PadStrideInfo(1, 1, 0, 0)) << Tensor(std::unique_ptr(new OutputAccessor())); graph.run(); diff --git a/runtime/contrib/labs/opencl_test/src/opencl_test.cc b/runtime/contrib/labs/opencl_test/src/opencl_test.cc index 1faa914..6838183 100644 --- a/runtime/contrib/labs/opencl_test/src/opencl_test.cc +++ b/runtime/contrib/labs/opencl_test/src/opencl_test.cc @@ -199,7 +199,7 @@ void checkContextMem() try { auto kernel_functor = cl::KernelFunctor( - gpu.program_, "memory_test"); // name should be same as cl function name + gpu.program_, "memory_test"); // name should be same as cl function name // create a queue per device and queue a kernel job @@ -256,7 +256,7 @@ void printHelp() std::cout << "opencl information: \n\n"; std::cout << "\t -h : help\n"; std::cout - << "\t -g : print if memory map is shared among devices in GPU (in default platform)\n\n"; + << "\t -g : print if memory map is shared among devices in GPU (in default platform)\n\n"; std::cout << "\t -s : test for synchronized work by two devices in a GPU\n\n"; } @@ -270,7 +270,7 @@ void printHelp() int kernel_idx[MAX_DEVICE_NUM]; unsigned char kernel_completed = 0x00; // bit 0 = 1 means kernel by device[0] was completed. 
unsigned char - kernel_completed_flag; // if comparing kernel_completed with this var, all kernels are completed + kernel_completed_flag; // if comparing kernel_completed with this var, all kernels are completed int device_num; std::mutex kernel_complete_handler_mutex; @@ -319,7 +319,7 @@ void testSync() try { auto kernel_functor = cl::KernelFunctor( - gpu.program_, "test"); // name should be same as cl function name + gpu.program_, "test"); // name should be same as cl function name // variable init cl::Event ev[MAX_DEVICE_NUM]; diff --git a/runtime/contrib/labs/tflite_examples/src/conv.cpp b/runtime/contrib/labs/tflite_examples/src/conv.cpp index e8542c3..0b5f946 100644 --- a/runtime/contrib/labs/tflite_examples/src/conv.cpp +++ b/runtime/contrib/labs/tflite_examples/src/conv.cpp @@ -217,7 +217,7 @@ int main(int argc, char **argv) // Configure Filter const uint32_t kernel_size = KER_N * KER_C * KER_H * KER_W; float kernel_data[kernel_size] = { - 0.0f, + 0.0f, }; // Fill kernel data in NHWC order @@ -243,13 +243,13 @@ int main(int argc, char **argv) } interp.SetTensorParametersReadOnly( - 2, kTfLiteFloat32 /* type */, "filter" /* name */, {KER_N, KER_H, KER_W, KER_C} /* dims */, - quantization, reinterpret_cast(kernel_data), sizeof(kernel_data)); + 2, kTfLiteFloat32 /* type */, "filter" /* name */, {KER_N, KER_H, KER_W, KER_C} /* dims */, + quantization, reinterpret_cast(kernel_data), sizeof(kernel_data)); // Configure Bias const uint32_t bias_size = bias.size(); float bias_data[bias_size] = { - 0.0f, + 0.0f, }; // Fill bias data diff --git a/runtime/contrib/style_transfer_app/src/bitmap_helper.cc b/runtime/contrib/style_transfer_app/src/bitmap_helper.cc index 6211ea4..0f687b2 100644 --- a/runtime/contrib/style_transfer_app/src/bitmap_helper.cc +++ b/runtime/contrib/style_transfer_app/src/bitmap_helper.cc @@ -49,10 +49,10 @@ unsigned char *BitmapHelper::createBitmapFileHeader(int height, int width, int p int fileSize = fileHeaderSize + infoHeaderSize + (bytesPerPixel * width + paddingSize) * height; static unsigned char fileHeader[] = { - 0, 0, /// signature - 0, 0, 0, 0, /// image file size in bytes - 0, 0, 0, 0, /// reserved - 0, 0, 0, 0, /// start of pixel array + 0, 0, /// signature + 0, 0, 0, 0, /// image file size in bytes + 0, 0, 0, 0, /// reserved + 0, 0, 0, 0, /// start of pixel array }; fileHeader[0] = (unsigned char)('B'); @@ -69,17 +69,17 @@ unsigned char *BitmapHelper::createBitmapFileHeader(int height, int width, int p unsigned char *BitmapHelper::createBitmapInfoHeader(int height, int width) { static unsigned char infoHeader[] = { - 0, 0, 0, 0, /// header size - 0, 0, 0, 0, /// image width - 0, 0, 0, 0, /// image height - 0, 0, /// number of color planes - 0, 0, /// bits per pixel - 0, 0, 0, 0, /// compression - 0, 0, 0, 0, /// image size - 0, 0, 0, 0, /// horizontal resolution - 0, 0, 0, 0, /// vertical resolution - 0, 0, 0, 0, /// colors in color table - 0, 0, 0, 0, /// important color count + 0, 0, 0, 0, /// header size + 0, 0, 0, 0, /// image width + 0, 0, 0, 0, /// image height + 0, 0, /// number of color planes + 0, 0, /// bits per pixel + 0, 0, 0, 0, /// compression + 0, 0, 0, 0, /// image size + 0, 0, 0, 0, /// horizontal resolution + 0, 0, 0, 0, /// vertical resolution + 0, 0, 0, 0, /// colors in color table + 0, 0, 0, 0, /// important color count }; // Minus height means top to bottom write @@ -191,7 +191,7 @@ int BitmapHelper::read_bmp(const std::string &input_bmp_name, std::vector // Decode image, allocating tensor once the image size is known const uint8_t 
*bmp_pixels = &img_bytes[header_size]; std::vector bmp = - decode_bmp(bmp_pixels, row_size, width, abs(height), channels, top_down); + decode_bmp(bmp_pixels, row_size, width, abs(height), channels, top_down); for (uint32_t j = 0; j < bmp.size(); j++) { input.push_back(static_cast(bmp[j])); diff --git a/runtime/contrib/style_transfer_app/src/jpeg_helper.cc b/runtime/contrib/style_transfer_app/src/jpeg_helper.cc index ed5ae25..1554524 100644 --- a/runtime/contrib/style_transfer_app/src/jpeg_helper.cc +++ b/runtime/contrib/style_transfer_app/src/jpeg_helper.cc @@ -26,7 +26,7 @@ namespace StyleTransferApp { JpegHelper::JpegHelper(int bytes_per_pixel, J_COLOR_SPACE color_space) - : _bytes_per_pixel(bytes_per_pixel), _color_space(color_space) + : _bytes_per_pixel(bytes_per_pixel), _color_space(color_space) { // DO NOTHING } diff --git a/runtime/contrib/style_transfer_app/src/style_transfer_app.cc b/runtime/contrib/style_transfer_app/src/style_transfer_app.cc index eed0c42..ab8735d 100644 --- a/runtime/contrib/style_transfer_app/src/style_transfer_app.cc +++ b/runtime/contrib/style_transfer_app/src/style_transfer_app.cc @@ -68,10 +68,10 @@ uint64_t num_elems(const nnfw_tensorinfo *ti) NNFW_STATUS resolve_op_backend(nnfw_session *session) { static std::unordered_map operation_map = { - {"TRANSPOSE_CONV", "OP_BACKEND_TransposeConv"}, {"CONV_2D", "OP_BACKEND_Conv2D"}, - {"DEPTHWISE_CONV_2D", "OP_BACKEND_DepthwiseConv2D"}, {"MEAN", "OP_BACKEND_Mean"}, - {"AVERAGE_POOL_2D", "OP_BACKEND_AvgPool2D"}, {"MAX_POOL_2D", "OP_BACKEND_MaxPool2D"}, - {"INSTANCE_NORM", "OP_BACKEND_InstanceNorm"}, {"ADD", "OP_BACKEND_Add"}}; + {"TRANSPOSE_CONV", "OP_BACKEND_TransposeConv"}, {"CONV_2D", "OP_BACKEND_Conv2D"}, + {"DEPTHWISE_CONV_2D", "OP_BACKEND_DepthwiseConv2D"}, {"MEAN", "OP_BACKEND_Mean"}, + {"AVERAGE_POOL_2D", "OP_BACKEND_AvgPool2D"}, {"MAX_POOL_2D", "OP_BACKEND_MaxPool2D"}, + {"INSTANCE_NORM", "OP_BACKEND_InstanceNorm"}, {"ADD", "OP_BACKEND_Add"}}; for (auto i : operation_map) { diff --git a/runtime/contrib/tflite_classify/src/ImageClassifier.cc b/runtime/contrib/tflite_classify/src/ImageClassifier.cc index fae4f06..1d92d6c 100644 --- a/runtime/contrib/tflite_classify/src/ImageClassifier.cc +++ b/runtime/contrib/tflite_classify/src/ImageClassifier.cc @@ -24,9 +24,9 @@ ImageClassifier::ImageClassifier(const std::string &model_file, const std::strin const int input_size, const int image_mean, const int image_std, const std::string &input_name, const std::string &output_name, const bool use_nnapi) - : _inference(new InferenceInterface(model_file, use_nnapi)), _input_size(input_size), - _image_mean(image_mean), _image_std(image_std), _input_name(input_name), - _output_name(output_name) + : _inference(new InferenceInterface(model_file, use_nnapi)), _input_size(input_size), + _image_mean(image_mean), _image_std(image_std), _input_name(input_name), + _output_name(output_name) { // Load label std::ifstream label_stream(label_file.c_str()); diff --git a/runtime/contrib/tflite_classify/src/InferenceInterface.cc b/runtime/contrib/tflite_classify/src/InferenceInterface.cc index 1609434..562ff2a 100644 --- a/runtime/contrib/tflite_classify/src/InferenceInterface.cc +++ b/runtime/contrib/tflite_classify/src/InferenceInterface.cc @@ -20,7 +20,7 @@ using namespace tflite; using namespace tflite::ops::builtin; InferenceInterface::InferenceInterface(const std::string &model_file, const bool use_nnapi) - : _interpreter(nullptr), _model(nullptr), _sess(nullptr) + : _interpreter(nullptr), _model(nullptr), _sess(nullptr) { // Load 
model StderrReporter error_reporter; diff --git a/runtime/contrib/tflite_classify/src/tflite_classify.cc b/runtime/contrib/tflite_classify/src/tflite_classify.cc index 51758e2..7bed778 100644 --- a/runtime/contrib/tflite_classify/src/tflite_classify.cc +++ b/runtime/contrib/tflite_classify/src/tflite_classify.cc @@ -60,9 +60,8 @@ int main(const int argc, char **argv) } // Create ImageClassifier - std::unique_ptr classifier( - new ImageClassifier(MODEL_FILE, LABEL_FILE, INPUT_SIZE, IMAGE_MEAN, IMAGE_STD, INPUT_NAME, - OUTPUT_NAME, use_nnapi)); + std::unique_ptr classifier(new ImageClassifier( + MODEL_FILE, LABEL_FILE, INPUT_SIZE, IMAGE_MEAN, IMAGE_STD, INPUT_NAME, OUTPUT_NAME, use_nnapi)); // Cam setting cv::VideoCapture cap(0); diff --git a/runtime/libs/.clang-format b/runtime/libs/.clang-format new file mode 120000 index 0000000..f761fe4 --- /dev/null +++ b/runtime/libs/.clang-format @@ -0,0 +1 @@ +../../.clang-format.8 \ No newline at end of file diff --git a/runtime/libs/benchmark/src/CsvWriter.cpp b/runtime/libs/benchmark/src/CsvWriter.cpp index 5f47c65..6233129 100644 --- a/runtime/libs/benchmark/src/CsvWriter.cpp +++ b/runtime/libs/benchmark/src/CsvWriter.cpp @@ -35,7 +35,7 @@ CsvWriter::CsvWriter(const std::string &csv_filename) : CsvWriter(csv_filename, } CsvWriter::CsvWriter(const std::string &csv_filename, const std::vector &header) - : _ofs(csv_filename), _header_size(header.size()), _col_idx(0), _row_idx(0) + : _ofs(csv_filename), _header_size(header.size()), _col_idx(0), _row_idx(0) { assert(csv_filename.empty() == false); assert(header.size() != 0); diff --git a/runtime/libs/benchmark/src/MemoryPoller.cpp b/runtime/libs/benchmark/src/MemoryPoller.cpp index 050b5b1..2f3c855 100644 --- a/runtime/libs/benchmark/src/MemoryPoller.cpp +++ b/runtime/libs/benchmark/src/MemoryPoller.cpp @@ -27,7 +27,7 @@ namespace benchmark { MemoryPoller::MemoryPoller(std::chrono::milliseconds duration, bool gpu_poll) - : _duration(duration), _run(false), _term(false), _gpu_poll(gpu_poll) + : _duration(duration), _run(false), _term(false), _gpu_poll(gpu_poll) { if (prepareMemoryPolling() == false) throw std::runtime_error("failed to prepare memory pooling"); diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp index e6cafb9..0356687 100644 --- a/runtime/libs/benchmark/src/Result.cpp +++ b/runtime/libs/benchmark/src/Result.cpp @@ -77,9 +77,9 @@ uint32_t averageMemoryKb(const benchmark::Phase &phase, int type) return average(phase.memory[type]); } -uint32_t peakMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE] - [benchmark::MemoryType::END_OF_MEM_TYPE], - int type) +uint32_t peakMemory( + const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE][benchmark::MemoryType::END_OF_MEM_TYPE], + int type) { using namespace benchmark; // tricky. 
handle WARMUP as EXECUTE @@ -88,7 +88,7 @@ uint32_t peakMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE] } void printResultTime( - const double time[benchmark::PhaseEnum::END_OF_PHASE][benchmark::FigureType::END_OF_FIG_TYPE]) + const double time[benchmark::PhaseEnum::END_OF_PHASE][benchmark::FigureType::END_OF_FIG_TYPE]) { using namespace benchmark; @@ -119,8 +119,8 @@ void printResultTime( std::cout << "===================================" << std::endl; } -void printResultMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE] - [benchmark::MemoryType::END_OF_MEM_TYPE]) +void printResultMemory( + const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE][benchmark::MemoryType::END_OF_MEM_TYPE]) { using namespace benchmark; diff --git a/runtime/libs/misc/include/misc/feature/Index.h b/runtime/libs/misc/include/misc/feature/Index.h index a361d8d..09d65a5 100644 --- a/runtime/libs/misc/include/misc/feature/Index.h +++ b/runtime/libs/misc/include/misc/feature/Index.h @@ -62,7 +62,7 @@ public: * @param[in] col The width index */ Index(int32_t batch, int32_t ch, int32_t row, int32_t col) - : _batch{batch}, _ch{ch}, _row{row}, _col{col} + : _batch{batch}, _ch{ch}, _row{row}, _col{col} { // DO NOTHING } diff --git a/runtime/libs/misc/include/misc/feature/Shape.h b/runtime/libs/misc/include/misc/feature/Shape.h index 09881f5..2c31b45 100644 --- a/runtime/libs/misc/include/misc/feature/Shape.h +++ b/runtime/libs/misc/include/misc/feature/Shape.h @@ -64,7 +64,7 @@ struct Shape * @param[in] width The width value */ Shape(int32_t batch, int32_t depth, int32_t height, int32_t width) - : N{batch}, C{depth}, H{height}, W{width} + : N{batch}, C{depth}, H{height}, W{width} { // DO NOTHING } diff --git a/runtime/libs/misc/include/misc/kernel/Shape.h b/runtime/libs/misc/include/misc/kernel/Shape.h index 27d6a8b..176db0a 100644 --- a/runtime/libs/misc/include/misc/kernel/Shape.h +++ b/runtime/libs/misc/include/misc/kernel/Shape.h @@ -55,7 +55,7 @@ struct Shape * @param[in] width The width index */ Shape(int32_t count, int32_t depth, int32_t height, int32_t width) - : N{count}, C{depth}, H{height}, W{width} + : N{count}, C{depth}, H{height}, W{width} { // DO NOTHING } diff --git a/runtime/libs/misc/include/misc/tensor/Object.h b/runtime/libs/misc/include/misc/tensor/Object.h index cba4f1b..15ad6da 100644 --- a/runtime/libs/misc/include/misc/tensor/Object.h +++ b/runtime/libs/misc/include/misc/tensor/Object.h @@ -74,9 +74,8 @@ public: _values.resize(_shape.dim(0) * _stride.at(0)); // Set 'value' - iterate(_shape) << [this, &fn](const Index &index) { - _values.at(_stride.offset(index)) = fn(_shape, index); - }; + iterate(_shape) << + [this, &fn](const Index &index) { _values.at(_stride.offset(index)) = fn(_shape, index); }; } } diff --git a/runtime/libs/misc/include/misc/tensor/Zipper.h b/runtime/libs/misc/include/misc/tensor/Zipper.h index 8f0ec4a..b1ca3d0 100644 --- a/runtime/libs/misc/include/misc/tensor/Zipper.h +++ b/runtime/libs/misc/include/misc/tensor/Zipper.h @@ -48,7 +48,7 @@ public: * @param[in] rhs @c Reader object of a tensor */ Zipper(const Shape &shape, const Reader &lhs, const Reader &rhs) - : _shape{shape}, _lhs{lhs}, _rhs{rhs} + : _shape{shape}, _lhs{lhs}, _rhs{rhs} { // DO NOTHING } @@ -63,7 +63,7 @@ public: template void zip(Callable cb) const { iterate(_shape) << - [this, &cb](const Index &index) { cb(index, _lhs.at(index), _rhs.at(index)); }; + [this, &cb](const Index &index) { cb(index, _lhs.at(index), _rhs.at(index)); }; } private: diff --git 
a/runtime/libs/misc/src/tensor/Comparator.cpp b/runtime/libs/misc/src/tensor/Comparator.cpp index 80a18c1..5fcf38c 100644 --- a/runtime/libs/misc/src/tensor/Comparator.cpp +++ b/runtime/libs/misc/src/tensor/Comparator.cpp @@ -33,18 +33,18 @@ std::vector> Comparator::compare(const Shape &shape, const Reader> res; zip(shape, expected, obtained) << - [&](const Index &index, float expected_value, float obtained_value) { - if (!_compare_fn(expected_value, obtained_value)) - { - res.emplace_back(index, expected_value, obtained_value); - } - - // Update max_diff_index, if necessary - if (observer != nullptr) - { - observer->notify(index, expected_value, obtained_value); - } - }; + [&](const Index &index, float expected_value, float obtained_value) { + if (!_compare_fn(expected_value, obtained_value)) + { + res.emplace_back(index, expected_value, obtained_value); + } + + // Update max_diff_index, if necessary + if (observer != nullptr) + { + observer->notify(index, expected_value, obtained_value); + } + }; return res; } diff --git a/runtime/libs/nnapi/include/NeuralNetworksShim.h b/runtime/libs/nnapi/include/NeuralNetworksShim.h index 9cf52aa..2e8ccdb 100644 --- a/runtime/libs/nnapi/include/NeuralNetworksShim.h +++ b/runtime/libs/nnapi/include/NeuralNetworksShim.h @@ -225,8 +225,8 @@ inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel *model, int * @return ANEURALNETWORKS_NO_ERROR if successful. */ inline int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams( - ANeuralNetworksModel *model, int32_t index, - const ANeuralNetworksSymmPerChannelQuantParams *channelQuant) + ANeuralNetworksModel *model, int32_t index, + const ANeuralNetworksSymmPerChannelQuantParams *channelQuant) { LOAD_FUNCTION(ANeuralNetworksModel_setOperandSymmPerChannelQuantParams); EXECUTE_FUNCTION_RETURN(model, index, channelQuant); @@ -1218,7 +1218,7 @@ inline int ANeuralNetworksModel_setOperandExtensionData(ANeuralNetworksModel *mo LOAD_FUNCTION(ANeuralNetworksModel_setOperandExtensionData); EXECUTE_FUNCTION_RETURN(model, index, data, length); } - +#if __ANDROID_API__ >= 30 /** * Create a {@link ANeuralNetworksMemoryDesc} with no properties. 
* @@ -1548,7 +1548,7 @@ inline int ANeuralNetworksMemory_copy(const ANeuralNetworksMemory *src, LOAD_FUNCTION(ANeuralNetworksMemory_copy); EXECUTE_FUNCTION_RETURN(src, dst); } - +#endif // __ANDROID_API__ >= 30 /**/ #endif // __NEURAL_NETWORKS_SHIM_H__ diff --git a/runtime/libs/nnapi/include/NeuralNetworksTypes.h b/runtime/libs/nnapi/include/NeuralNetworksTypes.h index 2e05687..35c7a58 100644 --- a/runtime/libs/nnapi/include/NeuralNetworksTypes.h +++ b/runtime/libs/nnapi/include/NeuralNetworksTypes.h @@ -56,12 +56,12 @@ typedef int (*ANeuralNetworksModel_setOperandValue_fn)(ANeuralNetworksModel *mod const void *buffer, size_t length); typedef int (*ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_fn)( - ANeuralNetworksModel *model, int32_t index, - const ANeuralNetworksSymmPerChannelQuantParams *channelQuant); + ANeuralNetworksModel *model, int32_t index, + const ANeuralNetworksSymmPerChannelQuantParams *channelQuant); typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)( - ANeuralNetworksModel *model, int32_t index, const ANeuralNetworksMemory *memory, size_t offset, - size_t length); + ANeuralNetworksModel *model, int32_t index, const ANeuralNetworksMemory *memory, size_t offset, + size_t length); typedef int (*ANeuralNetworksModel_addOperation_fn)(ANeuralNetworksModel *model, ANeuralNetworksOperationType type, @@ -88,8 +88,8 @@ typedef int (*ANeuralNetworksExecution_setInput_fn)(ANeuralNetworksExecution *ex const void *buffer, size_t length); typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)( - ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, - const ANeuralNetworksMemory *memory, size_t offset, size_t length); + ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, + const ANeuralNetworksMemory *memory, size_t offset, size_t length); typedef int (*ANeuralNetworksExecution_setOutput_fn)(ANeuralNetworksExecution *execution, int32_t index, @@ -97,8 +97,8 @@ typedef int (*ANeuralNetworksExecution_setOutput_fn)(ANeuralNetworksExecution *e void *buffer, size_t length); typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)( - ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, - const ANeuralNetworksMemory *memory, size_t offset, size_t length); + ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, + const ANeuralNetworksMemory *memory, size_t offset, size_t length); typedef int (*ANeuralNetworksExecution_startCompute_fn)(ANeuralNetworksExecution *execution, ANeuralNetworksEvent **event); @@ -125,35 +125,39 @@ typedef int (*ANeuralNetworksDevice_getFeatureLevel_fn)(const ANeuralNetworksDev int64_t *featureLevel); typedef int (*ANeuralNetworksModel_getSupportedOperationsForDevices_fn)( - const ANeuralNetworksModel *model, const ANeuralNetworksDevice *const *devices, - uint32_t numDevices, bool *supportedOps); + const ANeuralNetworksModel *model, const ANeuralNetworksDevice *const *devices, + uint32_t numDevices, bool *supportedOps); typedef int (*ANeuralNetworksCompilation_createForDevices_fn)( - ANeuralNetworksModel *model, const ANeuralNetworksDevice *const *devices, uint32_t numDevices, - ANeuralNetworksCompilation **compilation); + ANeuralNetworksModel *model, const ANeuralNetworksDevice *const *devices, uint32_t numDevices, + ANeuralNetworksCompilation **compilation); typedef int (*ANeuralNetworksCompilation_setCaching_fn)(ANeuralNetworksCompilation *compilation, const char *cacheDir, const 
uint8_t *token); +#if __ANDROID_API__ >= 30 typedef int (*ANeuralNetworksCompilation_setTimeout_fn)(ANeuralNetworksCompilation *compilation, uint64_t duration); typedef int (*ANeuralNetworksCompilation_setPriority_fn)(ANeuralNetworksCompilation *compilation, int priority); +#endif // __ANDROID_API__ >= 30 typedef int (*ANeuralNetworksExecution_compute_fn)(ANeuralNetworksExecution *execution); +#if __ANDROID_API__ >= 30 typedef int (*ANeuralNetworksExecution_setTimeout_fn)(ANeuralNetworksExecution *execution, uint64_t duration); typedef int (*ANeuralNetworksExecution_setLoopTimeout_fn)(ANeuralNetworksExecution *execution, uint64_t duration); +#endif // __ANDROID_API__ >= 30 typedef int (*ANeuralNetworksExecution_getOutputOperandRank_fn)(ANeuralNetworksExecution *execution, int32_t index, uint32_t *rank); typedef int (*ANeuralNetworksExecution_getOutputOperandDimensions_fn)( - ANeuralNetworksExecution *execution, int32_t index, uint32_t *dimensions); + ANeuralNetworksExecution *execution, int32_t index, uint32_t *dimensions); typedef int (*ANeuralNetworksBurst_create_fn)(ANeuralNetworksCompilation *compilation, ANeuralNetworksBurst **burst); @@ -182,24 +186,25 @@ typedef int (*ANeuralNetworksModel_getExtensionOperandType_fn)(ANeuralNetworksMo int32_t *type); typedef int (*ANeuralNetworksModel_getExtensionOperationType_fn)( - ANeuralNetworksModel *model, const char *extensionName, uint16_t operationCodeWithinExtension, - ANeuralNetworksOperationType *type); + ANeuralNetworksModel *model, const char *extensionName, uint16_t operationCodeWithinExtension, + ANeuralNetworksOperationType *type); typedef int (*ANeuralNetworksModel_setOperandExtensionData_fn)(ANeuralNetworksModel *model, int32_t index, const void *data, size_t length); +#if __ANDROID_API__ >= 30 typedef int (*ANeuralNetworksMemoryDesc_create_fn)(ANeuralNetworksMemoryDesc **desc); typedef void (*ANeuralNetworksMemoryDesc_free_fn)(ANeuralNetworksMemoryDesc *desc); typedef int (*ANeuralNetworksMemoryDesc_addInputRole_fn)( - ANeuralNetworksMemoryDesc *desc, const ANeuralNetworksCompilation *compilation, int32_t index, - float frequency); + ANeuralNetworksMemoryDesc *desc, const ANeuralNetworksCompilation *compilation, int32_t index, + float frequency); typedef int (*ANeuralNetworksMemoryDesc_addOutputRole_fn)( - ANeuralNetworksMemoryDesc *desc, const ANeuralNetworksCompilation *compilation, uint32_t index, - float frequency); + ANeuralNetworksMemoryDesc *desc, const ANeuralNetworksCompilation *compilation, uint32_t index, + float frequency); typedef int (*ANeuralNetworksMemoryDesc_setDimensions_fn)(ANeuralNetworksMemoryDesc *desc, uint32_t rank, @@ -212,5 +217,5 @@ typedef int (*ANeuralNetworksMemory_createFromDesc_fn)(const ANeuralNetworksMemo typedef int (*ANeuralNetworksMemory_copy_fn)(const ANeuralNetworksMemory *src, const ANeuralNetworksMemory *dst); - +#endif // __ANDROID_API__ >= 30 #endif // __NEURAL_NETWORKS_TYPES_H__ diff --git a/runtime/libs/rua/dyn/src/DynamicBinder.cpp b/runtime/libs/rua/dyn/src/DynamicBinder.cpp index fa3f0bb..f49892d 100644 --- a/runtime/libs/rua/dyn/src/DynamicBinder.cpp +++ b/runtime/libs/rua/dyn/src/DynamicBinder.cpp @@ -97,8 +97,8 @@ typedef int (*ANeuralNetworksModel_setOperandValue_fn)(ANeuralNetworksModel *mod const void *buffer, size_t length); typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)( - ANeuralNetworksModel *model, int32_t index, const ANeuralNetworksMemory *memory, size_t offset, - size_t length); + ANeuralNetworksModel *model, int32_t index, const ANeuralNetworksMemory 
*memory, size_t offset, + size_t length); typedef int (*ANeuralNetworksModel_addOperation_fn)(ANeuralNetworksModel *model, ANeuralNetworksOperationType type, @@ -242,8 +242,8 @@ typedef int (*ANeuralNetworksExecution_setInput_fn)(ANeuralNetworksExecution *ex const void *buffer, size_t length); typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)( - ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, - const ANeuralNetworksMemory *memory, size_t offset, size_t length); + ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, + const ANeuralNetworksMemory *memory, size_t offset, size_t length); typedef int (*ANeuralNetworksExecution_setOutput_fn)(ANeuralNetworksExecution *execution, int32_t index, @@ -251,8 +251,8 @@ typedef int (*ANeuralNetworksExecution_setOutput_fn)(ANeuralNetworksExecution *e void *buffer, size_t length); typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)( - ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, - const ANeuralNetworksMemory *memory, size_t offset, size_t length); + ANeuralNetworksExecution *execution, int32_t index, const ANeuralNetworksOperandType *type, + const ANeuralNetworksMemory *memory, size_t offset, size_t length); typedef int (*ANeuralNetworksExecution_startCompute_fn)(ANeuralNetworksExecution *execution, ANeuralNetworksEvent **event); diff --git a/runtime/libs/tflite/include/tflite/Diff.h b/runtime/libs/tflite/include/tflite/Diff.h index fdc1a31..1c35b34 100644 --- a/runtime/libs/tflite/include/tflite/Diff.h +++ b/runtime/libs/tflite/include/tflite/Diff.h @@ -47,7 +47,7 @@ public: * @param[in] comparator Comparator object for tensor comparation */ TfLiteInterpMatchApp(const nnfw::misc::tensor::Comparator &comparator) - : _verbose{false}, _comparator(comparator) + : _verbose{false}, _comparator(comparator) { // DO NOTHING } diff --git a/runtime/libs/tflite/include/tflite/RandomTestRunner.h b/runtime/libs/tflite/include/tflite/RandomTestRunner.h index c0b304c..abbf3b2 100644 --- a/runtime/libs/tflite/include/tflite/RandomTestRunner.h +++ b/runtime/libs/tflite/include/tflite/RandomTestRunner.h @@ -55,7 +55,7 @@ public: * @param[in] quantization TfLiteQuantizationParams type to represent quantization value */ RandomTestRunner(uint32_t seed, const RandomTestParam ¶m) - : _randgen{seed, 0.0f, 2.0f}, _param{param} + : _randgen{seed, 0.0f, 2.0f}, _param{param} { // DO NOTHING } diff --git a/runtime/libs/tflite/include/tflite/TensorLogger.h b/runtime/libs/tflite/include/tflite/TensorLogger.h index a824c34..0837dfc 100644 --- a/runtime/libs/tflite/include/tflite/TensorLogger.h +++ b/runtime/libs/tflite/include/tflite/TensorLogger.h @@ -107,9 +107,8 @@ private: const TfLiteTensor *tensor = interp.tensor(id); _outfile << "# tensor name: " << tensor->name << std::endl; - _outfile << "# tflite::interpreter.tensor(" << id << ") -> " - "tensor_value_gen[" - << log_index << "]" << std::endl; + _outfile << "# tflite::interpreter.tensor(" << id << ") -> tensor_value_gen[" << log_index + << "]" << std::endl; if (tensor->type == kTfLiteInt32) { diff --git a/runtime/libs/tflite/src/Diff.cpp b/runtime/libs/tflite/src/Diff.cpp index 39f9943..2d2b66e 100644 --- a/runtime/libs/tflite/src/Diff.cpp +++ b/runtime/libs/tflite/src/Diff.cpp @@ -29,9 +29,9 @@ class DiffSummary : public nnfw::misc::tensor::Comparator::Observer { public: DiffSummary() - : max_abs_diff_index(0), max_abs_diff_expected{0.0f}, max_abs_diff_obtained{0.0f}, - 
max_abs_diff_value{0.0f}, max_rel_diff_index(0), max_rel_diff_expected{0.0f}, - max_rel_diff_obtained{0.0f}, max_rel_diff_value{0.0f} + : max_abs_diff_index(0), max_abs_diff_expected{0.0f}, max_abs_diff_obtained{0.0f}, + max_abs_diff_value{0.0f}, max_rel_diff_index(0), max_rel_diff_expected{0.0f}, + max_rel_diff_obtained{0.0f}, max_rel_diff_value{0.0f} { // DO NOTHING } @@ -86,12 +86,12 @@ bool TfLiteInterpMatchApp::compareSingleTensorView(const nnfw::tflite::TensorVie using nnfw::misc::tensor::zip; zip(expected.shape(), expected, obtained) - << [&](const Index &index, T expected_value, T obtained_value) { - if (expected_value != obtained_value) - { - diffs.emplace_back(index, expected_value, obtained_value); - } - }; + << [&](const Index &index, T expected_value, T obtained_value) { + if (expected_value != obtained_value) + { + diffs.emplace_back(index, expected_value, obtained_value); + } + }; // TODO Unify summary generation code if (diffs.size() == 0) @@ -121,8 +121,8 @@ bool TfLiteInterpMatchApp::compareSingleTensorView(const nnfw::tflite::TensorVie template <> bool TfLiteInterpMatchApp::compareSingleTensorView( - const nnfw::tflite::TensorView &expected, - const nnfw::tflite::TensorView &obtained, int id) const + const nnfw::tflite::TensorView &expected, const nnfw::tflite::TensorView &obtained, + int id) const { DiffSummary summary; diff --git a/runtime/libs/tflite/src/RandomTestRunner.cpp b/runtime/libs/tflite/src/RandomTestRunner.cpp index f7fccbf..3fa9a97 100644 --- a/runtime/libs/tflite/src/RandomTestRunner.cpp +++ b/runtime/libs/tflite/src/RandomTestRunner.cpp @@ -68,12 +68,12 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) int32_t value = 0; nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - // TODO Generate random values - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - ++value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + // TODO Generate random values + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + ++value; + }; }; // Generate singed 32-bit integer (s32) input @@ -89,11 +89,11 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) int32_t value = 0; nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - // TODO Generate random values - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + // TODO Generate random values + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; initializers[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -106,19 +106,19 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; 
reseters[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -131,8 +131,8 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); @@ -140,10 +140,10 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) uint8_t value = 0; nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; initializers[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -156,20 +156,20 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; reseters[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -182,8 +182,8 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); @@ -192,10 +192,10 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) float value = 0; nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; initializers[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -208,20 +208,20 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); + << [&](const 
nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; reseters[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { @@ -234,8 +234,8 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) assert(tfl_interp_view.shape() == nnapi_view.shape()); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); @@ -244,10 +244,10 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder) bool value = false; nnfw::misc::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - tfl_interp_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; }; // Fill IFM with random numbers diff --git a/runtime/onert/api/.clang-format b/runtime/onert/api/.clang-format new file mode 120000 index 0000000..83185fe --- /dev/null +++ b/runtime/onert/api/.clang-format @@ -0,0 +1 @@ +../../../.clang-format.8 \ No newline at end of file diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h index 76380b4..6eb7e6b 100644 --- a/runtime/onert/api/include/nnfw.h +++ b/runtime/onert/api/include/nnfw.h @@ -64,7 +64,8 @@ typedef struct nnfw_session nnfw_session; * * The type of tensor represented in {@link nnfw_tensorinfo} */ -typedef enum { +typedef enum +{ /** A tensor of 32 bit floating point */ NNFW_TYPE_TENSOR_FLOAT32 = 0, /** A tensor of 32 bit signed integer */ @@ -96,7 +97,8 @@ typedef enum { /** * @brief Result values returned from a call to an API function */ -typedef enum { +typedef enum +{ /** Successful */ NNFW_STATUS_NO_ERROR = 0, /** @@ -117,7 +119,8 @@ typedef enum { /** * @brief Data format of a tensor */ -typedef enum { +typedef enum +{ /** Don't care layout */ NNFW_LAYOUT_NONE = 0, /** @@ -135,7 +138,8 @@ typedef enum { /** * @brief Information ID for retrieving information on nnfw (e.g. version) */ -typedef enum { +typedef enum +{ /** nnfw runtime version * Its value is uint32 in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch. */ diff --git a/runtime/onert/api/include/nnfw_internal.h b/runtime/onert/api/include/nnfw_internal.h index eb4b6d6..a88e324 100644 --- a/runtime/onert/api/include/nnfw_internal.h +++ b/runtime/onert/api/include/nnfw_internal.h @@ -35,4 +35,13 @@ NNFW_STATUS nnfw_get_config(nnfw_session *session, const char *key, char *value, */ NNFW_STATUS nnfw_load_circle_from_buffer(nnfw_session *session, uint8_t *buffer, size_t size); +/** + * @brief Load a tflite/circle model from file. + * + * @param[in] session session + * @param[in] file_path Path to model file. 
Model type (tflite/circle) is decided by the file extension + * @return NNFW_STATUS + */ +NNFW_STATUS nnfw_load_model_from_modelfile(nnfw_session *session, const char *file_path); + #endif // __NNFW_INTERNAL_H__ diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h index 31c3890..28703c0 100644 --- a/runtime/onert/api/include/nnfw_version.h +++ b/runtime/onert/api/include/nnfw_version.h @@ -21,6 +21,6 @@ * NNFW_VERSION is a uint32 value representing nnfw runtime version * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch */ -#define NNFW_VERSION 0x01000b01 +#define NNFW_VERSION 0x01000c00 #endif // __NNFW_VERSION_H__ diff --git a/runtime/onert/api/src/CustomKernel.cc b/runtime/onert/api/src/CustomKernel.cc index 3f3a5d8..56525fe 100644 --- a/runtime/onert/api/src/CustomKernel.cc +++ b/runtime/onert/api/src/CustomKernel.cc @@ -65,7 +65,7 @@ public: }; Kernel::Kernel(const nnfw_custom_eval evalFunction) - : _in_params(), _userdata(nullptr), _userdata_size(0), _evalFunction(evalFunction) + : _in_params(), _userdata(nullptr), _userdata_size(0), _evalFunction(evalFunction) { } diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc index 835b207..4eba4ec 100644 --- a/runtime/onert/api/src/nnfw_api.cc +++ b/runtime/onert/api/src/nnfw_api.cc @@ -90,7 +90,7 @@ NNFW_STATUS nnfw_close_session(nnfw_session *session) NNFW_STATUS nnfw_load_model_from_file(nnfw_session *session, const char *pacakge_file_path) { NNFW_RETURN_ERROR_IF_NULL(session); - return session->load_model_from_file(pacakge_file_path); + return session->load_model_from_nnpackage(pacakge_file_path); } /* @@ -350,6 +350,12 @@ NNFW_STATUS nnfw_load_circle_from_buffer(nnfw_session *session, uint8_t *buffer, return session->load_circle_from_buffer(buffer, size); } +NNFW_STATUS nnfw_load_model_from_modelfile(nnfw_session *session, const char *file_path) +{ + NNFW_RETURN_ERROR_IF_NULL(session); + return session->load_model_from_modelfile(file_path); +} + NNFW_STATUS nnfw_input_tensorindex(nnfw_session *session, const char *tensorname, uint32_t *index) { NNFW_RETURN_ERROR_IF_NULL(session); diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index a4c69eb..c3fdb13 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -19,17 +19,19 @@ #include "compiler/Compiler.h" #include "util/ConfigSource.h" #include "util/Exceptions.h" +#include "util/logging.h" #include "exec/Execution.h" #include "circle_loader.h" #include "tflite_loader.h" #include "json/json.h" #include "ir/OpCode.h" +#include "util/TracingCtx.h" + #include #include #include #include #include -#include #include /* @@ -40,8 +42,11 @@ #define MAX_PATH_LENGTH 1024 #define MAX_TENSOR_NAME_LENGTH 64 +namespace +{ + // Is null-terminating in length ? 
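// Editorial aside, not part of this patch: a minimal usage sketch for the new
// nnfw_load_model_from_modelfile() entry point declared in nnfw_internal.h above,
// assuming the usual create/prepare/run session flow; the function name and model
// path below are placeholders and error handling is omitted.
static void example_load_single_model_file()
{
  nnfw_session *session = nullptr;
  nnfw_create_session(&session);
  // The file extension (.tflite or .circle) selects the loader.
  nnfw_load_model_from_modelfile(session, "model.circle");
  nnfw_prepare(session);
  nnfw_run(session);
  nnfw_close_session(session);
}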
-static bool null_terminating(const char *str, uint32_t length) +bool null_terminating(const char *str, uint32_t length) { for (uint32_t i = 0; i < length; i++) { @@ -53,7 +58,7 @@ static bool null_terminating(const char *str, uint32_t length) return false; } -static onert::ir::Layout convertLayout(NNFW_LAYOUT layout) +onert::ir::Layout convertLayout(NNFW_LAYOUT layout) { if (layout == NNFW_LAYOUT_CHANNELS_LAST) { @@ -92,9 +97,70 @@ NNFW_STATUS getTensorIndexImpl(const onert::ir::Graph &graph, const char *tensor } } +std::string trim(const std::string &value) +{ + std::string whitespace = " \t"; + auto begin = value.find_first_not_of(whitespace); + if (begin == std::string::npos) + return ""; // no content + + auto end = value.find_last_not_of(whitespace); + auto range = end - begin + 1; + return value.substr(begin, range); +} + +using CfgKeyValues = std::unordered_map; + +bool loadConfigure(const std::string cfgfile, CfgKeyValues &keyValues) +{ + std::ifstream ifs(cfgfile); + if (ifs.is_open()) + { + std::string line; + while (std::getline(ifs, line)) + { + auto cmtpos = line.find('#'); + if (cmtpos != std::string::npos) + { + line = line.substr(0, cmtpos); + } + std::istringstream isline(line); + std::string key; + if (std::getline(isline, key, '=')) + { + std::string value; + if (std::getline(isline, value)) + { + key = trim(key); + keyValues[key] = trim(value); + } + } + } + ifs.close(); + return true; + } + return false; +} + +void setConfigKeyValues(const CfgKeyValues &keyValues) +{ + auto configsrc = std::make_unique(); + + for (auto it = keyValues.begin(); it != keyValues.end(); ++it) + { + VERBOSE(NNPKG_CONFIGS) << "(" << it->first << ") = (" << it->second << ")" << std::endl; + configsrc->set(it->first, it->second); + } + + onert::util::config_source_ext(std::move(configsrc)); +} + +} // namespace + nnfw_session::nnfw_session() - : _subgraphs{nullptr}, _execution{nullptr}, - _kernel_registry{std::make_shared()} + : _subgraphs{nullptr}, _execution{nullptr}, + _kernel_registry{std::make_shared()}, _tracing_ctx{ + nullptr} { // DO NOTHING } @@ -122,13 +188,65 @@ NNFW_STATUS nnfw_session::load_circle_from_buffer(uint8_t *buffer, size_t size) return NNFW_STATUS_ERROR; } - _compiler = std::make_unique(_subgraphs); + _tracing_ctx = std::make_unique(_subgraphs.get()); + + _compiler = std::make_unique(_subgraphs, _tracing_ctx.get()); + + _state = State::MODEL_LOADED; + return NNFW_STATUS_NO_ERROR; +} + +NNFW_STATUS nnfw_session::load_model_from_modelfile(const char *model_file_path) +{ + if (!isStateInitialized()) + return NNFW_STATUS_INVALID_STATE; + + if (!model_file_path) + { + std::cerr << "Model file path is null." << std::endl; + return NNFW_STATUS_UNEXPECTED_NULL; + } + + std::string filename{model_file_path}; + if (filename.size() < 8) // .tflite or .circle + { + std::cerr << "Invalid model file path." 
<< std::endl; + return NNFW_STATUS_ERROR; + } + + std::string model_type = filename.substr(filename.size() - 7, 7); + + try + { + if (model_type == ".tflite") + { + _subgraphs = onert::tflite_loader::loadModel(filename.c_str()); + } + else if (model_type == ".circle") + { + _subgraphs = onert::circle_loader::loadModel(filename.c_str()); + } + else + { + std::cerr << "Unsupported model type" << std::endl; + return NNFW_STATUS_ERROR; + } + } + catch (const std::exception &e) + { + std::cerr << "Error during model loading : " << e.what() << std::endl; + return NNFW_STATUS_ERROR; + } + + _tracing_ctx = std::make_unique(_subgraphs.get()); + + _compiler = std::make_unique(_subgraphs, _tracing_ctx.get()); _state = State::MODEL_LOADED; return NNFW_STATUS_NO_ERROR; } -NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) +NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) { if (!isStateInitialized()) return NNFW_STATUS_INVALID_STATE; @@ -166,6 +284,18 @@ NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) mfs >> root; const Json::Value &models = root["models"]; const Json::Value &model_types = root["model-types"]; + const Json::Value &configs = root["configs"]; + + if (!configs.empty() && !configs[0].empty()) + { + auto filepath = package_dir + std::string("/metadata/") + configs[0].asCString(); + + CfgKeyValues keyValues; + if (loadConfigure(filepath, keyValues)) + { + setConfigKeyValues(keyValues); + } + } auto model_file_path = package_dir + std::string("/") + models[0].asString(); // first model auto model_type = model_types[0].asString(); // first model's type @@ -190,7 +320,9 @@ NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) return NNFW_STATUS_ERROR; } - _compiler = std::make_unique(_subgraphs); + _tracing_ctx = std::make_unique(_subgraphs.get()); + + _compiler = std::make_unique(_subgraphs, _tracing_ctx.get()); _state = State::MODEL_LOADED; return NNFW_STATUS_NO_ERROR; @@ -225,7 +357,7 @@ NNFW_STATUS nnfw_session::prepare() { _subgraphs.reset(); std::shared_ptr executors = _compiler->compile(); - _execution = std::make_shared(executors); + _execution = std::make_unique(executors); } catch (const std::exception &e) { @@ -308,8 +440,8 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo if (!buffer && length != 0) { std::cerr - << "Error during nnfw_session::set_input : given buffer is NULL but the length is not 0" - << std::endl; + << "Error during nnfw_session::set_input : given buffer is NULL but the length is not 0" + << std::endl; return NNFW_STATUS_ERROR; } @@ -337,8 +469,8 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b if (!buffer && length != 0) { std::cerr - << "Error during nnfw_session::set_output : given buffer is NULL but the length is not 0" - << std::endl; + << "Error during nnfw_session::set_output : given buffer is NULL but the length is not 0" + << std::endl; return NNFW_STATUS_ERROR; } diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h index 604ba38..a50ac72 100644 --- a/runtime/onert/api/src/nnfw_api_internal.h +++ b/runtime/onert/api/src/nnfw_api_internal.h @@ -21,6 +21,7 @@ #include "nnfw_experimental.h" #include +#include #include #include @@ -100,7 +101,7 @@ public: nnfw_session(); ~nnfw_session(); - NNFW_STATUS load_model_from_file(const char *package_file_path); + NNFW_STATUS load_model_from_nnpackage(const char *package_file_path); NNFW_STATUS prepare(); NNFW_STATUS 
run(); @@ -132,6 +133,7 @@ public: NNFW_STATUS set_config(const char *key, const char *value); NNFW_STATUS get_config(const char *key, char *value, size_t value_size); NNFW_STATUS load_circle_from_buffer(uint8_t *buffer, size_t size); + NNFW_STATUS load_model_from_modelfile(const char *file_path); // // Experimental API @@ -154,8 +156,10 @@ private: State _state{State::INITIALIZED}; std::shared_ptr _subgraphs; std::unique_ptr _compiler; - std::shared_ptr _execution; + std::unique_ptr _execution; std::shared_ptr _kernel_registry; + + std::unique_ptr _tracing_ctx; }; #endif // __API_NNFW_API_INTERNAL_H__ diff --git a/runtime/onert/backend/CMakeLists.txt b/runtime/onert/backend/CMakeLists.txt index 42d622a..dc038c9 100644 --- a/runtime/onert/backend/CMakeLists.txt +++ b/runtime/onert/backend/CMakeLists.txt @@ -4,3 +4,5 @@ add_subdirectory(cpu) add_subdirectory(acl_cl) add_subdirectory(acl_neon) add_subdirectory(acl_common) +add_subdirectory(ruy) +add_subdirectory(xnnpack) diff --git a/runtime/onert/backend/acl_cl/Backend.h b/runtime/onert/backend/acl_cl/Backend.h index 5c50413..4f48314 100644 --- a/runtime/onert/backend/acl_cl/Backend.h +++ b/runtime/onert/backend/acl_cl/Backend.h @@ -20,6 +20,7 @@ #include #include +#include "BackendContext.h" #include "Config.h" #include "ConstantInitializer.h" #include "KernelGenerator.h" @@ -41,21 +42,20 @@ public: std::shared_ptr config() const override { return _config; } - std::unique_ptr newContext(const ir::Graph &graph, - const std::shared_ptr &, - bool is_linear_executor) const override + std::unique_ptr + newContext(const ir::Graph &graph, const std::shared_ptr &, + bool is_linear_executor) const override { const auto &operands = graph.operands(); const auto &operations = graph.operations(); - auto context = std::make_unique(this, &graph); + auto context = std::make_unique(this, &graph); auto tm = createTensorManager(is_linear_executor); auto tr = std::make_shared>(tm); - auto tb = std::make_shared(operands, tm, tr); + auto tb = std::make_shared(operands, tm); context->tensor_registry = tr; context->tensor_builder = tb; context->constant_initializer = std::make_shared(operands, tr); context->kernel_gen = std::make_shared(operands, operations, tb, tr); - context->tensor_register = nullptr; context->optimizer = std::make_shared(context.get()); return context; } diff --git a/runtime/onert/backend/acl_cl/BackendContext.cc b/runtime/onert/backend/acl_cl/BackendContext.cc new file mode 100644 index 0000000..a6f228a --- /dev/null +++ b/runtime/onert/backend/acl_cl/BackendContext.cc @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "Optimizer.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" + +namespace onert +{ +namespace backend +{ +namespace acl_cl +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +void BackendContext::planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info) +{ + ir::OperandIndexMap uses_map; + ir::OperandIndexMap def_map; + ir::OperandIndexSequence constants; + + // Prepare scanning + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + const auto &li = lower_info.operand.at(ind); + if (li->def_factors().getOnlyElement().backend() != backend()) + continue; + + // Ignore unused tensor + if (li->def_factors().size() == 0 && li->use_factors().size() == 0) + { + VERBOSE(planTensors) << "Operand #" << ind.value() << " will not be used. no more process." + << std::endl; + return; + } + + uses_map[ind] = obj.getUses().size(); + def_map[ind] = obj.getDef().valid() ? 1 : 0; + + if (obj.isConstant()) + constants.append(ind); + + auto factor = li->def_factors().getOnlyElement(); + if (!tensor_builder->isRegistered(ind)) + { + // These tensors do not exist in any op_seq (No use and def) + const auto info = obj.info(); + const auto backend_layout = factor.layout(); + // TODO Change tensor info to have permuted shape + tensor_builder->registerTensorInfo(ind, info, backend_layout); + } + } + + // Start scanning to do notify{First|Last}Use for each tensor + + // If a tensor is a constant, increase the use of the tensor and allocate it first. + // Increasing use count here makes the tensor never be deallocated, i.e it they will be + // deallocated last. + VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; + for (const auto &ind : constants) + { + uses_map[ind]++; + tensor_builder->notifyFirstUse(ind); + } + + // At each operation, + // 1. Scan DEF of outputs. If the DEF, allocate it + // 2. Scan DEF of inputs. If variable tensor, allocate it + // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 + for (const auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + for (const auto &op_idx : op_seq.operations()) + { + auto &op = graph()->operations().at(op_idx); + auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + + // Define outputs + for (const auto &ind : op_outputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(def_map.find(ind) != def_map.end()); + if (def_map[ind]) + { + def_map[ind] = 0; + tensor_builder->notifyFirstUse(ind); + } + } + + // Scan variable tensors + // This tensor has features like constant. 
But OperandInfo and LowerInfo treat them as + // non-constant because of less memory usage by memory planning in here + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + const auto &operand = graph()->operands().at(ind); + if (operand.info().isVariable()) + { + // The variable tensor with buffer is not supported yet + assert(operand.data() == nullptr); + assert(operand.getUses().size() == 1 && !operand.getDef().valid()); + assert(lower_info.operand.at(ind)->def_factors().size() == 1 && + lower_info.operand.at(ind)->use_factors().size() == 1); + assert(uses_map[ind] == 1 && def_map[ind] == 0); + tensor_builder->notifyFirstUse(ind); + } + } + + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(uses_map.find(ind) != uses_map.end()); + assert(uses_map[ind] > 0); + uses_map[ind]--; + if (uses_map[ind] == 0) + { + // plan for deallocation of static tensornode + tensor_builder->notifyLastUse(ind); + } + } + } + } + + // Dispose and validate + for (const auto &ind : constants) + { + --uses_map[ind]; + if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice + { + tensor_builder->notifyLastUse(ind); + } + } + + assert( + std::all_of(uses_map.begin(), uses_map.end(), + [](std::pair it) { return it.second == 0; })); + + assert( + std::all_of(def_map.begin(), def_map.end(), + [](std::pair it) { return it.second == 0; })); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + optimizer->optimize(); + + for (const auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (const auto op_ind : op_seq) + { + bool op_assigned = [&]() { + for (auto &op_info : operation_list()) + if (op_info.index == op_ind) + return true; + return false; + }(); + if (!op_assigned) + continue; + + const auto &op = graph()->operations().at(op_ind); + for (const auto &index : (op.getInputs() + op.getOutputs()) | ir::Remove::UNDEFINED) + { + if (!tensor_builder->isRegistered(index) && !model_io.contains(index) && + find(operand_list().begin(), operand_list().end(), index) != operand_list().end()) + { + const auto &operand_lower_info = + lower_info.operand.at(index)->def_factors().getOnlyElement(); + + // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl) + // op.getOutputs() of permute (CPU) returns tensor A + // but tensor A belongs to the backend of acl_cl. + // So, we have to make this tensor NOT registered for CPU. 
+ if (operand_lower_info.backend() != backend()) + continue; + + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = op_seq.getLayout(); + const auto backend_layout = operand_lower_info.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + } + } + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + planTensors(order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + tensor_builder->allocate(); + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { + ifunc.prepare(); + tensor_builder->postFunctionPrepare(); + }); + } + + return ret; +} + +} // namespace acl_cl +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/acl_cl/BackendContext.h b/runtime/onert/backend/acl_cl/BackendContext.h new file mode 100644 index 0000000..662d767 --- /dev/null +++ b/runtime/onert/backend/acl_cl/BackendContext.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_ACL_CL_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_ACL_CL_BACKEND_CONTEXT_H__ + +#include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" + +namespace onert +{ +namespace backend +{ +namespace acl_cl +{ + +class Optimizer; + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr tensor_registry = nullptr, + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen} + { + } + + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + +private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + std::shared_ptr optimizer; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_CL_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc index b45b910..413a7cc 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc @@ -112,7 +112,7 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node) const auto &axis_obj = _operands.at(axis_index); const auto ifm_rank = input_obj.shape().rank(); - const auto frontend_layout = this->_current_op_seq_layout; + const auto frontend_layout = this->_current_layout; auto output_tensor = this->_tensor_reg->getITensor(output_index); const auto backend_layout = output_tensor->layout(); diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.h b/runtime/onert/backend/acl_cl/ConstantInitializer.h index 9f3acb4..fc0eca8 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.h +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef __ONERT_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ -#define __ONERT_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ +#ifndef __ONERT_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ #include "AclConstantInitializer.h" @@ -45,4 +45,4 @@ public: } // namespace backend } // namespace onert -#endif // __ONERT_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ +#endif // __ONERT_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index e7690af..3a5ea5a 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -49,7 +49,7 @@ KernelGenerator::KernelGenerator( const std::shared_ptr &tensor_builder, const std::shared_ptr> &tensor_reg) : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder), - _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN) + _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN) { // DO NOTHING } @@ -62,7 +62,7 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) _return_fn_seq = std::make_unique(); _return_fn_seq->enableDynamicShapeInferer(false); - _current_op_seq_layout = op_seq.getLayout(); + _current_layout = op_seq.getLayout(); for (const auto &operation_idx : op_seq.operations()) { const auto &node = _operations_ctx.at(operation_idx); @@ -78,6 +78,25 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + const auto NNApiInputs = 2; + if (node.getInputs().size() != NNApiInputs) + { + const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)}; + if (!_ctx.at(crops_index).isConstant()) + { + throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND"); + } + + auto crops = _ctx.at(crops_index).asVector(); + for (auto crop : crops) + { + if (crop != 0) + { + throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND"); + } + } + } + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); @@ -152,8 +171,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
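// Editorial illustration, not part of this patch: in this format a 3x3 kernel with
// 16 input channels and 32 output channels has shape [32, 3, 3, 16], so the
// dim(1)/dim(2) reads below pick out the spatial kernel height and width.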
const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -189,8 +208,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); // Kernel format is [1, kernel_height, kernel_width, depth_out]. const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -255,7 +274,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) else { const auto rank = _ctx.at(ofm_index).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); @@ -277,7 +296,7 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) auto fn = acl_common::kernelGenFullyConnected( - node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout); + node, _ctx, _tensor_builder, _tensor_reg, _current_layout); _return_fn = std::make_unique( std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } @@ -296,7 +315,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = input_tensor->layout(); std::unique_ptr fn; @@ -329,7 +348,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
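Most visitors in this file pair _current_layout (the frontend layout) with the tensor's backend layout and run axes through acl_common::ToARMComputeAxis before configuring an ACL layer. The sketch below is only a rough model of that conversion, under two stated assumptions: negative axes are normalized against the rank first, and ACL indexes dimensions innermost-first, so a runtime axis is mirrored to rank - 1 - axis. The additional NHWC/NCHW permutation the real helper applies when the two layouts differ is deliberately left out.

#include <cassert>
#include <cstdint>

// Hypothetical, simplified stand-in for acl_common::ToARMComputeAxis.
// It ignores the NHWC <-> NCHW permutation the real helper performs when the
// frontend and backend layouts differ.
uint32_t toAclAxisSimplified(int32_t rank, int32_t axis)
{
  if (axis < 0)
    axis += rank; // normalize negative axes, as the visitors above do
  assert(axis >= 0 && axis < rank);
  // ACL tensors are indexed with dimension 0 as the innermost dimension,
  // so the runtime-side axis is mirrored.
  return static_cast<uint32_t>(rank - 1 - axis);
}

int main()
{
  // For a rank-4 tensor, the last runtime axis becomes ACL dimension 0.
  assert(toAclAxisSimplified(4, 3) == 0);
  assert(toAclAxisSimplified(4, -1) == 0);
  return 0;
}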
- const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); @@ -388,7 +407,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto outputData_tensor = _tensor_reg->getAclTensor(output_index); auto inputData_tensor = _tensor_reg->getAclTensor(input_index); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData @@ -455,7 +474,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto outputData_tensor = _tensor_reg->getAclTensor(output_index); auto inputData_tensor = _tensor_reg->getAclTensor(input_index); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData @@ -557,7 +576,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = ifm_tensor->layout(); const auto &perms = _ctx.at(perm_idx); @@ -836,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx); const size_t output_rank = _ctx.at(output_idx).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = output_tensor->layout(); int32_t axis = node.param().axis == -1 ? 
output_rank - 1 : node.param().axis; axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); @@ -887,7 +906,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) @@ -923,8 +942,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) void KernelGenerator::visit(const ir::operation::Pool2D &node) { auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>( - node, _ctx, _tensor_reg, _current_op_seq_layout, - acl_common::convertPoolType(node.param().op_type)); + node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); @@ -1169,9 +1187,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout); const auto stride = node.param().stride; @@ -1270,7 +1288,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) UNUSED_RELEASE(backend_layout); assert(backend_layout == ifm_tensor->layout()); assert(backend_layout == indices_tensor->layout()); - assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + assert(ifm_rank < 4 || _current_layout == backend_layout); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; @@ -1306,11 +1324,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) _return_fn = asAclFunction(std::move(fn)); } -void KernelGenerator::visit(const ir::operation::ArgMax &node) +void KernelGenerator::visit(const ir::operation::ArgMinMax &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; - const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)}; auto ifm_shape = _ctx.at(ifm_index).shape(); auto ofm_shape = _ctx.at(ofm_index).shape(); @@ -1320,7 +1338,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto frontend_layout = _current_op_seq_layout; + auto frontend_layout = _current_layout; auto backend_layout = ifm_tensor->layout(); int axis_value = _ctx.at(axis_index).asScalar(); @@ -1331,10 +1349,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto 
acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - + auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX + : ::arm_compute::ReductionOperation::ARG_IDX_MIN; auto fn = acl_common::generateLayer( - ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), - ::arm_compute::ReductionOperation::ARG_IDX_MAX); + ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type); _return_fn = asAclFunction(std::move(fn)); } @@ -1400,7 +1418,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &ofm_ind : output_indexes) output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = ifm_tensor->layout(); auto axis = _ctx.at(axis_index).asScalar(); if (axis < 0) @@ -1439,7 +1457,7 @@ void KernelGenerator::visit(const ir::operation::SplitV &node) { int32_t split_dim = split_dim_op.asScalar(); uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim; - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = ifm_tensor->layout(); if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions()) @@ -1483,7 +1501,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; @@ -1526,7 +1544,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto input = _tensor_reg->getAclTensor(input_index)->handle(); auto output = _tensor_reg->getAclTensor(output_index)->handle(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); ::arm_compute::PaddingList padding_list; diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.h b/runtime/onert/backend/acl_cl/KernelGenerator.h index e8a9226..22a7c18 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.h +++ b/runtime/onert/backend/acl_cl/KernelGenerator.h @@ -17,7 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ #define __ONERT_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ -#include +#include #include "ir/Operands.h" #include "TensorBuilder.h" @@ -31,7 +31,7 @@ namespace backend namespace acl_cl { -class KernelGenerator : public IKernelGenerator +class KernelGenerator : public cpu_common::KernelGeneratorBase { public: KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, @@ -39,60 +39,61 @@ public: const std::shared_ptr> &_tensor_reg); void visit(const ir::OpSequence &) override; + + void visit(const ir::operation::ArgMinMax &) override; void visit(const ir::operation::BatchToSpaceND &) override; void visit(const ir::operation::BinaryArithmetic &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::Concat &) override; void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::ConvertFp16ToFp32 &) override; + void visit(const ir::operation::ConvertFp32ToFp16 &) override; + void visit(const 
ir::operation::DepthToSpace &) override; void visit(const ir::operation::DepthwiseConv2D &) override; - void visit(const ir::operation::Concat &) override; - void visit(const ir::operation::FullyConnected &) override; - void visit(const ir::operation::Reduce &) override; - void visit(const ir::operation::Reshape &) override; - void visit(const ir::operation::Squeeze &) override; - void visit(const ir::operation::Softmax &) override; - void visit(const ir::operation::Slice &) override; - void visit(const ir::operation::StridedSlice &) override; - void visit(const ir::operation::Transpose &) override; void visit(const ir::operation::ElementwiseActivation &) override; void visit(const ir::operation::ElementwiseBinary &) override; void visit(const ir::operation::ElementwiseUnary &) override; + void visit(const ir::operation::EmbeddingLookup &) override; void visit(const ir::operation::ExpandDims &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::HashtableLookup &) override; void visit(const ir::operation::InstanceNorm &) override; - void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::LocalResponseNormalization &) override; void visit(const ir::operation::LSTM &) override; void visit(const ir::operation::OneHot &) override; void visit(const ir::operation::Pack &) override; - void visit(const ir::operation::Pool2D &) override; + void visit(const ir::operation::Pad &) override; void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::Pool2D &) override; + void visit(const ir::operation::PReLU &) override; + void visit(const ir::operation::Reduce &) override; + void visit(const ir::operation::Reshape &) override; void visit(const ir::operation::ResizeBilinear &) override; void visit(const ir::operation::ResizeNearestNeighbor &) override; + void visit(const ir::operation::Reverse &) override; void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::Softmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; void visit(const ir::operation::SpaceToDepth &) override; - void visit(const ir::operation::EmbeddingLookup &) override; - void visit(const ir::operation::L2Normalization &) override; - void visit(const ir::operation::HashtableLookup &) override; - void visit(const ir::operation::PReLU &) override; - void visit(const ir::operation::TransposeConv &) override; - void visit(const ir::operation::SquaredDifference &) override; - void visit(const ir::operation::TopKV2 &) override; - void visit(const ir::operation::Gather &) override; - void visit(const ir::operation::ArgMax &) override; - void visit(const ir::operation::LocalResponseNormalization &) override; - void visit(const ir::operation::DepthToSpace &) override; void visit(const ir::operation::Split &) override; void visit(const ir::operation::SplitV &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::TopKV2 &) override; + void visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::TransposeConv &) override; void visit(const ir::operation::Unpack &) override; - void visit(const ir::operation::Pad &) override; 
- void visit(const ir::operation::ConvertFp32ToFp16 &) override; - void visit(const ir::operation::ConvertFp16ToFp32 &) override; - void visit(const ir::operation::Reverse &) override; private: const ir::Operands &_ctx; const ir::Operations &_operations_ctx; std::shared_ptr _tensor_builder; std::shared_ptr> _tensor_reg; - ir::Layout _current_op_seq_layout; + ir::Layout _current_layout; }; } // namespace acl_cl diff --git a/runtime/onert/backend/acl_cl/Optimizer.h b/runtime/onert/backend/acl_cl/Optimizer.h index 18d38ec..ad51548 100644 --- a/runtime/onert/backend/acl_cl/Optimizer.h +++ b/runtime/onert/backend/acl_cl/Optimizer.h @@ -17,8 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_CL_OPTIMIZER_H__ #define __ONERT_BACKEND_ACL_CL_OPTIMIZER_H__ -#include -#include +#include "BackendContext.h" #include "TensorBuilder.h" namespace onert @@ -28,12 +27,12 @@ namespace backend namespace acl_cl { -class Optimizer : public IOptimizer +class Optimizer { public: Optimizer(BackendContext *context); - void optimize() override; + void optimize(); private: BackendContext *_context; diff --git a/runtime/onert/backend/acl_cl/acl_cl.cc b/runtime/onert/backend/acl_cl/acl_cl.cc index 88378b1..82cbde0 100644 --- a/runtime/onert/backend/acl_cl/acl_cl.cc +++ b/runtime/onert/backend/acl_cl/acl_cl.cc @@ -14,20 +14,11 @@ * limitations under the License. */ -#include - #include "Backend.h" extern "C" { -onert::backend::Backend *onert_backend_create() -{ - VERBOSE(onert_backend_create) << "'acl_cl' loaded\n"; - return new onert::backend::acl_cl::Backend; -} -void onert_backend_destroy(onert::backend::Backend *backend) -{ - VERBOSE(onert_backend_create) << "'acl_cl' unloaded\n"; - delete backend; -} +onert::backend::Backend *onert_backend_create() { return new onert::backend::acl_cl::Backend; } + +void onert_backend_destroy(onert::backend::Backend *backend) { delete backend; } } diff --git a/runtime/onert/backend/acl_common/AclConstantInitializer.cc b/runtime/onert/backend/acl_common/AclConstantInitializer.cc index 21f41a3..921d107 100644 --- a/runtime/onert/backend/acl_common/AclConstantInitializer.cc +++ b/runtime/onert/backend/acl_common/AclConstantInitializer.cc @@ -25,7 +25,7 @@ namespace acl_common AclConstantInitializer::AclConstantInitializer(const ir::Operands &operands, const std::shared_ptr &tensor_reg) - : IConstantInitializer{operands}, _tensor_reg{tensor_reg} + : cpu_common::ConstantInitializerBase{operands}, _tensor_reg{tensor_reg} { // DO NOTHING } diff --git a/runtime/onert/backend/acl_common/AclConstantInitializer.h b/runtime/onert/backend/acl_common/AclConstantInitializer.h index 52f4c54..894e2e7 100644 --- a/runtime/onert/backend/acl_common/AclConstantInitializer.h +++ b/runtime/onert/backend/acl_common/AclConstantInitializer.h @@ -17,7 +17,7 @@ #ifndef __ONERT_COMPILER_ACL_COMMON_ACLCONSTANT_INITIALIZER_H__ #define __ONERT_COMPILER_ACL_COMMON_ACLCONSTANT_INITIALIZER_H__ -#include +#include #include #include "AclTensorRegistry.h" @@ -28,7 +28,7 @@ namespace backend namespace acl_common { -class AclConstantInitializer : public IConstantInitializer +class AclConstantInitializer : public cpu_common::ConstantInitializerBase { public: AclConstantInitializer(const ir::Operands &operands, diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h index bb7abc9..12e9ab8 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -21,7 +21,6 @@ #include #include -#include #include 
"ir/OperandIndexMap.h" #include #include "AclTensorManager.h" @@ -43,14 +42,12 @@ enum class UsesType LAST }; -template -class AclTensorBuilder : public ITensorBuilder +template class AclTensorBuilder { public: using T_AclTensorManager = AclTensorManager; - AclTensorBuilder(const ir::Operands &operands, T_AclTensorManager *tensor_mgr, - const std::shared_ptr> &tensor_reg); + AclTensorBuilder(const ir::Operands &operands, T_AclTensorManager *tensor_mgr); /** * @brief Register tensor information to allocate on ACL-CL backend @@ -59,16 +56,16 @@ public: * @param[in] layout Tensor data layout */ void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, - ir::Layout backend_layout) override; + ir::Layout backend_layout); - void notifyFirstUse(const ir::OperandIndex &) override; - void notifyLastUse(const ir::OperandIndex &) override; + void notifyFirstUse(const ir::OperandIndex &); + void notifyLastUse(const ir::OperandIndex &); - bool isRegistered(const ir::OperandIndex &) const override; + bool isRegistered(const ir::OperandIndex &) const; - void prepare(void) override; - void allocate() override; - void postFunctionPrepare() override; + void prepare(void); + void allocate(); + void postFunctionPrepare(); T_AclTensorManager *acl_tensor_manager(void) { return _tensor_mgr.get(); } @@ -105,7 +102,6 @@ private: ir::OperandIndexMap _uses_count_map; std::unique_ptr _tensor_mgr; - std::shared_ptr> _tensor_reg; // for linear executor std::vector> _lifetime_seq; @@ -133,10 +129,9 @@ namespace acl_common { template -AclTensorBuilder::AclTensorBuilder( - const ir::Operands &operands, T_AclTensorManager *tensor_mgr, - const std::shared_ptr> &tensor_reg) - : _operands{operands}, _tensor_mgr{tensor_mgr}, _tensor_reg{tensor_reg} +AclTensorBuilder::AclTensorBuilder(const ir::Operands &operands, + T_AclTensorManager *tensor_mgr) + : _operands{operands}, _tensor_mgr{tensor_mgr} { assert(_tensor_mgr); } diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc index 67d9d71..7d3a690 100644 --- a/runtime/onert/backend/acl_common/Convert.cc +++ b/runtime/onert/backend/acl_common/Convert.cc @@ -109,13 +109,19 @@ namespace acl_common case ir::DataType::UINT8: return ::arm_compute::DataType::U8; case ir::DataType::QUANT_INT8_SYMM: - return ::arm_compute::DataType::S8; + return ::arm_compute::DataType::QSYMM8; + case ir::DataType::QUANT_INT8_ASYMM: + return ::arm_compute::DataType::QASYMM8_SIGNED; case ir::DataType::FLOAT16: return ::arm_compute::DataType::F16; case ir::DataType::INT64: return ::arm_compute::DataType::S64; + case ir::DataType::QUANT_INT16_ASYMM: + return ::arm_compute::DataType::QASYMM16; + case ir::DataType::QUANT_INT8_SYMM_PER_CHANNEL: + return ::arm_compute::DataType::QSYMM8_PER_CHANNEL; default: - throw std::runtime_error("Not supported, yet"); + throw std::runtime_error("Not supported internal data type, yet"); break; } } @@ -175,7 +181,7 @@ namespace acl_common return ::arm_compute::ActivationLayerInfo{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.0f, 0.0f}; default: - throw std::runtime_error{"Not supported, yet"}; + throw std::runtime_error{"Not supported internal activation, yet"}; break; } } @@ -219,7 +225,7 @@ asActivationLayerInfo(const ir::operation::ElementwiseActivation::Type op_type, return ::arm_compute::ActivationLayerInfo{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha}; default: - throw std::runtime_error{"Not supported, yet"}; + throw std::runtime_error{"Not 
supported internal elementwise activation, yet"}; break; } } @@ -295,6 +301,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) return ir::DataType::UINT32; case ::arm_compute::DataType::QASYMM8: return ir::DataType::QUANT_UINT8_ASYMM; + case ::arm_compute::DataType::QASYMM8_SIGNED: + return ir::DataType::QUANT_INT8_ASYMM; case ::arm_compute::DataType::U8: return ir::DataType::UINT8; case ::arm_compute::DataType::QSYMM8: @@ -304,7 +312,7 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) case ::arm_compute::DataType::S64: return ir::DataType::INT64; default: - throw std::runtime_error{"Not supported, yet"}; + throw std::runtime_error{"Not supported acl data type, yet"}; break; } } diff --git a/runtime/onert/backend/acl_neon/Backend.h b/runtime/onert/backend/acl_neon/Backend.h index 35d6e4e..b11c197 100644 --- a/runtime/onert/backend/acl_neon/Backend.h +++ b/runtime/onert/backend/acl_neon/Backend.h @@ -21,6 +21,7 @@ #include #include +#include "BackendContext.h" #include "Config.h" #include "ConstantInitializer.h" #include "KernelGenerator.h" @@ -41,21 +42,20 @@ public: std::shared_ptr config() const override { return _config; } - std::unique_ptr newContext(const ir::Graph &graph, - const std::shared_ptr &, - bool is_linear_executor) const override + std::unique_ptr + newContext(const ir::Graph &graph, const std::shared_ptr &, + bool is_linear_executor) const override { const auto &operands = graph.operands(); const auto &operations = graph.operations(); - auto context = std::make_unique(this, &graph); + auto context = std::make_unique(this, &graph); auto tm = createTensorManager(is_linear_executor); auto tr = std::make_shared>(tm); - auto tb = std::make_shared(operands, tm, tr); + auto tb = std::make_shared(operands, tm); context->tensor_registry = tr; context->tensor_builder = tb; context->constant_initializer = std::make_shared(operands, tr); context->kernel_gen = std::make_shared(operands, operations, tb, tr); - context->tensor_register = nullptr; context->optimizer = std::make_shared(context.get()); return context; } diff --git a/runtime/onert/backend/acl_neon/BackendContext.cc b/runtime/onert/backend/acl_neon/BackendContext.cc new file mode 100644 index 0000000..8b53171 --- /dev/null +++ b/runtime/onert/backend/acl_neon/BackendContext.cc @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "Optimizer.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" + +namespace onert +{ +namespace backend +{ +namespace acl_neon +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +void BackendContext::planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info) +{ + ir::OperandIndexMap uses_map; + ir::OperandIndexMap def_map; + ir::OperandIndexSequence constants; + + // Prepare scanning + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + const auto &li = lower_info.operand.at(ind); + if (li->def_factors().getOnlyElement().backend() != backend()) + continue; + + // Ignore unused tensor + if (li->def_factors().size() == 0 && li->use_factors().size() == 0) + { + VERBOSE(planTensors) << "Operand #" << ind.value() << " will not be used. no more process." + << std::endl; + return; + } + + uses_map[ind] = obj.getUses().size(); + def_map[ind] = obj.getDef().valid() ? 1 : 0; + + if (obj.isConstant()) + constants.append(ind); + + auto factor = li->def_factors().getOnlyElement(); + if (!tensor_builder->isRegistered(ind)) + { + // These tensors do not exist in any op_seq (No use and def) + const auto info = obj.info(); + const auto backend_layout = factor.layout(); + // TODO Change tensor info to have permuted shape + tensor_builder->registerTensorInfo(ind, info, backend_layout); + } + } + + // Start scanning to do notify{First|Last}Use for each tensor + + // If a tensor is a constant, increase the use of the tensor and allocate it first. + // Increasing use count here makes the tensor never be deallocated, i.e it they will be + // deallocated last. + VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; + for (const auto &ind : constants) + { + uses_map[ind]++; + tensor_builder->notifyFirstUse(ind); + } + + // At each operation, + // 1. Scan DEF of outputs. If the DEF, allocate it + // 2. Scan DEF of inputs. If variable tensor, allocate it + // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 + for (const auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + for (const auto &op_idx : op_seq.operations()) + { + auto &op = graph()->operations().at(op_idx); + auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + + // Define outputs + for (const auto &ind : op_outputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(def_map.find(ind) != def_map.end()); + if (def_map[ind]) + { + def_map[ind] = 0; + tensor_builder->notifyFirstUse(ind); + } + } + + // Scan variable tensors + // This tensor has features like constant. 
But OperandInfo and LowerInfo treat them as + // non-constant because of less memory usage by memory planning in here + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + const auto &operand = graph()->operands().at(ind); + if (operand.info().isVariable()) + { + // The variable tensor with buffer is not supported yet + assert(operand.data() == nullptr); + assert(operand.getUses().size() == 1 && !operand.getDef().valid()); + assert(lower_info.operand.at(ind)->def_factors().size() == 1 && + lower_info.operand.at(ind)->use_factors().size() == 1); + assert(uses_map[ind] == 1 && def_map[ind] == 0); + tensor_builder->notifyFirstUse(ind); + } + } + + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(uses_map.find(ind) != uses_map.end()); + assert(uses_map[ind] > 0); + uses_map[ind]--; + if (uses_map[ind] == 0) + { + // plan for deallocation of static tensornode + tensor_builder->notifyLastUse(ind); + } + } + } + } + + // Dispose and validate + for (const auto &ind : constants) + { + --uses_map[ind]; + if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice + { + tensor_builder->notifyLastUse(ind); + } + } + + assert( + std::all_of(uses_map.begin(), uses_map.end(), + [](std::pair it) { return it.second == 0; })); + + assert( + std::all_of(def_map.begin(), def_map.end(), + [](std::pair it) { return it.second == 0; })); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + optimizer->optimize(); + + for (const auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (const auto op_ind : op_seq) + { + bool op_assigned = [&]() { + for (auto &op_info : operation_list()) + if (op_info.index == op_ind) + return true; + return false; + }(); + if (!op_assigned) + continue; + + const auto &op = graph()->operations().at(op_ind); + for (const auto &index : (op.getInputs() + op.getOutputs()) | ir::Remove::UNDEFINED) + { + if (!tensor_builder->isRegistered(index) && !model_io.contains(index) && + find(operand_list().begin(), operand_list().end(), index) != operand_list().end()) + { + const auto &operand_lower_info = + lower_info.operand.at(index)->def_factors().getOnlyElement(); + + // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl) + // op.getOutputs() of permute (CPU) returns tensor A + // but tensor A belongs to the backend of acl_cl. + // So, we have to make this tensor NOT registered for CPU. 
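planTensors above is essentially reference counting over a fixed linear order: each operand starts with its use count (constants get one extra use so they are never released), notifyFirstUse fires when the defining operation is reached, and notifyLastUse fires once the count drops to zero. The toy program below walks the same scheme with a fake tensor builder; Op, FakeTensorBuilder, and the hard-coded graph are invented for illustration and do not correspond to runtime classes.

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the tensor builder's notify{First|Last}Use calls.
struct FakeTensorBuilder
{
  void notifyFirstUse(const std::string &t) { std::cout << "claim   " << t << "\n"; }
  void notifyLastUse(const std::string &t) { std::cout << "release " << t << "\n"; }
};

struct Op
{
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

int main()
{
  // A -> op0 -> B -> op1 -> C, with "w" a constant weight used by op0.
  std::vector<Op> order{{{"A", "w"}, {"B"}}, {{"B"}, {"C"}}};

  std::map<std::string, int> uses{{"A", 1}, {"w", 1}, {"B", 1}, {"C", 0}};
  std::map<std::string, int> defs{{"A", 0}, {"w", 0}, {"B", 1}, {"C", 1}};

  FakeTensorBuilder tb;

  // Graph inputs and constants are claimed up front; bumping the use count of
  // "w" keeps it from ever reaching zero, mirroring the constant handling above.
  for (const auto &c : {std::string("A"), std::string("w")})
  {
    ++uses[c];
    tb.notifyFirstUse(c);
  }

  for (const auto &op : order)
  {
    for (const auto &o : op.outputs)
      if (defs[o] > 0)
      {
        defs[o] = 0;
        tb.notifyFirstUse(o); // allocate when the def is reached
      }
    for (const auto &i : op.inputs)
      if (--uses[i] == 0)
        tb.notifyLastUse(i); // deallocate after the last use
  }
  return 0;
}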
+ if (operand_lower_info.backend() != backend()) + continue; + + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = op_seq.getLayout(); + const auto backend_layout = operand_lower_info.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + } + } + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + planTensors(order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + tensor_builder->allocate(); + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { + ifunc.prepare(); + tensor_builder->postFunctionPrepare(); + }); + } + + return ret; +} + +} // namespace neon +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/acl_neon/BackendContext.h b/runtime/onert/backend/acl_neon/BackendContext.h new file mode 100644 index 0000000..dd764c0 --- /dev/null +++ b/runtime/onert/backend/acl_neon/BackendContext.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_ACL_NEON_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_ACL_NEON_BACKEND_CONTEXT_H__ + +#include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" + +namespace onert +{ +namespace backend +{ +namespace acl_neon +{ + +class Optimizer; + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr tensor_registry = nullptr, + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen} + { + } + + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + +private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + // TODO Make it private + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + std::shared_ptr optimizer; +}; + +} // namespace acl_neon +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_NEON_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/acl_neon/ConstantInitializer.h b/runtime/onert/backend/acl_neon/ConstantInitializer.h index c7d71cd..9723ba0 100644 --- a/runtime/onert/backend/acl_neon/ConstantInitializer.h +++ b/runtime/onert/backend/acl_neon/ConstantInitializer.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef __ONERT_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ -#define __ONERT_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ +#ifndef __ONERT_BACKEND_ACL_NEON_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_ACL_NEON_CONSTANT_INITIALIZER_H__ #include "AclConstantInitializer.h" @@ -41,4 +41,4 @@ public: } // namespace backend } // namespace onert -#endif // __ONERT_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ +#endif // __ONERT_BACKEND_ACL_NEON_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index ffaee3b..e712dfa 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -48,7 +48,7 @@ KernelGenerator::KernelGenerator( const std::shared_ptr &tensor_builder, const std::shared_ptr> &tensor_reg) : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder), - _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN) + _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN) { // DO NOTHING } @@ -61,7 +61,7 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) _return_fn_seq = std::make_unique(); _return_fn_seq->enableDynamicShapeInferer(false); - _current_op_seq_layout = op_seq.getLayout(); + _current_layout = op_seq.getLayout(); for (const auto &operation_idx : op_seq.operations()) { const auto &node = _operations_ctx.at(operation_idx); @@ -70,17 +70,17 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) } } -void KernelGenerator::visit(const ir::operation::ArgMax &node) +void KernelGenerator::visit(const ir::operation::ArgMinMax &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; - const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)}; const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); - auto frontend_layout = _current_op_seq_layout; + auto frontend_layout = _current_layout; auto backend_layout = ifm_tensor->layout(); int axis_value = _ctx.at(axis_index).asScalar(); @@ -91,10 +91,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) assert(axis_value >= 0 && axis_value < ifm_rank); const auto fixed_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); + auto reduce_type = node.param().is_arg_max ? 
::arm_compute::ReductionOperation::ARG_IDX_MAX + : ::arm_compute::ReductionOperation::ARG_IDX_MIN; auto fn = acl_common::generateLayer( - ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), - arm_compute::ReductionOperation::ARG_IDX_MAX); + ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type); _return_fn = asAclFunction(std::move(fn)); } @@ -106,6 +107,25 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + const auto NNApiInputs = 2; + if (node.getInputs().size() != NNApiInputs) + { + const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)}; + if (!_ctx.at(crops_index).isConstant()) + { + throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND"); + } + + auto crops = _ctx.at(crops_index).asVector(); + for (auto crop : crops) + { + if (crop != 0) + { + throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND"); + } + } + } + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); @@ -178,8 +198,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -232,8 +252,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
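Both ACL backends now accept the three-input, TFLite-style BatchToSpaceND signature (the NNAPI form has only two inputs), but only when the extra crops operand is a compile-time constant that is entirely zero, because the underlying ACL layer is configured from the block size alone. A standalone sketch of that guard, with a plain bool and vector standing in for the operand object:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical helper mirroring the guard added to visit(BatchToSpaceND):
// the crops data must be known at compile time and must all be zero,
// otherwise the operation cannot be lowered to the ACL layer.
void checkCropsSupported(bool is_constant, const std::vector<int32_t> &crops)
{
  if (!is_constant)
    throw std::runtime_error("Non-constant crops NYI for acl backend BatchToSpaceND");
  for (auto crop : crops)
    if (crop != 0)
      throw std::runtime_error("Non-zero crops NYI for acl backend BatchToSpaceND");
}

int main()
{
  checkCropsSupported(true, {0, 0, 0, 0}); // accepted
  try
  {
    checkCropsSupported(true, {0, 1, 0, 0}); // rejected: would change the output extent
  }
  catch (const std::runtime_error &)
  {
    // expected
  }
  return 0;
}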
const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -297,7 +317,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) else { const auto rank = _ctx.at(ofm_index).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); @@ -495,7 +515,7 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) auto fn = acl_common::kernelGenFullyConnected( - node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout); + node, _ctx, _tensor_builder, _tensor_reg, _current_layout); _return_fn = std::make_unique( std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } @@ -552,7 +572,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. assert(backend_layout == ifm_tensor->layout()); assert(backend_layout == indices_tensor->layout()); - assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + assert(ifm_rank < 4 || _current_layout == backend_layout); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; @@ -686,7 +706,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) @@ -738,7 +758,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node) { const int32_t *from = reinterpret_cast(pad_base) + (n * 2); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); const auto axis = acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); @@ -762,8 +782,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node) void KernelGenerator::visit(const ir::operation::Pool2D &node) { auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>( - node, _ctx, _tensor_reg, _current_op_seq_layout, - acl_common::convertPoolType(node.param().op_type)); + node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); @@ -836,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = input_tensor->layout(); const auto reduce_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); @@ -873,7 +892,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
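The Pack hunk above, like the Split, Unpack, and OneHot visitors later in this file, first normalizes a possibly negative frontend axis before converting it for ACL. The helper below captures just that normalization step; it is an illustrative stand-in rather than a function that exists in the runtime.

#include <cassert>
#include <cstdint>

// Hypothetical helper: map an axis in [-rank, rank) to [0, rank), the same
// "if (axis < 0) axis += rank;" pattern used by the visitors in this file.
int32_t normalizeAxis(int32_t axis, int32_t rank)
{
  if (axis < 0)
    axis += rank;
  assert(axis >= 0 && axis < rank);
  return axis;
}

int main()
{
  assert(normalizeAxis(-1, 4) == 3); // last axis of a rank-4 tensor
  assert(normalizeAxis(2, 4) == 2);
  return 0;
}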
- const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); @@ -1047,7 +1066,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &ofm_ind : output_indexes) output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = ifm_tensor->layout(); auto axis = _ctx.at(axis_index).asScalar(); if (axis < 0) @@ -1085,7 +1104,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto outputData_tensor = _tensor_reg->getAclTensor(output_index); auto inputData_tensor = _tensor_reg->getAclTensor(input_index); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData @@ -1150,7 +1169,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto outputData_tensor = _tensor_reg->getAclTensor(output_index); auto inputData_tensor = _tensor_reg->getAclTensor(input_index); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData @@ -1244,9 +1263,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout); const auto stride = node.param().stride; @@ -1285,7 +1304,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = ifm_tensor->layout(); const auto rank = _ctx.at(ifm_idx).shape().rank(); @@ -1340,7 +1359,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; @@ -1413,7 +1432,7 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx); const size_t output_rank = _ctx.at(out_idx).shape().rank(); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto 
backend_layout = output_tensor->layout(); int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis; axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.h b/runtime/onert/backend/acl_neon/KernelGenerator.h index 4d269cd..2a4b307 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.h +++ b/runtime/onert/backend/acl_neon/KernelGenerator.h @@ -17,7 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__ #define __ONERT_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__ -#include +#include #include "ir/Operands.h" #include "TensorBuilder.h" @@ -31,7 +31,7 @@ namespace backend namespace acl_neon { -class KernelGenerator : public IKernelGenerator +class KernelGenerator : public cpu_common::KernelGeneratorBase { public: KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, @@ -39,17 +39,20 @@ public: const std::shared_ptr> &_tensor_reg); void visit(const ir::OpSequence &) override; - void visit(const ir::operation::ArgMax &) override; + + void visit(const ir::operation::ArgMinMax &) override; void visit(const ir::operation::BatchToSpaceND &) override; void visit(const ir::operation::BinaryArithmetic &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::Concat &) override; void visit(const ir::operation::Conv2D &) override; void visit(const ir::operation::DepthToSpace &) override; void visit(const ir::operation::DepthwiseConv2D &) override; - void visit(const ir::operation::Concat &) override; void visit(const ir::operation::ElementwiseActivation &) override; void visit(const ir::operation::ElementwiseBinary &) override; void visit(const ir::operation::ElementwiseUnary &) override; void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::ExpandDims &) override; void visit(const ir::operation::FullyConnected &) override; void visit(const ir::operation::Gather &) override; void visit(const ir::operation::HashtableLookup &) override; @@ -57,36 +60,34 @@ public: void visit(const ir::operation::L2Normalization &) override; void visit(const ir::operation::LocalResponseNormalization &) override; void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::OneHot &) override; void visit(const ir::operation::Pack &) override; void visit(const ir::operation::Pad &) override; - void visit(const ir::operation::Pool2D &) override; void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::Pool2D &) override; void visit(const ir::operation::PReLU &) override; void visit(const ir::operation::Reduce &) override; void visit(const ir::operation::Reshape &) override; void visit(const ir::operation::ResizeBilinear &) override; void visit(const ir::operation::RNN &) override; - void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::Slice &) override; void visit(const ir::operation::Softmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; void visit(const ir::operation::SpaceToDepth &) override; void visit(const ir::operation::Split &) override; void visit(const ir::operation::SquaredDifference &) override; - void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::Squeeze &) override; void visit(const ir::operation::StridedSlice &) override; - void visit(const ir::operation::TransposeConv &) override; void visit(const ir::operation::Transpose 
&) override; + void visit(const ir::operation::TransposeConv &) override; void visit(const ir::operation::Unpack &) override; - void visit(const ir::operation::ExpandDims &) override; - void visit(const ir::operation::Comparison &) override; - void visit(const ir::operation::OneHot &) override; private: const ir::Operands &_ctx; const ir::Operations &_operations_ctx; std::shared_ptr _tensor_builder; std::shared_ptr> _tensor_reg; - ir::Layout _current_op_seq_layout; + ir::Layout _current_layout; }; } // namespace acl_neon diff --git a/runtime/onert/backend/acl_neon/Optimizer.h b/runtime/onert/backend/acl_neon/Optimizer.h index 5fe0d51..b8fb343 100644 --- a/runtime/onert/backend/acl_neon/Optimizer.h +++ b/runtime/onert/backend/acl_neon/Optimizer.h @@ -17,8 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_NEON_OPTIMIZER_H__ #define __ONERT_BACKEND_ACL_NEON_OPTIMIZER_H__ -#include -#include +#include "BackendContext.h" #include "TensorBuilder.h" namespace onert @@ -28,12 +27,12 @@ namespace backend namespace acl_neon { -class Optimizer : public IOptimizer +class Optimizer { public: Optimizer(BackendContext *context); - void optimize() override; + void optimize(); private: BackendContext *_context; diff --git a/runtime/onert/backend/acl_neon/acl_neon.cc b/runtime/onert/backend/acl_neon/acl_neon.cc index f490d13..6535fb2 100644 --- a/runtime/onert/backend/acl_neon/acl_neon.cc +++ b/runtime/onert/backend/acl_neon/acl_neon.cc @@ -14,20 +14,11 @@ * limitations under the License. */ -#include - #include "Backend.h" extern "C" { -onert::backend::Backend *onert_backend_create() -{ - VERBOSE(onert_backend_create) << "'acl_neon' loaded\n"; - return new onert::backend::acl_neon::Backend; -} -void onert_backend_destroy(onert::backend::Backend *backend) -{ - VERBOSE(onert_backend_create) << "'acl_neon' unloaded\n"; - delete backend; -} +onert::backend::Backend *onert_backend_create() { return new onert::backend::acl_neon::Backend; } + +void onert_backend_destroy(onert::backend::Backend *backend) { delete backend; } } diff --git a/runtime/onert/backend/cpu/Backend.h b/runtime/onert/backend/cpu/Backend.h index fc8574b..0b416a7 100644 --- a/runtime/onert/backend/cpu/Backend.h +++ b/runtime/onert/backend/cpu/Backend.h @@ -54,8 +54,6 @@ public: context->constant_initializer = std::make_shared(operands, tr); context->kernel_gen = std::make_shared(operands, operations, tb, tr, kb, context->external_context()); - context->tensor_register = nullptr; - context->optimizer = nullptr; return context; } diff --git a/runtime/onert/backend/cpu/BackendContext.cc b/runtime/onert/backend/cpu/BackendContext.cc new file mode 100644 index 0000000..6b958c1 --- /dev/null +++ b/runtime/onert/backend/cpu/BackendContext.cc @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" +#include "backend/cpu_common/BackendContextHelpers.h" + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (auto index : operand_list()) + { + if (model_io.contains(index)) + continue; + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = [&]() { + if (obj.getUses().size() == 0) + return ir::Layout::UNKNOWN; + auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses? + for (auto &operation_info : operation_list()) + { + if (operation_info.index == use_op_ind) + return operation_info.layout; + } + return ir::Layout::UNKNOWN; + }(); + const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement(); + if (permute_factor.backend() != backend()) + continue; + const auto backend_layout = permute_factor.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + cpu_common::planTensors(*this, order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return ret; +} + +} // namespace cpu +} // namespace backend +} // namespace onert diff --git 
a/runtime/onert/backend/cpu/BackendContext.h b/runtime/onert/backend/cpu/BackendContext.h index e90b210..0a4106d 100644 --- a/runtime/onert/backend/cpu/BackendContext.h +++ b/runtime/onert/backend/cpu/BackendContext.h @@ -18,6 +18,9 @@ #define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ #include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" #include "ExternalContext.h" namespace onert @@ -32,21 +35,35 @@ class BackendContext : public onert::backend::BackendContext public: BackendContext(const Backend *backend, const ir::Graph *graph, std::shared_ptr tensor_registry = nullptr, - std::shared_ptr tensor_builder = nullptr, - std::shared_ptr constant_initializer = nullptr, - std::shared_ptr kernel_gen = nullptr, - std::shared_ptr tensor_register = nullptr, - std::shared_ptr optimizer = nullptr) - : onert::backend::BackendContext(backend, graph, tensor_registry, tensor_builder, - constant_initializer, kernel_gen, tensor_register, - optimizer), - _external_context(new ExternalContext) + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen}, _external_context(new ExternalContext) { } + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + std::shared_ptr external_context() { return _external_context; } private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + // TODO Make it private + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + +private: // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, // the thread pool is also created in duplicate // TODO Create one ruy context for session diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h index c016c83..d7858c0 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.h +++ b/runtime/onert/backend/cpu/ConstantInitializer.h @@ -14,13 +14,10 @@ * limitations under the License. 
*/ -#ifndef __ONERT_COMPILER_CPU_CONSTANT_INITIALIZER_H__ -#define __ONERT_COMPILER_CPU_CONSTANT_INITIALIZER_H__ +#ifndef __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__ -#include "backend/cpu_common/TensorRegistry.h" - -#include -#include +#include namespace onert { @@ -29,35 +26,10 @@ namespace backend namespace cpu { -class ConstantInitializer : public IConstantInitializer -{ -public: - ConstantInitializer(const ir::Operands &operands, - const std::shared_ptr &tensor_reg); - -public: - void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; - - // TODO: For now the only cpu backend supports constant tensor to use data from external - // If the other backend supports (to do this, - // ExternalTensor should be abstract such as IExternal, maybe), - // this can be an interface of IConstantInitializer - void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); - -public: - void visit(const ir::operation::Conv2D &) override; - void visit(const ir::operation::DepthwiseConv2D &) override; - void visit(const ir::operation::FullyConnected &) override; - -private: - std::shared_ptr tensor_registry() const override { return _tensor_reg; } - -private: - std::shared_ptr _tensor_reg; -}; +using ConstantInitializer = cpu_common::ConstantInitializer; } // namespace cpu } // namespace backend } // namespace onert -#endif // __ONERT_COMPILER_CPU_CONSTANT_INITIALIZER_H__ +#endif // __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h index 32e249f..f5d11f4 100644 --- a/runtime/onert/backend/cpu/ExternalContext.h +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -17,7 +17,6 @@ #ifndef __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__ #define __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__ -#include #include #include @@ -33,7 +32,7 @@ namespace backend namespace cpu { -class ExternalContext : public IExternalContext +class ExternalContext { public: ExternalContext() : _ruy_context(new ruy::Context) diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 451815b..25756ec 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -23,6 +23,7 @@ #include "ops/CompareLayer.h" #include "ops/ConcatLayer.h" #include "ops/ConvolutionLayer.h" +#include "ops/DepthToSpaceLayer.h" #include "ops/DepthwiseConvolutionLayer.h" #include "ops/EinsumLayer.h" #include "ops/ElementwiseActivationLayer.h" @@ -108,12 +109,16 @@ convertElementwiseActivationType(ir::operation::ElementwiseActivation::Type type { switch (type_ir) { + case ir::operation::ElementwiseActivation::Type::ELU: + return ops::ElementwiseActivationType::kElu; case ir::operation::ElementwiseActivation::Type::LOGISTIC: return ops::ElementwiseActivationType::kLogistic; case ir::operation::ElementwiseActivation::Type::RELU: return ops::ElementwiseActivationType::kReLU; case ir::operation::ElementwiseActivation::Type::TANH: return ops::ElementwiseActivationType::kTanh; + case ir::operation::ElementwiseActivation::Type::LEAKY_RELU: + return ops::ElementwiseActivationType::kLeakyReLU; default: throw std::runtime_error("cpu KernelGenerator : Not supported operation yet"); } @@ -124,6 +129,8 @@ convertElementwiseBinaryType(ir::operation::ElementwiseBinary::ElementwiseBinary { switch (type_ir) { + case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND: + 
return ops::ElementwiseBinaryType::kLogicalAnd; case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR: return ops::ElementwiseBinaryType::kLogicalOr; case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX: @@ -167,6 +174,10 @@ ops::ElementwiseUnaryType convertElementwiseUnaryType(ir::operation::Elementwise return ops::ElementwiseUnaryType::kRSqrt; case ir::operation::ElementwiseUnary::Type::SIN: return ops::ElementwiseUnaryType::kSin; + case ir::operation::ElementwiseUnary::Type::SQRT: + return ops::ElementwiseUnaryType::kSqrt; + case ir::operation::ElementwiseUnary::Type::SQUARE: + return ops::ElementwiseUnaryType::kSquare; case ir::operation::ElementwiseUnary::Type::ZEROS_LIKE: return ops::ElementwiseUnaryType::kZerosLike; default: @@ -217,7 +228,7 @@ KernelGenerator::KernelGenerator( const std::shared_ptr &external_context) : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder), - _current_op_seq_layout(ir::Layout::UNKNOWN), _external_context(external_context) + _current_layout(ir::Layout::UNKNOWN), _external_context(external_context) { // DO NOTHING } @@ -260,7 +271,7 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); } - _current_op_seq_layout = op_seq.getLayout(); + _current_layout = op_seq.getLayout(); for (const auto &operation_idx : op_seq.operations()) { const auto &node = _operations_ctx.at(operation_idx); @@ -314,8 +325,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) _return_fn = std::move(fn); return; } - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -342,8 +353,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; const auto stride = node.param().stride; - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
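// Editor's note (illustrative sketch, not part of the upstream patch): as the comment above
// says, the depthwise kernel is laid out as [1, kernel_height, kernel_width, depth_out], and
// the channel multiplier from node.param() ties the depths together, roughly
// depth_out == depth_in * multiplier (with depth_in taken from ifm_shape). A hypothetical
// helper stating that invariant could look like:
//
//   inline bool depthwiseDepthsConsistent(int32_t depth_in, uint32_t multiplier, int32_t depth_out)
//   {
//     // depth_out of the kernel must equal input depth times the channel multiplier
//     return depth_out == depth_in * static_cast<int32_t>(multiplier);
//   }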
const auto &ker_shape = _ctx.at(ker_index).shape(); const auto ker_height = ker_shape.dim(1); @@ -364,7 +375,7 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, multiplier, dilation_width, - dilation_height, activation, ofm_tensor); + dilation_height, activation, ofm_tensor, _external_context); _return_fn = std::move(fn); } @@ -374,7 +385,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto ofm_index{node.getOutputs().at(0)}; const auto rank = _ctx.at(ofm_index).shape().rank(); - const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); + const auto axis = ops::getAxis(rank, node.param().axis, _current_layout); auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); @@ -418,16 +429,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) void KernelGenerator::visit(const ir::operation::Fill &node) { const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; + // SHAPE input is used for shape inference const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; auto output_tensor = _tensor_reg->getPortableTensor(output_index); - auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto value_tensor = _tensor_reg->getPortableTensor(value_index); auto fn = std::make_unique(); - fn->configure(input_tensor, value_tensor, output_tensor); + fn->configure(value_tensor, output_tensor); _return_fn = std::move(fn); } @@ -576,7 +586,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) assert(backend_layout == indices_tensor->layout()); const auto &input_shape = _ctx.at(input_index).shape(); UNUSED_RELEASE(input_shape); - assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); + assert(input_shape.rank() < 4 || _current_layout == backend_layout); const auto axis_raw = node.param().axis; const auto axis_value = (axis_raw < 0 ? 
(input_shape.rank() + axis_raw) : axis_raw); @@ -640,7 +650,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node) for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); - // TODO make sure using `_current_op_seq_layout` is correct for custom operations + // TODO make sure using `_current_layout` is correct for custom operations types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); auto in_tensor = _tensor_reg->getPortableTensor(idx); tensors.emplace_back(in_tensor); @@ -713,15 +723,14 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) { const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; + // AXIS input is used for output shape inference auto output_tensor = _tensor_reg->getPortableTensor(output_index); auto input_tensor = _tensor_reg->getPortableTensor(input_index); - auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique(); - fn->configure(input_tensor, axis_tensor, output_tensor); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -731,7 +740,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) const auto ofm_index{node.getOutputs().at(0)}; const auto rank = _ctx.at(ofm_index).shape().rank(); - const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); + const auto axis = ops::getAxis(rank, node.param().axis, _current_layout); assert(-rank <= axis && axis < rank); @@ -753,7 +762,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) const auto input_index{node.getInputs().at(0)}; const auto rank = _ctx.at(input_index).shape().rank(); - const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); + const auto axis = ops::getAxis(rank, node.param().axis, _current_layout); assert(rank == 0 || (-rank <= axis && axis < rank)); @@ -1004,11 +1013,11 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) _return_fn = std::move(fn); } -void KernelGenerator::visit(const ir::operation::ArgMax &node) +void KernelGenerator::visit(const ir::operation::ArgMinMax &node) { const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ArgMax::INPUT)}; - const auto axis_index{node.getInputs().at(ir::operation::ArgMax::AXIS)}; + const auto input_index{node.getInputs().at(ir::operation::ArgMinMax::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::AXIS)}; auto output_tensor = _tensor_reg->getPortableTensor(output_index); auto input_tensor = _tensor_reg->getPortableTensor(input_index); @@ -1016,7 +1025,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto fn = std::make_unique(); - fn->configure(input_tensor, output_tensor, axis_tensor, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis_tensor, node.param().is_arg_max); _return_fn = std::move(fn); } @@ -1029,8 +1038,8 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) const auto kh = node.param().kh; const auto kw = node.param().kw; const auto stride = node.param().stride; - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = 
_ctx.at(ofm_index).shape().asFeature(_current_layout); const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; @@ -1255,6 +1264,21 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) _return_fn = std::move(fn); } +void KernelGenerator::visit(const ir::operation::DepthToSpace &node) +{ + const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + auto block_size = node.param().block_size; + + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, block_size, output_tensor); + _return_fn = std::move(fn); +} + void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) { const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h index 5df7760..3a4cfbf 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.h +++ b/runtime/onert/backend/cpu/KernelGenerator.h @@ -23,7 +23,7 @@ #include "Tensor.h" #include -#include +#include #include #include @@ -34,7 +34,7 @@ namespace backend namespace cpu { -class KernelGenerator : public IKernelGenerator +class KernelGenerator : public cpu_common::KernelGeneratorBase { public: KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, @@ -43,59 +43,59 @@ public: const std::shared_ptr &kernel_builder, const std::shared_ptr &external_context); - using IKernelGenerator::visit; + void visit(const ir::OpSequence &) override; void visit(const ir::operation::AddN &) override; - void visit(const ir::OpSequence &) override; + void visit(const ir::operation::ArgMinMax &) override; + void visit(const ir::operation::BatchMatMul &) override; + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::BinaryArithmetic &) override; + void visit(const ir::operation::BroadcastTo &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::Concat &) override; void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::Custom &node) override; + void visit(const ir::operation::DepthToSpace &) override; void visit(const ir::operation::DepthwiseConv2D &) override; - void visit(const ir::operation::Concat &) override; - void visit(const ir::operation::Fill &) override; - void visit(const ir::operation::FullyConnected &) override; - void visit(const ir::operation::Reshape &) override; - void visit(const ir::operation::Squeeze &) override; - void visit(const ir::operation::Softmax &) override; - void visit(const ir::operation::Comparison &) override; - void visit(const ir::operation::BinaryArithmetic &) override; void visit(const ir::operation::Einsum &) override; - void visit(const ir::operation::Gather &) override; - void visit(const ir::operation::Custom &node) override; void visit(const ir::operation::ElementwiseActivation &) override; void visit(const ir::operation::ElementwiseBinary &) override; void visit(const ir::operation::ElementwiseUnary &) override; void visit(const ir::operation::ExpandDims &) override; + void visit(const ir::operation::Fill &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const 
ir::operation::FusedBatchNorm &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::LogSoftmax &) override; void visit(const ir::operation::LSTM &) override; - void visit(const ir::operation::Pad &) override; - void visit(const ir::operation::Pack &) override; - void visit(const ir::operation::Unpack &) override; + void visit(const ir::operation::MatrixBandPart &) override; void visit(const ir::operation::OneHot &) override; - void visit(const ir::operation::Transpose &) override; - void visit(const ir::operation::Reduce &) override; - void visit(const ir::operation::Select &) override; - void visit(const ir::operation::Slice &) override; - void visit(const ir::operation::StridedSlice &) override; - void visit(const ir::operation::Split &) override; - void visit(const ir::operation::Shape &) override; - void visit(const ir::operation::ResizeBilinear &node) override; - void visit(const ir::operation::Reverse &) override; - void visit(const ir::operation::ArgMax &) override; + void visit(const ir::operation::Pack &) override; + void visit(const ir::operation::Pad &) override; void visit(const ir::operation::Pool2D &) override; void visit(const ir::operation::Pow &) override; - void visit(const ir::operation::SquaredDifference &) override; - void visit(const ir::operation::Tile &) override; - void visit(const ir::operation::L2Normalization &) override; void visit(const ir::operation::Range &) override; void visit(const ir::operation::Rank &) override; - void visit(const ir::operation::MatrixBandPart &) override; - void visit(const ir::operation::BatchMatMul &) override; - void visit(const ir::operation::BatchToSpaceND &) override; - void visit(const ir::operation::BroadcastTo &) override; - void visit(const ir::operation::FusedBatchNorm &) override; - void visit(const ir::operation::LogSoftmax &) override; + void visit(const ir::operation::Reduce &) override; + void visit(const ir::operation::Reshape &) override; + void visit(const ir::operation::ResizeBilinear &node) override; + void visit(const ir::operation::Reverse &) override; + void visit(const ir::operation::Select &) override; + void visit(const ir::operation::Shape &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::Softmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; void visit(const ir::operation::SpaceToDepth &) override; - void visit(const ir::operation::StatelessRandomUniform &) override; + void visit(const ir::operation::Split &) override; void visit(const ir::operation::SplitV &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::StatelessRandomUniform &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::Tile &) override; + void visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::Unpack &) override; private: const ir::Operands &_ctx; @@ -103,7 +103,7 @@ private: std::shared_ptr _tensor_builder; std::shared_ptr _tensor_reg; std::shared_ptr _kernel_builder; - ir::Layout _current_op_seq_layout; + ir::Layout _current_layout; const std::shared_ptr _external_context; }; diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc deleted file mode 100644 index 3edac89..0000000 --- 
a/runtime/onert/backend/cpu/StaticTensorManager.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "StaticTensorManager.h" -#include "Tensor.h" - -#include - -namespace onert -{ -namespace backend -{ -namespace cpu -{ - -StaticTensorManager::StaticTensorManager(const std::shared_ptr ®, - cpu_common::DynamicTensorManager *dynamic_tensor_manager) - : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg}, - _dynamic_tensor_manager{dynamic_tensor_manager} -{ - // DO NOTHING -} - -void StaticTensorManager::allocateNonconsts(void) -{ - _nonconst_mgr->allocate(); - - for (auto &pair : _tensors->native_tensors()) - { - const auto &ind = pair.first; - auto tensor = pair.second.get(); - if (!_as_constants[ind] && !tensor->is_dynamic()) - { - auto *buffer = _nonconst_mgr->getBuffer(ind); - tensor->setBuffer(buffer); - - VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() - << "): " << static_cast(buffer) << std::endl; - } - } -} - -void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } - -void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, - const ir::OperandInfo &tensor_info, ir::Layout backend_layout, - bool as_const) -{ - assert(!_tensors->getITensor(ind)); - if (as_const) - { - auto tensor = std::make_unique(tensor_info, backend_layout); - _tensors->setNativeTensor(ind, std::move(tensor)); - } - else - { - auto tensor = std::make_unique(tensor_info, backend_layout, - _dynamic_tensor_manager->dynamic_mem_mgr().get()); - _tensors->setNativeTensor(ind, std::move(tensor)); - } - _as_constants[ind] = as_const; -} - -void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) -{ - assert(_tensors->getITensor(ind)); - - // This method is called only when a tensor has proper shape - assert(!_tensors->getITensor(ind)->is_dynamic()); - - if (!_as_constants[ind]) - _nonconst_mgr->claimPlan(ind, size); -} - -void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) -{ - assert(_tensors->getITensor(ind)); - - // This method is called only when a tensor has proper shape - assert(!_tensors->getITensor(ind)->is_dynamic()); - - if (!_as_constants[ind]) - _nonconst_mgr->releasePlan(ind); -} - -void StaticTensorManager::iterate(const std::function &fn) -{ - for (const auto &it : _tensors->native_tensors()) - fn(it.first); -} - -} // namespace cpu -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h index 2af61e4..d07f0c8 100644 --- a/runtime/onert/backend/cpu/StaticTensorManager.h +++ b/runtime/onert/backend/cpu/StaticTensorManager.h @@ -17,13 +17,7 @@ #ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ #define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ -#include "backend/IStaticTensorManager.h" -#include "backend/cpu_common/DynamicTensorManager.h" -#include 
"backend/cpu_common/MemoryManager.h" -#include "backend/cpu_common/TensorRegistry.h" -#include "backend/ITensorManager.h" -#include "ir/OperandIndexMap.h" -#include "ir/OperandInfo.h" +#include "backend/cpu_common/StaticTensorManager.h" namespace onert { @@ -32,30 +26,7 @@ namespace backend namespace cpu { -class StaticTensorManager : public backend::IStaticTensorManager -{ -public: - StaticTensorManager(const std::shared_ptr ®, - cpu_common::DynamicTensorManager *dynamic_tensor_manager); - virtual ~StaticTensorManager() = default; - - void allocateNonconsts(void); - void deallocateNonconsts(void); - - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, - ir::Layout backend_layout, bool as_const); - - void claimPlan(const ir::OperandIndex &ind, uint32_t size); - void releasePlan(const ir::OperandIndex &ind); - - void iterate(const std::function &fn); - -private: - std::unique_ptr _nonconst_mgr; - const std::shared_ptr _tensors; - ir::OperandIndexMap _as_constants; - cpu_common::DynamicTensorManager *_dynamic_tensor_manager; -}; +using StaticTensorManager = cpu_common::StaticTensorManager; } // namespace cpu } // namespace backend diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 2ad2ad0..d663c3f 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -28,92 +28,7 @@ namespace cpu { using Tensor = cpu_common::Tensor; - -/** - * @brief Class that uses data from external memory that is not managed by a backend - * instead of allocating and copying the data. ExternalTensor's data pointer points to - * an address of memory such as where memory is already allocated, or mmapped area. - * This is meaning that ExternalTensor can take all of types' ir::Data. - * To support this, assume below things no padding, always NHWC layout, - * constant tensor and not dynamic. - */ -class ExternalTensor : public Tensor -{ -public: - ExternalTensor() = delete; - virtual ~ExternalTensor(); - -public: - ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) - : Tensor(info, layout, nullptr) - { - assert(_layout == ir::Layout::NHWC); - assert(_info.isConstant()); - assert(_info.isDynamic() == false); - } - -public: - /** - * @brief set Data to be shared from external so that this ExternalTensor will not be - * allocated on CPU backend - * @param[in] data data of Operand to be set - */ - void setData(const std::shared_ptr data) - { - assert(data != nullptr); - _data = data; - // Note. Some op such as cker::Conv could take buffer as nullptr. 
- // That's why _buffer also would be used - _buffer = const_cast(_data->base()); - } - -public: - uint8_t *buffer() const override { return _buffer; } - - bool is_constant() const override { return true; } - bool is_dynamic() const override { return false; } - void set_dynamic() override - { - throw std::runtime_error("This tensor does not support changing dynamic"); - } - - void setShape(const ir::Shape &) override - { - throw std::runtime_error("This tensor does not support changing shape"); - } - - void increase_ref() override { ++_num_references; } - - void decrease_ref() override - { - assert(_data != nullptr); - assert(_num_references > 0); - --_num_references; - if (_num_references == 0) - { - _data.reset(); - _buffer = nullptr; - } - } - - /** - * @brief Reset reference count to zero and release data - */ - void reset_ref() override - { - assert(_data != nullptr); - assert(_num_references > 0); - _num_references = 0; - - _data.reset(); - _buffer = nullptr; - } - - int32_t num_references() override { return _num_references; } - -private: - std::shared_ptr _data; -}; +using ExternalTensor = cpu_common::ExternalTensor; } // namespace cpu } // namespace backend diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index 448abc2..9d8a5de 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -20,7 +20,6 @@ #include #include -#include #include #include "StaticTensorManager.h" @@ -35,7 +34,7 @@ namespace backend namespace cpu { -class TensorBuilder : public ITensorBuilder +class TensorBuilder { public: TensorBuilder(const std::shared_ptr &tensor_reg); @@ -47,18 +46,18 @@ public: * @param[in] layout Operand data layout */ void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, - ir::Layout backend_layout) override; + ir::Layout backend_layout); - void notifyFirstUse(const ir::OperandIndex &) override; - void notifyLastUse(const ir::OperandIndex &) override; + void notifyFirstUse(const ir::OperandIndex &); + void notifyLastUse(const ir::OperandIndex &); - bool isRegistered(const ir::OperandIndex &) const override; + bool isRegistered(const ir::OperandIndex &) const; - void prepare(void) override; - void allocate() override; - void postFunctionPrepare() override { /* DO NOTHING */} + void prepare(void); + void allocate(); + void postFunctionPrepare() { /* DO NOTHING */} - IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); } + IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); } private: const std::shared_ptr _tensor_reg; diff --git a/runtime/onert/backend/cpu/cpu.cc b/runtime/onert/backend/cpu/cpu.cc index 5385bb2..55538e2 100644 --- a/runtime/onert/backend/cpu/cpu.cc +++ b/runtime/onert/backend/cpu/cpu.cc @@ -16,18 +16,9 @@ #include "Backend.h" -#include - extern "C" { -onert::backend::Backend *onert_backend_create() -{ - VERBOSE(onert_backend_create) << "'cpu' loaded\n"; - return new onert::backend::cpu::Backend; -} -void onert_backend_destroy(onert::backend::Backend *backend) -{ - VERBOSE(onert_backend_create) << "'cpu' unloaded\n"; - delete backend; -} +onert::backend::Backend *onert_backend_create() { return new onert::backend::cpu::Backend; } + +void onert_backend_destroy(onert::backend::Backend *backend) { delete backend; } } diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc index 2fd284c..d5ffdef 100644 --- 
a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc @@ -79,6 +79,9 @@ void ArgMinMaxLayer::run() case ir::DataType::UINT8: TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t); break; + case ir::DataType::QUANT_INT8_ASYMM: + TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t); + break; case ir::DataType::INT32: TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int32_t); break; @@ -97,6 +100,9 @@ void ArgMinMaxLayer::run() case ir::DataType::UINT8: TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t); break; + case ir::DataType::QUANT_INT8_ASYMM: + TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t); + break; case ir::DataType::INT32: TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int64_t); break; diff --git a/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc b/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc index 7ef0237..ba96559 100644 --- a/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc +++ b/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc @@ -67,7 +67,7 @@ void BatchMatMulLayer::configure(const IPortableTensor *lhs, const IPortableTens void BatchMatMulLayer::run() { - if (_lhs->data_type() == OperandType::FLOAT32) + if ((_lhs->data_type() == OperandType::FLOAT32) && (_rhs->data_type() == OperandType::FLOAT32)) { batchMatMulFloat32(); } diff --git a/runtime/onert/backend/cpu/ops/ConcatLayer.cc b/runtime/onert/backend/cpu/ops/ConcatLayer.cc index d26ed73..edfdfc1 100644 --- a/runtime/onert/backend/cpu/ops/ConcatLayer.cc +++ b/runtime/onert/backend/cpu/ops/ConcatLayer.cc @@ -117,24 +117,26 @@ void ConcatLayer::configure(const std::vector &inputs, void ConcatLayer::run() { - if (_output->data_type() == OperandType::FLOAT32) + switch (_output->data_type()) { - concatenationGeneral(); + case OperandType::FLOAT32: + concatenationGeneral(); + break; + case OperandType::QUANT_UINT8_ASYMM: + concatenationQuant8(); + break; + case OperandType::QUANT_INT8_ASYMM: + concatenationGeneral(); + break; + case OperandType::INT32: + concatenationGeneral(); + break; + case OperandType::INT64: + concatenationGeneral(); + break; + default: + throw std::runtime_error("Concat: unsupported data type"); } - else if (_output->data_type() == OperandType::QUANT_UINT8_ASYMM) - { - concatenationQuant8(); - } - else if (_output->data_type() == OperandType::INT32) - { - concatenationGeneral(); - } - else if (_output->data_type() == OperandType::INT64) - { - concatenationGeneral(); - } - else - throw std::runtime_error("Concat: unsupported data type"); } } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc b/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc index 799e9e2..c964e38 100644 --- a/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc +++ b/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc @@ -203,8 +203,6 @@ void ConvolutionLayer::prepare() _prepare = true; } -#undef ANDROID_NN_CONV_PARAMETERS - } // namespace ops } // namespace cpu } // namespace backend diff --git a/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc new file mode 100644 index 0000000..d265d0a --- /dev/null +++ b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DepthToSpaceLayer.h" + +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +DepthToSpaceLayer::DepthToSpaceLayer() : _input(nullptr), _block_size(0), _output(nullptr) +{ + // DO NOTHING +} + +template void DepthToSpaceLayer::depthToSpace() +{ + nnfw::cker::DepthToSpace(getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _block_size); +} + +void DepthToSpaceLayer::configure(const IPortableTensor *input, const int32_t block_size, + IPortableTensor *output) +{ + _input = input; + _block_size = block_size; + _output = output; +} + +void DepthToSpaceLayer::run() +{ + switch (_input->data_type()) + { + case OperandType::FLOAT32: + depthToSpace(); + break; + case OperandType::INT32: + depthToSpace(); + break; + case OperandType::INT64: + depthToSpace(); + break; + case OperandType::QUANT_UINT8_ASYMM: + depthToSpace(); + break; + case OperandType::QUANT_INT8_ASYMM: + depthToSpace(); + break; + default: + throw std::runtime_error{"DepthToSpace: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.h b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.h new file mode 100644 index 0000000..32e0171 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_DEPTH_TO_SPACE_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_DEPTH_TO_SPACE_LAYER_H__ + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class DepthToSpaceLayer : public ::onert::exec::IFunction +{ +public: + DepthToSpaceLayer(); + + void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); + + void run() override; + +private: + template void depthToSpace(); + + const IPortableTensor *_input; + int32_t _block_size; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_DEPTH_TO_SPACE_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc index f1dc110..85553d1 100644 --- a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc +++ b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc @@ -43,11 +43,12 @@ void DepthwiseConvolutionLayer::convFloat32() op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - nnfw::cker::DepthwiseConv( + nnfw::cker::DepthwiseConv( op_params, getTensorShape(_input), reinterpret_cast(_input->buffer()), getTensorShape(_kernel), reinterpret_cast(_kernel->buffer()), getTensorShape(_bias), reinterpret_cast(_bias->buffer()), - getTensorShape(_output), reinterpret_cast(_output->buffer())); + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _external_context->ruy_context()); } void DepthwiseConvolutionLayer::convQuant8() @@ -79,11 +80,12 @@ void DepthwiseConvolutionLayer::convQuant8() op_params.quantized_activation_min = output_activation_min; op_params.quantized_activation_max = output_activation_max; - nnfw::cker::DepthwiseConv( + nnfw::cker::DepthwiseConv( op_params, getTensorShape(_input), reinterpret_cast(_input->buffer()), getTensorShape(_kernel), reinterpret_cast(_kernel->buffer()), getTensorShape(_bias), reinterpret_cast(_bias->buffer()), - getTensorShape(_output), reinterpret_cast(_output->buffer())); + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _external_context->ruy_context()); } void DepthwiseConvolutionLayer::configure( @@ -91,7 +93,8 @@ void DepthwiseConvolutionLayer::configure( const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop, const uint32_t paddingBottom, const uint32_t strideWidth, const uint32_t strideHeight, const uint32_t multiplier, const uint32_t dilationWidth, const uint32_t dilationHeight, - const ir::Activation activation, IPortableTensor *output) + const ir::Activation activation, IPortableTensor *output, + const std::shared_ptr &external_context) { _input = input; _kernel = kernel; @@ -107,6 +110,7 @@ void DepthwiseConvolutionLayer::configure( _dilationHeight = dilationHeight; _activation = activation; _output = output; + _external_context = external_context; } void DepthwiseConvolutionLayer::run() diff --git a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h index fb032ec..fe1fcc1 100644 --- a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h +++ b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h @@ -19,6 +19,7 @@ #include #include "OperationUtils.h" +#include "../ExternalContext.h" #include @@ -47,7 +48,7 @@ public: const uint32_t paddingBottom, const uint32_t strideW, const uint32_t strideH, const uint32_t 
multiplier, const uint32_t dilationWidth, const uint32_t dilationHeight, const ir::Activation activation, - IPortableTensor *output); + IPortableTensor *output, const std::shared_ptr &external_context); void run() override; @@ -71,6 +72,8 @@ private: uint32_t _dilationHeight{1}; ir::Activation _activation{ir::Activation::NONE}; + + std::shared_ptr _external_context; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc index c1d6317..3e1da5e 100644 --- a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc +++ b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc @@ -18,6 +18,8 @@ #include "OperationUtils.h" +#include +#include #include #include #include @@ -91,6 +93,19 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab switch (op_type) { + case ElementwiseActivationType::kElu: + if (input->data_type() == OperandType::FLOAT32) + { + _kernel = [](const IPortableTensor *input, IPortableTensor *output) { + nnfw::cker::ELU(getTensorShape(input), reinterpret_cast(input->buffer()), + getTensorShape(output), reinterpret_cast(output->buffer())); + }; + } + else + { + throw std::runtime_error{"ElementwiseActivationLayer(Elu): unsupported data type"}; + } + break; case ElementwiseActivationType::kLogistic: if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { @@ -160,6 +175,21 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab throw std::runtime_error{"ElementwiseActivationLayer(Logistic): unsupported data type"}; } break; + case ElementwiseActivationType::kLeakyReLU: + if (_input->data_type() == OperandType::FLOAT32) + { + _kernel = [alpha](const IPortableTensor *input, IPortableTensor *output) { + nnfw::cker::LeakyReLU(nnfw::cker::LeakyReluParams{alpha}, getTensorShape(input), + reinterpret_cast(input->buffer()), + getTensorShape(output), + reinterpret_cast(output->buffer())); + }; + } + else + { + throw std::runtime_error{"ElementwiseActivationLayer(LeakyReLU): unsupported data type"}; + } + break; default: throw std::runtime_error("ElementwiseActivationLayer: unsupported op type"); } diff --git a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.h b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.h index 3ef5800..948ab3b 100644 --- a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.h +++ b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.h @@ -32,9 +32,11 @@ namespace ops enum class ElementwiseActivationType { + kElu, kLogistic, kReLU, - kTanh + kTanh, + kLeakyReLU }; class ElementwiseActivationLayer : public ::onert::exec::IFunction diff --git a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc index ea3c1e7..1e17a08 100644 --- a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc +++ b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc @@ -18,6 +18,7 @@ #include "OperationUtils.h" +#include #include #include @@ -33,6 +34,25 @@ namespace ops namespace { template +void logicalAndGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs, + IPortableTensor *output) +{ + if (!HaveSameShapes(lhs, rhs)) + { + nnfw::cker::LogicalAndBroadcast( + getTensorShape(lhs), reinterpret_cast(lhs->buffer()), getTensorShape(rhs), + reinterpret_cast(rhs->buffer()), getTensorShape(output), + reinterpret_cast(output->buffer())); + } + else + { + nnfw::cker::LogicalAndElementwise( + getTensorShape(lhs), 
reinterpret_cast(lhs->buffer()), + reinterpret_cast(rhs->buffer()), reinterpret_cast(output->buffer())); + } +} + +template void logicalOrGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output) { @@ -88,6 +108,16 @@ void ElementwiseBinaryLayer::configure(const IPortableTensor *lhs, const IPortab switch (op_type) { + case ElementwiseBinaryType::kLogicalAnd: + if ((_lhs->data_type() == OperandType::BOOL8) && (_rhs->data_type() == OperandType::BOOL8)) + { + _kernel = logicalAndGeneric; + } + else + { + throw std::runtime_error{"LogicalOr: Unsupported data type"}; + } + break; case ElementwiseBinaryType::kLogicalOr: if ((_lhs->data_type() == OperandType::BOOL8) && (_rhs->data_type() == OperandType::BOOL8)) { diff --git a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc index 066455e..15d7f30 100644 --- a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc +++ b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc @@ -195,6 +195,18 @@ void sinFloat32(const IPortableTensor *input, IPortableTensor *output) getTensorShape(output), reinterpret_cast(output->buffer())); } +void sqrtFloat32(const IPortableTensor *input, IPortableTensor *output) +{ + nnfw::cker::Sqrt(getTensorShape(input), reinterpret_cast(input->buffer()), + getTensorShape(output), reinterpret_cast(output->buffer())); +} + +void squareFloat32(const IPortableTensor *input, IPortableTensor *output) +{ + nnfw::cker::Square(getTensorShape(input), reinterpret_cast(input->buffer()), + getTensorShape(output), reinterpret_cast(output->buffer())); +} + template void zerosLikeFloat32(const IPortableTensor *input, IPortableTensor *output) { if (!HaveSameShapes(input, output)) @@ -363,6 +375,26 @@ void ElementwiseUnaryLayer::configure(const IPortableTensor *input, IPortableTen throw std::runtime_error{"Sin: Unsupported data type"}; } break; + case ElementwiseUnaryType::kSqrt: + if ((input->data_type() == OperandType::FLOAT32)) + { + _kernel = sqrtFloat32; + } + else + { + throw std::runtime_error{"Sqrt: Unsupported data type"}; + } + break; + case ElementwiseUnaryType::kSquare: + if ((input->data_type() == OperandType::FLOAT32)) + { + _kernel = squareFloat32; + } + else + { + throw std::runtime_error{"Square: Unsupported data type"}; + } + break; case ElementwiseUnaryType::kZerosLike: if (input->data_type() == OperandType::FLOAT32) { diff --git a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.h b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.h index c1765b5..54a6fc0 100644 --- a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.h +++ b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.h @@ -46,6 +46,8 @@ enum class ElementwiseUnaryType kRound, kRSqrt, kSin, + kSqrt, + kSquare, kZerosLike }; diff --git a/runtime/onert/backend/cpu/ops/ExpandDimsLayer.cc b/runtime/onert/backend/cpu/ops/ExpandDimsLayer.cc index b545e67..5ea0ea8 100644 --- a/runtime/onert/backend/cpu/ops/ExpandDimsLayer.cc +++ b/runtime/onert/backend/cpu/ops/ExpandDimsLayer.cc @@ -25,22 +25,19 @@ namespace cpu namespace ops { -ExpandDimsLayer::ExpandDimsLayer() : _input(nullptr), _axis(nullptr), _output(nullptr) +ExpandDimsLayer::ExpandDimsLayer() : _input(nullptr), _output(nullptr) { // DO NOTHING } -void ExpandDimsLayer::configure(const IPortableTensor *input, const IPortableTensor *axis, - IPortableTensor *output) +void ExpandDimsLayer::configure(const IPortableTensor *input, IPortableTensor *output) { _input = input; - _axis = axis; _output = output; } void 
ExpandDimsLayer::run() { - // TODO use _axis to calculate shape of output when _axis is not constant size_t count = _input->total_size(); memcpy(_output->buffer(), _input->buffer(), count); } diff --git a/runtime/onert/backend/cpu/ops/ExpandDimsLayer.h b/runtime/onert/backend/cpu/ops/ExpandDimsLayer.h index b5d4938..1b7ead0 100644 --- a/runtime/onert/backend/cpu/ops/ExpandDimsLayer.h +++ b/runtime/onert/backend/cpu/ops/ExpandDimsLayer.h @@ -36,14 +36,12 @@ public: ExpandDimsLayer(); public: - void configure(const IPortableTensor *input, const IPortableTensor *axis, - IPortableTensor *output); + void configure(const IPortableTensor *input, IPortableTensor *output); void run() override; private: const IPortableTensor *_input; - const IPortableTensor *_axis; IPortableTensor *_output; }; diff --git a/runtime/onert/backend/cpu/ops/FillLayer.cc b/runtime/onert/backend/cpu/ops/FillLayer.cc index df3f8b7..5b7c179 100644 --- a/runtime/onert/backend/cpu/ops/FillLayer.cc +++ b/runtime/onert/backend/cpu/ops/FillLayer.cc @@ -29,15 +29,13 @@ namespace cpu namespace ops { -FillLayer::FillLayer() : _input(nullptr), _value(nullptr), _output(nullptr) +FillLayer::FillLayer() : _value(nullptr), _output(nullptr) { // DO NOTHING } -void FillLayer::configure(const IPortableTensor *input, const IPortableTensor *value, - IPortableTensor *output) +void FillLayer::configure(const IPortableTensor *value, IPortableTensor *output) { - _input = input; _value = value; _output = output; } @@ -47,28 +45,24 @@ void FillLayer::run() switch (_output->data_type()) { case OperandType::FLOAT32: - nnfw::cker::Fill(getTensorShape(_input), reinterpret_cast(_input->buffer()), - reinterpret_cast(_value->buffer()), + nnfw::cker::Fill(reinterpret_cast(_value->buffer()), getTensorShape(_output), reinterpret_cast(_output->buffer())); break; case OperandType::INT32: - nnfw::cker::Fill(getTensorShape(_input), reinterpret_cast(_input->buffer()), - reinterpret_cast(_value->buffer()), + nnfw::cker::Fill(reinterpret_cast(_value->buffer()), getTensorShape(_output), reinterpret_cast(_output->buffer())); break; case OperandType::INT64: - nnfw::cker::Fill(getTensorShape(_input), reinterpret_cast(_input->buffer()), - reinterpret_cast(_value->buffer()), + nnfw::cker::Fill(reinterpret_cast(_value->buffer()), getTensorShape(_output), reinterpret_cast(_output->buffer())); break; case OperandType::UINT32: - nnfw::cker::Fill( - getTensorShape(_input), reinterpret_cast(_input->buffer()), - reinterpret_cast(_value->buffer()), getTensorShape(_output), - reinterpret_cast(_output->buffer())); + nnfw::cker::Fill(reinterpret_cast(_value->buffer()), + getTensorShape(_output), + reinterpret_cast(_output->buffer())); break; default: throw std::runtime_error{"Fill: unsupported data type"}; diff --git a/runtime/onert/backend/cpu/ops/FillLayer.h b/runtime/onert/backend/cpu/ops/FillLayer.h index 1f17d6b..ce84365 100644 --- a/runtime/onert/backend/cpu/ops/FillLayer.h +++ b/runtime/onert/backend/cpu/ops/FillLayer.h @@ -35,13 +35,11 @@ class FillLayer : public ::onert::exec::IFunction public: FillLayer(); - void configure(const IPortableTensor *input, const IPortableTensor *value, - IPortableTensor *output); + void configure(const IPortableTensor *value, IPortableTensor *output); void run() override; private: - const IPortableTensor *_input; const IPortableTensor *_value; IPortableTensor *_output; }; diff --git a/runtime/onert/backend/cpu/ops/MeanLayer.cc b/runtime/onert/backend/cpu/ops/MeanLayer.cc index 4921ac7..f130692 100644 --- 
a/runtime/onert/backend/cpu/ops/MeanLayer.cc +++ b/runtime/onert/backend/cpu/ops/MeanLayer.cc @@ -36,9 +36,24 @@ MeanLayer::MeanLayer() : _input(nullptr), _axes(nullptr), _output(nullptr), _kee void MeanLayer::MeanFloat32() { - nnfw::cker::Mean(getTensorShape(_input), reinterpret_cast(_input->buffer()), - getTensorShape(_output), reinterpret_cast(_output->buffer()), - getReducerAxes(_axes)); + const auto inputShape = getTensorShape(_input); + const auto axisVec = getReducerAxes(_axes); + bool axis_is_1_and_2 = + _keep_dims && inputShape.DimensionsCount() == 4 && axisVec.size() == 2 && + ((axisVec[0] == 1 && axisVec[1] == 2) || (axisVec[0] == 2 && axisVec[1] == 1)); + + if (axis_is_1_and_2) + { + nnfw::cker::MeanAxis1And2(inputShape, reinterpret_cast(_input->buffer()), + getTensorShape(_output), + reinterpret_cast(_output->buffer())); + } + else + { + nnfw::cker::Mean(inputShape, reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + axisVec); + } } void MeanLayer::MeanQuant8() @@ -57,6 +72,10 @@ void MeanLayer::configure(const IPortableTensor *input, const IPortableTensor *a _axes = axes; _output = output; _keep_dims = keep_dims; + + if (_input->data_type() != OperandType::FLOAT32 && + _input->data_type() != OperandType::QUANT_UINT8_ASYMM) + throw std::runtime_error{"Mean: unsupported data type"}; } void MeanLayer::run() diff --git a/runtime/onert/backend/ruy/Backend.h b/runtime/onert/backend/ruy/Backend.h new file mode 100644 index 0000000..bc8a024 --- /dev/null +++ b/runtime/onert/backend/ruy/Backend.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_RUY_BACKEND_H__ +#define __ONERT_BACKEND_RUY_BACKEND_H__ + +#include "BackendContext.h" +#include "Config.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class Backend : public ::onert::backend::Backend +{ +public: + Backend() : _config{std::make_shared()} {} + + std::shared_ptr config() const override { return _config; } + + std::unique_ptr + newContext(const ir::Graph &graph, const std::shared_ptr &kb, + bool) const override + { + const auto &operands = graph.operands(); + const auto &operations = graph.operations(); + auto context = std::make_unique(this, &graph); + auto tr = std::make_shared(); + auto tb = std::make_shared(tr); + context->tensor_registry = tr; + context->tensor_builder = tb; + context->constant_initializer = std::make_shared(operands, tr); + context->kernel_gen = std::make_shared(operands, operations, tb, tr, kb, + context->external_context()); + return context; + } + +private: + std::shared_ptr _config; +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_BACKEND_H__ diff --git a/runtime/onert/backend/ruy/BackendContext.cc b/runtime/onert/backend/ruy/BackendContext.cc new file mode 100644 index 0000000..ef686f4 --- /dev/null +++ b/runtime/onert/backend/ruy/BackendContext.cc @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" +#include "backend/cpu_common/BackendContextHelpers.h" + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (auto index : operand_list()) + { + if (model_io.contains(index)) + continue; + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = [&]() { + if (obj.getUses().size() == 0) + return ir::Layout::UNKNOWN; + auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses? 
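// Editor's note (descriptive comment, not part of the upstream patch): the loop below
// resolves this operand's frontend layout from the layout recorded for the operation that
// first uses it; when operation_list() has no matching entry, the lambda falls back to
// ir::Layout::UNKNOWN before the shape is permuted to the backend layout.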
+ for (auto &operation_info : operation_list()) + { + if (operation_info.index == use_op_ind) + return operation_info.layout; + } + return ir::Layout::UNKNOWN; + }(); + const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement(); + if (permute_factor.backend() != backend()) + continue; + const auto backend_layout = permute_factor.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + cpu_common::planTensors(*this, order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return ret; +} + +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/BackendContext.h b/runtime/onert/backend/ruy/BackendContext.h new file mode 100644 index 0000000..b965c9a --- /dev/null +++ b/runtime/onert/backend/ruy/BackendContext.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_RUY_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_RUY_BACKEND_CONTEXT_H__ + +#include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" +#include "ExternalContext.h" + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr tensor_registry = nullptr, + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen}, _external_context(new ExternalContext) + { + } + + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + + std::shared_ptr external_context() { return _external_context; } + +private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + // TODO Make it private + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + +private: + // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, + // the thread pool is also created in duplicate + // TODO Create one ruy context for session + std::shared_ptr _external_context; +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/ruy/CMakeLists.txt b/runtime/onert/backend/ruy/CMakeLists.txt new file mode 100644 index 0000000..206acbf --- /dev/null +++ b/runtime/onert/backend/ruy/CMakeLists.txt @@ -0,0 +1,22 @@ +set(LIB_ONERT_BACKEND_RUY onert_backend_ruy) + +nnfw_find_package(Ruy REQUIRED) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_ONERT_BACKEND_RUY} SHARED ${SOURCES}) + +target_link_libraries(${LIB_ONERT_BACKEND_RUY} PRIVATE nnfw_lib_ruy) +target_link_libraries(${LIB_ONERT_BACKEND_RUY} PRIVATE onert_core) +target_link_libraries(${LIB_ONERT_BACKEND_RUY} PRIVATE nnfw_common) +target_link_libraries(${LIB_ONERT_BACKEND_RUY} PRIVATE nnfw_coverage) +target_link_libraries(${LIB_ONERT_BACKEND_RUY} PRIVATE ruy) + +set_target_properties(${LIB_ONERT_BACKEND_RUY} PROPERTIES OUTPUT_NAME backend_ruy) + +if(CMAKE_BUILD_TYPE_LC STREQUAL "release") + add_custom_command(TARGET ${LIB_ONERT_BACKEND_RUY} POST_BUILD + COMMAND ${CMAKE_STRIP} "--strip-unneeded" $) +endif() + +install(TARGETS ${LIB_ONERT_BACKEND_RUY} DESTINATION lib) diff --git a/runtime/onert/backend/cpu/Tensor.cc b/runtime/onert/backend/ruy/Config.cc similarity index 79% rename from runtime/onert/backend/cpu/Tensor.cc rename to runtime/onert/backend/ruy/Config.cc index dac8f89..179caa9 100644 --- a/runtime/onert/backend/cpu/Tensor.cc +++ b/runtime/onert/backend/ruy/Config.cc @@ -14,18 +14,18 @@ * limitations under the License. 
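The ruy BackendContext above owns a single ExternalContext and hands the same shared_ptr to every kernel it generates, which is why its NOTE points out that the ruy thread pool is duplicated only when multiple backend contexts exist. A simplified sketch of that ownership pattern, with stand-in types rather than the real onert classes:

#include <memory>

// Stand-in for the backend's ExternalContext (in the runtime it wraps a ruy::Context
// and therefore a thread pool).
struct ExternalContext
{
  // thread pool, scratch state, ... created once per backend context
};

// Stand-in for a generated kernel that needs the shared context at run time.
struct Kernel
{
  std::shared_ptr<ExternalContext> ctx;
};

struct BackendContextSketch
{
  std::shared_ptr<ExternalContext> external_context = std::make_shared<ExternalContext>();

  Kernel makeKernel() const
  {
    // Every kernel references the same context, so the expensive thread pool
    // behind it exists exactly once for this backend context.
    return Kernel{external_context};
  }
};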
*/ -#include "Tensor.h" +#include "Config.h" namespace onert { namespace backend { -namespace cpu +namespace ruy { -// `dynamic_cast` not working across library boundaries on NDK -// With this as a key function, `dynamic_cast` works across dl -ExternalTensor::~ExternalTensor() {} +bool Config::initialize() { return true; } + +ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; } } // namespace cpu } // namespace backend diff --git a/runtime/onert/backend/ruy/Config.h b/runtime/onert/backend/ruy/Config.h new file mode 100644 index 0000000..9160dd5 --- /dev/null +++ b/runtime/onert/backend/ruy/Config.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_RUY_CONFIG_H__ +#define __ONERT_BACKEND_RUY_CONFIG_H__ + +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class Config : public IConfig +{ +public: + std::string id() override { return "ruy"; } + bool initialize() override; + ir::Layout supportLayout(const ir::Operation &node, ir::Layout frontend_layout) override; + bool supportPermutation() override { return true; } + bool supportDynamicTensor() override { return true; } + bool supportFP16() override { return false; } + + std::unique_ptr timer() override { return std::make_unique(); } +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_CONFIG_H__ diff --git a/runtime/onert/backend/ruy/ConstantInitializer.h b/runtime/onert/backend/ruy/ConstantInitializer.h new file mode 100644 index 0000000..24b4d92 --- /dev/null +++ b/runtime/onert/backend/ruy/ConstantInitializer.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__ + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +using ConstantInitializer = cpu_common::ConstantInitializer; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/backend/ruy/ExternalContext.h b/runtime/onert/backend/ruy/ExternalContext.h new file mode 100644 index 0000000..f51facc --- /dev/null +++ b/runtime/onert/backend/ruy/ExternalContext.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_RUY_EXTERNAL_CONTEXT_H__ +#define __ONERT_BACKEND_RUY_EXTERNAL_CONTEXT_H__ + +#include +#include + +namespace +{ +const int kDefaultNumThreadpoolThreads = 4; +} + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class ExternalContext +{ +public: + ExternalContext() : _ruy_context(new ::ruy::Context) + { + setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); + } + + void setMaxNumThreads(int max_num_threads) + { + const int target_num_threads = + max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; + _ruy_context->set_max_num_threads(target_num_threads); + } + + ::ruy::Context *ruy_context() const { return _ruy_context.get(); } + +private: + const std::unique_ptr<::ruy::Context> _ruy_context; +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_EXTERNAL_CONTEXT_H__ diff --git a/runtime/onert/backend/ruy/KernelGenerator.cc b/runtime/onert/backend/ruy/KernelGenerator.cc new file mode 100644 index 0000000..cd28250 --- /dev/null +++ b/runtime/onert/backend/ruy/KernelGenerator.cc @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
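The ruy ExternalContext above reads RUY_THREADS from the environment-backed config and falls back to a default of 4 threads when the configured value is negative. A self-contained sketch of that resolution rule, with the config lookup replaced by a plain parameter:

// Resolve the ruy thread-pool size: any value > -1 is taken as-is,
// otherwise fall back to the compiled-in default (4 in the code above).
int resolveMaxNumThreads(int configured, int default_threads = 4)
{
  return configured > -1 ? configured : default_threads;
}

// resolveMaxNumThreads(-1) -> 4   (unset, use the default)
// resolveMaxNumThreads(0)  -> 0
// resolveMaxNumThreads(8)  -> 8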
+ */ + +#include "KernelGenerator.h" + +#include "ops/ConvolutionLayer.h" +#include "ops/FullyConnectedLayer.h" + +#include +#include +#include +#include +#include +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +KernelGenerator::KernelGenerator( + const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, + const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &kernel_builder, + const std::shared_ptr &external_context) + : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder), + _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder), + _current_layout(ir::Layout::UNKNOWN), _external_context(external_context) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const ir::OpSequence &op_seq) +{ + assert(!_return_fn_seq); + assert(_tensor_builder->dynamicTensorManager()); + assert(_tensor_reg); + + auto dyn_shape_inferer = std::make_shared(_ctx, _tensor_reg); + + _return_fn_seq = std::make_unique(); + + // Prepare to handle dynamic tensors later + auto dyn_ctx = std::make_shared(); + { + dyn_ctx->op_seq = &op_seq; + dyn_ctx->operations = &_operations_ctx; + dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); + dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager(); + + _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); + } + + _current_layout = op_seq.getLayout(); + for (const auto &operation_idx : op_seq.operations()) + { + const auto &node = _operations_ctx.at(operation_idx); + node.accept(*this); + _return_fn_seq->append(releaseFunction()); + + for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs()) + { + auto portable_tensor = _tensor_reg->getPortableTensor(ind); + if (portable_tensor) + { + assert(portable_tensor->layout() == ir::Layout::NHWC); + } + + auto tensor = _tensor_reg->getNativeTensor(ind); + if (tensor) + { + tensor->increase_ref(); + } + } + } +} + +void KernelGenerator::visit(const ir::operation::Conv2D &node) +{ + using ir::operation::Conv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); + + const auto stride = node.param().stride; + const auto activation = node.param().activation; + const auto param_padding = node.param().padding; + const auto dilation = node.param().dilation; + auto fn = std::make_unique(); + + if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) + { + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, + param_padding.param.right, param_padding.param.top, param_padding.param.bottom, + stride.horizontal, stride.vertical, dilation.width_factor, dilation.height_factor, + activation, ofm_tensor, _external_context); + + _return_fn = std::move(fn); + return; + } + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto padding = + ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height, + dilation.width_factor, dilation.height_factor); + + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + dilation.width_factor, dilation.height_factor, activation, ofm_tensor, + _external_context); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + const auto activation = node.param().activation; + const auto weights_format = node.param().weights_format; + + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto weight_tensor = _tensor_reg->getPortableTensor(weight_index); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, weight_tensor, bias_tensor, activation, weights_format, output_tensor, + _external_context); + + _return_fn = std::move(fn); +} + +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/KernelGenerator.h b/runtime/onert/backend/ruy/KernelGenerator.h new file mode 100644 index 0000000..0f6bd59 --- /dev/null +++ b/runtime/onert/backend/ruy/KernelGenerator.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
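In the static-shape branch of the Conv2D visit above, ir::calculatePadding turns a SAME/VALID padding spec plus input/output shapes, stride and dilation into explicit left/right/top/bottom values before the kernel is configured. A rough one-dimensional version of the usual TensorFlow-style SAME computation is sketched below for illustration only; onert's own helper remains the authority on edge cases:

#include <algorithm>
#include <cstdint>

struct Padding1D
{
  int32_t before;
  int32_t after;
};

// SAME padding along one spatial dimension, taking dilation into account.
Padding1D samePadding(int32_t in, int32_t stride, int32_t kernel, int32_t dilation)
{
  const int32_t effective_kernel = (kernel - 1) * dilation + 1;
  const int32_t out = (in + stride - 1) / stride; // ceil(in / stride)
  const int32_t needed = std::max<int32_t>(0, (out - 1) * stride + effective_kernel - in);
  return {needed / 2, needed - needed / 2}; // the extra pixel, if any, goes after
}

// Example: in=224, stride=2, kernel=3, dilation=1 -> effective kernel 3,
// out=112, needed=1, padding {0, 1}.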
+ */ + +#ifndef __ONERT_BACKEND_RUY_KERNEL_GENERATOR_H__ +#define __ONERT_BACKEND_RUY_KERNEL_GENERATOR_H__ + +#include "ExternalContext.h" +#include "TensorBuilder.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "Tensor.h" + +#include +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class KernelGenerator : public cpu_common::KernelGeneratorBase +{ +public: + KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, + const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &kernel_builder, + const std::shared_ptr &external_context); + + void visit(const ir::OpSequence &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::FullyConnected &) override; + +private: + const ir::Operands &_ctx; + const ir::Operations &_operations_ctx; + std::shared_ptr _tensor_builder; + std::shared_ptr _tensor_reg; + std::shared_ptr _kernel_builder; + ir::Layout _current_layout; + const std::shared_ptr _external_context; +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_KERNEL_GENERATOR_H__ diff --git a/runtime/onert/backend/ruy/StaticTensorManager.h b/runtime/onert/backend/ruy/StaticTensorManager.h new file mode 100644 index 0000000..af2d252 --- /dev/null +++ b/runtime/onert/backend/ruy/StaticTensorManager.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_RUY_STATICTENSOR_MANAGER_H__ +#define __ONERT_BACKEND_RUY_STATICTENSOR_MANAGER_H__ + +#include "backend/cpu_common/StaticTensorManager.h" + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +using StaticTensorManager = cpu_common::StaticTensorManager; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_STATICTENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/ruy/Tensor.h b/runtime/onert/backend/ruy/Tensor.h new file mode 100644 index 0000000..60d0fbf --- /dev/null +++ b/runtime/onert/backend/ruy/Tensor.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_RUY_TENSOR_H__ +#define __ONERT_BACKEND_RUY_TENSOR_H__ + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +using Tensor = cpu_common::Tensor; +using ExternalTensor = cpu_common::ExternalTensor; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_TENSOR_H__ diff --git a/runtime/onert/backend/ruy/TensorBuilder.cc b/runtime/onert/backend/ruy/TensorBuilder.cc new file mode 100644 index 0000000..c77defc --- /dev/null +++ b/runtime/onert/backend/ruy/TensorBuilder.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TensorBuilder.h" + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +TensorBuilder::TensorBuilder(const std::shared_ptr &tensor_reg) + : _tensor_reg{tensor_reg}, + _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}, + _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())} +{ + /* empty */ +} + +void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout layout) +{ + _tensor_info_map.emplace(ind, info); + + // CPU backend supports only one layout as NHWC + assert(layout == ir::Layout::NHWC); + if (info.isDynamic()) + { + _dynamic_tensor_mgr->buildTensor(ind, info, layout); + } + else + { + _static_tensor_mgr->buildTensor(ind, info, layout, info.isConstant()); + } +} + +void TensorBuilder::notifyFirstUse(const ir::OperandIndex &ind) +{ + assert(_tensor_info_map.find(ind) != _tensor_info_map.end()); + const auto tensor_info = _tensor_info_map.at(ind); + + if (!_tensor_reg->getNativeTensor(ind)->is_dynamic()) + { + const auto size = tensor_info.total_size(); + _static_tensor_mgr->claimPlan(ind, size); + } +} + +void TensorBuilder::notifyLastUse(const ir::OperandIndex &ind) +{ + if (!_tensor_reg->getNativeTensor(ind)->is_dynamic()) + { + _static_tensor_mgr->releasePlan(ind); + } +} + +bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const +{ + return _tensor_info_map.find(ind) != _tensor_info_map.end(); +} + +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } + +void TensorBuilder::allocate() +{ + // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate + // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. +} + +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/TensorBuilder.h b/runtime/onert/backend/ruy/TensorBuilder.h new file mode 100644 index 0000000..91c07bd --- /dev/null +++ b/runtime/onert/backend/ruy/TensorBuilder.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
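The TensorBuilder above splits tensors into static ones, planned through claimPlan/releasePlan on first and last use, and dynamic ones, allocated at run time. The claim/release protocol is what lets the static memory planner size its arena and reuse buffers. A compact stand-alone model of that bookkeeping follows; real planners also assign offsets, while this toy only tracks the high-water mark:

#include <cstddef>
#include <unordered_map>

// Tracks live static tensors and the peak of their combined size,
// mimicking notifyFirstUse -> claimPlan and notifyLastUse -> releasePlan.
class ToyStaticPlanner
{
public:
  void claim(int index, std::size_t size)
  {
    _live[index] = size;
    _current += size;
    if (_current > _peak)
      _peak = _current;
  }

  void release(int index)
  {
    auto it = _live.find(index);
    if (it == _live.end())
      return;
    _current -= it->second;
    _live.erase(it);
  }

  std::size_t peak() const { return _peak; } // memory an arena allocator would need

private:
  std::unordered_map<int, std::size_t> _live;
  std::size_t _current = 0;
  std::size_t _peak = 0;
};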
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_RUY_TENSOR_BUILDER_H__ +#define __ONERT_BACKEND_RUY_TENSOR_BUILDER_H__ + +#include +#include + +#include + +#include "StaticTensorManager.h" +#include "Tensor.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ + +class TensorBuilder +{ +public: + TensorBuilder(const std::shared_ptr &tensor_reg); + + /** + * @brief Register tensor information to allocate on CPU backend + * @param[in] ind Operand index + * @param[in] info Operand information + * @param[in] layout Operand data layout + */ + void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout backend_layout); + + void notifyFirstUse(const ir::OperandIndex &); + void notifyLastUse(const ir::OperandIndex &); + + bool isRegistered(const ir::OperandIndex &) const; + + void prepare(void); + void allocate(); + void postFunctionPrepare() { /* DO NOTHING */} + + IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); } + +private: + const std::shared_ptr _tensor_reg; + std::unique_ptr _dynamic_tensor_mgr; + std::unique_ptr _static_tensor_mgr; + ir::OperandIndexMap _tensor_info_map; +}; + +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_TENSOR_BUILDER_H__ diff --git a/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc b/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc new file mode 100644 index 0000000..d249b2c --- /dev/null +++ b/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConvolutionLayer.h" + +#include "../Tensor.h" +#include "ir/Padding.h" + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ +ConvolutionLayer::ConvolutionLayer() + : _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr), + _paddingType(ir::PaddingType::EXPLICIT), _paddingLeft(0), _paddingTop(0), _paddingRight(0), + _paddingBottom(0), _strideWidth(0), _strideHeight(0), _dilationWidthFactor(1), + _dilationHeightFactor(1), _activation(ir::Activation::NONE), + _conv_kernel(new nnfw::ruy::Conv()), _prepare(false) +{ + // DO NOTHING +} + +ConvolutionLayer::~ConvolutionLayer() = default; + +void ConvolutionLayer::convFloat32() +{ + float output_activation_min = 0, output_activation_max = 0; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + nnfw::ruy::ConvParams op_params; + op_params.padding_type = getPaddingType(_paddingType); + op_params.padding_values.width = _paddingLeft; + op_params.padding_values.height = _paddingTop; + op_params.stride_width = _strideWidth; + op_params.stride_height = _strideHeight; + op_params.dilation_width_factor = _dilationWidthFactor; + op_params.dilation_height_factor = _dilationHeightFactor; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + nnfw::ruy::Conv &kernel = *_conv_kernel; + kernel(op_params, getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_kernel), reinterpret_cast(_kernel->buffer()), + getTensorShape(_bias), reinterpret_cast(_bias->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _external_context->ruy_context()); +} + +void ConvolutionLayer::configure(const IPortableTensor *input, const IPortableTensor *kernel, + const IPortableTensor *bias, const ir::PaddingType paddingType, + const uint32_t paddingLeft, const uint32_t paddingRight, + const uint32_t paddingTop, const uint32_t paddingBottom, + const uint32_t strideWidth, const uint32_t strideHeight, + const uint32_t dilationWidthFactor, + const uint32_t dilationHeightFactor, + const ir::Activation activation, IPortableTensor *output, + const std::shared_ptr &external_context) +{ + _input = input; + _kernel = kernel; + _bias = bias; + _paddingType = paddingType; + _paddingLeft = paddingLeft; + _paddingRight = paddingRight; + _paddingTop = paddingTop; + _paddingBottom = paddingBottom; + _strideWidth = strideWidth; + _strideHeight = strideHeight; + _dilationWidthFactor = dilationWidthFactor; + _dilationHeightFactor = dilationHeightFactor; + _activation = activation; + _output = output; + _external_context = external_context; +} + +void ConvolutionLayer::run() +{ + prepare(); + + if (_input->is_dynamic() || _kernel->is_dynamic()) + { + const auto ifm_shape = _input->getShape().asFeature(_input->layout()); + const auto ofm_shape = _output->getShape().asFeature(_input->layout()); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
+ const auto ker_shape = _kernel->getShape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + ir::Stride stride; + stride.vertical = _strideWidth; + stride.horizontal = _strideWidth; + + ir::Padding param_padding; + param_padding.type = _paddingType; + param_padding.param.left = _paddingLeft; + param_padding.param.right = _paddingRight; + param_padding.param.top = _paddingTop; + param_padding.param.bottom = _paddingBottom; + + const auto padding = + ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height, + _dilationWidthFactor, _dilationHeightFactor); + + _paddingLeft = padding.left; + _paddingRight = padding.right; + _paddingTop = padding.top; + _paddingBottom = padding.bottom; + } + if (_input->data_type() == OperandType::FLOAT32) + { + convFloat32(); + } + else + { + throw std::runtime_error{"Conv: unsupported data type"}; + } +} + +void ConvolutionLayer::prepare() +{ + if (_prepare) + return; + + nnfw::ruy::Conv &kernel = *_conv_kernel; + if (_input->data_type() == OperandType::FLOAT32 && _kernel->is_constant()) + { + kernel.prepare(getTensorShape(_input), getTensorShape(_kernel), getTensorShape(_output), + _strideWidth, _strideHeight, _dilationWidthFactor, _dilationHeightFactor); + } + _prepare = true; +} + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/ops/ConvolutionLayer.h b/runtime/onert/backend/ruy/ops/ConvolutionLayer.h new file mode 100644 index 0000000..a55387b --- /dev/null +++ b/runtime/onert/backend/ruy/ops/ConvolutionLayer.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
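ConvolutionLayer::prepare() above is safe to call on every run() because it is guarded by the _prepare flag: the costly ruy packing of a constant kernel happens only once, while dynamic shapes simply recompute their padding on each run. The guard itself is a very small pattern, sketched here on its own:

// Prepare-once guard as used by the convolution kernel: run() may call
// prepare() unconditionally, but the expensive work happens a single time.
class PreparedOnce
{
public:
  void prepare()
  {
    if (_prepared)
      return;
    // ... pack constant weights, precompute kernel metadata, etc. ...
    _prepared = true;
  }

  void run()
  {
    prepare(); // cheap after the first call
    // ... execute the kernel ...
  }

private:
  bool _prepared = false;
};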
+ */ + +#ifndef __ONERT_BACKEND_RUY_OPS_CONVOLUTIONLAYER_H__ +#define __ONERT_BACKEND_RUY_OPS_CONVOLUTIONLAYER_H__ + +#include +#include "../ExternalContext.h" +#include "OperationUtils.h" + +#include +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ + +class ConvolutionLayer : public ::onert::exec::IFunction +{ +public: + ConvolutionLayer(); + ~ConvolutionLayer(); + +public: + void convFloat32(); + + void configure(const IPortableTensor *input, const IPortableTensor *kernel, + const IPortableTensor *bias, ir::PaddingType _paddingType, + const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop, + const uint32_t paddingBottom, const uint32_t strideWidth, + const uint32_t strideHeight, const uint32_t dilationWidthFactor, + const uint32_t dilationHeightFactor, const ir::Activation activation, + IPortableTensor *output, const std::shared_ptr &external_context); + + void run() override; + + void prepare() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_kernel; + const IPortableTensor *_bias; + IPortableTensor *_output; + + ir::PaddingType _paddingType; + uint32_t _paddingLeft; + uint32_t _paddingTop; + uint32_t _paddingRight; + uint32_t _paddingBottom; + + uint32_t _strideWidth; + uint32_t _strideHeight; + uint32_t _dilationWidthFactor; + uint32_t _dilationHeightFactor; + + ir::Activation _activation; + + std::unique_ptr _conv_kernel; + + bool _prepare; + + std::shared_ptr _external_context; +}; + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_OPS_CONVOLUTIONLAYER_H__ diff --git a/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc new file mode 100644 index 0000000..af693e3 --- /dev/null +++ b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "FullyConnectedLayer.h" + +#include "../Tensor.h" +#include +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ + +FullyConnectedLayer::FullyConnectedLayer() + : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), + _activation(ir::Activation::NONE), _external_context(nullptr) +{ + // DO NOTHING +} + +FullyConnectedLayer::~FullyConnectedLayer() = default; + +void FullyConnectedLayer::fullyConnectedFloat32() +{ + float output_activation_min = 0, output_activation_max = 0; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + nnfw::ruy::FullyConnectedParams op_params; + + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + op_params.activation = convertActivationType(_activation); + op_params.lhs_cacheable = _weights->is_constant(); + op_params.rhs_cacheable = _input->is_constant(); + + nnfw::ruy::FullyConnected( + op_params, getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_weights), reinterpret_cast(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _external_context->ruy_context()); +} + +void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, + const IPortableTensor *bias, ir::Activation activation, + ir::FullyConnectedWeightsFormat weights_format, + IPortableTensor *output, + const std::shared_ptr &external_context) +{ + UNUSED_RELEASE(weights_format); + _input = input; + _weights = weights; + _bias = bias; + _activation = activation; + _output = output; + _external_context = external_context; +} + +void FullyConnectedLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + fullyConnectedFloat32(); + } + else + { + throw std::runtime_error{"FullyConnected: unsupported data type"}; + } +} + +void FullyConnectedLayer::prepare() +{ + if (_bias && _bias->is_constant()) + { + const int bias_size = getTensorShape(_bias).FlatSize(); + if (nnfw::ruy::IsZeroVector(reinterpret_cast(_bias->buffer()), bias_size)) + { + _bias = nullptr; + } + } +} + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/ops/FullyConnectedLayer.h b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.h new file mode 100644 index 0000000..33d560f --- /dev/null +++ b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
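FullyConnectedLayer::prepare() above drops a constant bias that is entirely zero so the ruy GEMM can skip the bias-add. The check is just a scan over the flattened buffer; a stand-alone equivalent of that test, written here only to make the optimization concrete:

#include <cstddef>

// True when every element of a float buffer is exactly 0.0f, mirroring the
// IsZeroVector() test used to elide an all-zero constant bias.
bool isZeroVector(const float *data, std::size_t size)
{
  if (data == nullptr)
    return true;
  for (std::size_t i = 0; i < size; ++i)
  {
    if (data[i] != 0.0f)
      return false;
  }
  return true;
}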
+ */ + +#ifndef __ONERT_BACKEND_RUY_OPS_FULLYCONNECTEDLAYER_H__ +#define __ONERT_BACKEND_RUY_OPS_FULLYCONNECTEDLAYER_H__ + +#include +#include "../ExternalContext.h" +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ + +class FullyConnectedLayer : public ::onert::exec::IFunction +{ +public: + FullyConnectedLayer(); + ~FullyConnectedLayer(); + +public: + void fullyConnectedFloat32(); + + void configure(const IPortableTensor *input, const IPortableTensor *weights, + const IPortableTensor *bias, ir::Activation activation, + ir::FullyConnectedWeightsFormat weights_format, IPortableTensor *output, + const std::shared_ptr &external_context); + + void run() override; + + void prepare() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_weights; + const IPortableTensor *_bias; + IPortableTensor *_output; + + ir::Activation _activation; + + std::shared_ptr _external_context; +}; + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_OPS_FULLYCONNECTEDLAYER_H__ diff --git a/runtime/onert/backend/ruy/ops/OperationUtils.cc b/runtime/onert/backend/ruy/ops/OperationUtils.cc new file mode 100644 index 0000000..929107b --- /dev/null +++ b/runtime/onert/backend/ruy/ops/OperationUtils.cc @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OperationUtils.h" + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ + +nnfw::ruy::PaddingType getPaddingType(ir::PaddingType ir_padding_type) +{ + switch (ir_padding_type) + { + case ir::PaddingType::EXPLICIT: + return nnfw::ruy::PaddingType::kNone; + case ir::PaddingType::SAME: + return nnfw::ruy::PaddingType::kSame; + case ir::PaddingType::VALID: + return nnfw::ruy::PaddingType::kValid; + default: + throw std::runtime_error("Wrong padding type."); + break; + } +} + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/ruy/ops/OperationUtils.h b/runtime/onert/backend/ruy/ops/OperationUtils.h new file mode 100644 index 0000000..5dfdc7e --- /dev/null +++ b/runtime/onert/backend/ruy/ops/OperationUtils.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_RUY_OPS_OPERATION_UTILS_H__ +#define __ONERT_BACKEND_RUY_OPS_OPERATION_UTILS_H__ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using OperandType = onert::ir::DataType; + +namespace onert +{ +namespace backend +{ +namespace ruy +{ +namespace ops +{ + +inline nnfw::ruy::Shape getTensorShape(const IPortableTensor *tensor) +{ + if (tensor == nullptr) + return nnfw::ruy::Shape(); + + const ir::Shape &shape = tensor->get_info().shape(); + + assert(tensor->layout() == ir::Layout::NHWC); + + auto rank = shape.rank(); + nnfw::ruy::Shape ret(rank); + auto data = ret.DimsData(); + for (int i = 0; i < rank; ++i) + { + data[i] = shape.dim(i); + } + return ret; +} + +inline nnfw::ruy::FusedActivationFunctionType convertActivationType(const ir::Activation activation) +{ + switch (activation) + { + case ir::Activation::NONE: + return nnfw::ruy::FusedActivationFunctionType::kNone; + case ir::Activation::RELU: + return nnfw::ruy::FusedActivationFunctionType::kRelu; + case ir::Activation::RELU1: + return nnfw::ruy::FusedActivationFunctionType::kRelu1; + case ir::Activation::RELU6: + return nnfw::ruy::FusedActivationFunctionType::kRelu6; + case ir::Activation::TANH: + return nnfw::ruy::FusedActivationFunctionType::kTanh; + case ir::Activation::SIGMOID: + return nnfw::ruy::FusedActivationFunctionType::kSigmoid; + default: + throw std::runtime_error{"RUY backend: Cannot convert activation type"}; + } +} + +template +void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) +{ + if (activation == ir::Activation::RELU) + { + *activation_min = 0; + *activation_max = std::numeric_limits::max(); + } + else if (activation == ir::Activation::RELU6) + { + *activation_min = 0; + *activation_max = 6; + } + else if (activation == ir::Activation::RELU1) + { + *activation_min = -1; + *activation_max = 1; + } + else if (activation == ir::Activation::SIGMOID) + { + *activation_min = 0; + *activation_max = 1; + } + else if (activation == ir::Activation::NONE) + { + *activation_min = std::numeric_limits::lowest(); + *activation_max = std::numeric_limits::max(); + } + else + { + std::cout << "Unsupported fused activation function." << std::endl; + } +} + +nnfw::ruy::PaddingType getPaddingType(ir::PaddingType ir_padding_type); + +} // namespace ops +} // namespace ruy +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_RUY_OPS_OPERATION_UTILS_H__ diff --git a/runtime/onert/backend/ruy/ruy.cc b/runtime/onert/backend/ruy/ruy.cc new file mode 100644 index 0000000..4f33590 --- /dev/null +++ b/runtime/onert/backend/ruy/ruy.cc @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
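OperationUtils.h above converts a fused activation into a [min, max] clamp range (RELU gives [0, +max], RELU6 gives [0, 6], NONE gives the full float range, and so on), and kernels then clamp every output element into that range. A small sketch of how such a range is applied once CalculateActivationRange has produced the bounds:

#include <algorithm>
#include <vector>

// Clamp each output element into the fused-activation range computed earlier.
void applyActivationRange(std::vector<float> &output, float activation_min, float activation_max)
{
  for (float &v : output)
    v = std::min(activation_max, std::max(activation_min, v));
}

// Example: with the RELU6 range [0, 6], {-1.0f, 3.5f, 9.0f} becomes {0.0f, 3.5f, 6.0f}.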
+ */ + +#include "Backend.h" + +extern "C" { + +onert::backend::Backend *onert_backend_create() { return new onert::backend::ruy::Backend; } + +void onert_backend_destroy(onert::backend::Backend *backend) { delete backend; } +} diff --git a/runtime/onert/backend/xnnpack/Backend.h b/runtime/onert/backend/xnnpack/Backend.h new file mode 100644 index 0000000..b7aef16 --- /dev/null +++ b/runtime/onert/backend/xnnpack/Backend.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_BACKEND_H__ +#define __ONERT_BACKEND_XNNPACK_BACKEND_H__ + +#include "BackendContext.h" +#include "Config.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class Backend : public ::onert::backend::Backend +{ +public: + Backend() : _config{std::make_shared()} {} + + std::shared_ptr config() const override { return _config; } + + std::unique_ptr + newContext(const ir::Graph &graph, const std::shared_ptr &kb, + bool) const override + { + const auto &operands = graph.operands(); + const auto &operations = graph.operations(); + auto context = std::make_unique(this, &graph); + auto tr = std::make_shared(); + auto tb = std::make_shared(tr); + context->tensor_registry = tr; + context->tensor_builder = tb; + context->constant_initializer = std::make_shared(operands, tr); + context->kernel_gen = std::make_shared(operands, operations, tb, tr, kb, + context->external_context()); + return context; + } + +private: + std::shared_ptr _config; +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_BACKEND_H__ diff --git a/runtime/onert/backend/xnnpack/BackendContext.cc b/runtime/onert/backend/xnnpack/BackendContext.cc new file mode 100644 index 0000000..503d088 --- /dev/null +++ b/runtime/onert/backend/xnnpack/BackendContext.cc @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
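ruy.cc above exposes the two C entry points, onert_backend_create and onert_backend_destroy, that the runtime resolves when it loads a backend as a shared library. Below is a hedged sketch of how such a plugin could be opened with dlopen/dlsym; the file name libbackend_ruy.so follows from the OUTPUT_NAME set in the CMake file, but the exact search path and loader used by the runtime are not shown in this diff:

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Minimal loader for a backend plugin exposing C entry points.
void *loadBackend(const std::string &so_path)
{
  void *handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (handle == nullptr)
    throw std::runtime_error(std::string{"dlopen failed: "} + dlerror());

  // The created object is opaque here; the runtime casts it to its Backend interface.
  using CreateFn = void *(*)();
  auto create = reinterpret_cast<CreateFn>(dlsym(handle, "onert_backend_create"));
  if (create == nullptr)
    throw std::runtime_error{"symbol onert_backend_create not found"};

  return create(); // pair with onert_backend_destroy() when unloading
}

// Usage (illustrative path only): loadBackend("libbackend_ruy.so");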
+ */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" +#include "backend/cpu_common/BackendContextHelpers.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (auto index : operand_list()) + { + if (model_io.contains(index)) + continue; + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = [&]() { + if (obj.getUses().size() == 0) + return ir::Layout::UNKNOWN; + auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses? + for (auto &operation_info : operation_list()) + { + if (operation_info.index == use_op_ind) + return operation_info.layout; + } + return ir::Layout::UNKNOWN; + }(); + const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement(); + if (permute_factor.backend() != backend()) + continue; + const auto backend_layout = permute_factor.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + cpu_common::planTensors(*this, order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return ret; +} + +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git 
a/runtime/onert/backend/xnnpack/BackendContext.h b/runtime/onert/backend/xnnpack/BackendContext.h new file mode 100644 index 0000000..f81175b --- /dev/null +++ b/runtime/onert/backend/xnnpack/BackendContext.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_XNNPACK_BACKEND_CONTEXT_H__ + +#include +#include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" +#include "ExternalContext.h" + +namespace +{ +const int kDefaultNumThreadpoolThreads = 1; +} + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr tensor_registry = nullptr, + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen}, _external_context(nullptr) + { + int num_threads = util::getConfigInt(util::config::XNNPACK_THREADS); + if (num_threads < 1) + num_threads = kDefaultNumThreadpoolThreads; // default num of threads + _external_context.reset(new ExternalContext(static_cast(num_threads))); + } + + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + + std::shared_ptr external_context() { return _external_context; } + +private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + // TODO Make it private + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + +private: + std::shared_ptr _external_context; +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/xnnpack/CMakeLists.txt b/runtime/onert/backend/xnnpack/CMakeLists.txt new file mode 100644 index 0000000..e3de31e --- /dev/null +++ b/runtime/onert/backend/xnnpack/CMakeLists.txt @@ -0,0 +1,26 @@ +set(LIB_ONERT_BACKEND_XNNPACK onert_backend_xnnpack) + +# Unsupported architecture +nnfw_find_package(Xnnpack QUIET) +if(NOT Xnnpack_FOUND) + return() +endif(NOT Xnnpack_FOUND) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_ONERT_BACKEND_XNNPACK} SHARED ${SOURCES}) + +target_link_libraries(${LIB_ONERT_BACKEND_XNNPACK} PRIVATE onert_core) +target_link_libraries(${LIB_ONERT_BACKEND_XNNPACK} PRIVATE nnfw_common) +target_link_libraries(${LIB_ONERT_BACKEND_XNNPACK} PRIVATE nnfw_coverage) 
+target_link_libraries(${LIB_ONERT_BACKEND_XNNPACK} PRIVATE pthreadpool) +target_link_libraries(${LIB_ONERT_BACKEND_XNNPACK} PRIVATE XNNPACK) + +set_target_properties(${LIB_ONERT_BACKEND_XNNPACK} PROPERTIES OUTPUT_NAME backend_xnnpack) + +if(CMAKE_BUILD_TYPE_LC STREQUAL "release") + add_custom_command(TARGET ${LIB_ONERT_BACKEND_XNNPACK} POST_BUILD + COMMAND ${CMAKE_STRIP} "--strip-unneeded" $) +endif() + +install(TARGETS ${LIB_ONERT_BACKEND_XNNPACK} DESTINATION lib) diff --git a/runtime/onert/core/include/backend/IOptimizer.h b/runtime/onert/backend/xnnpack/Config.cc similarity index 62% rename from runtime/onert/core/include/backend/IOptimizer.h rename to runtime/onert/backend/xnnpack/Config.cc index 4844d21..4d42a3f 100644 --- a/runtime/onert/core/include/backend/IOptimizer.h +++ b/runtime/onert/backend/xnnpack/Config.cc @@ -14,38 +14,31 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_I_OPTIMIZER_H__ -#define __ONERT_BACKEND_I_OPTIMIZER_H__ +#include "Config.h" -namespace onert -{ -namespace ir -{ -class LoweredGraph; -} -} // namespace onert +#include namespace onert { namespace backend { +namespace xnnpack +{ -/** - * @brief Class for backend optimizations. This is an optional class so not all backends must have - * it. - * - */ -struct IOptimizer +Config::~Config() { xnn_deinitialize(); } + +bool Config::initialize() { - virtual ~IOptimizer() = default; - /** - * @brief Run optimization - * - */ - virtual void optimize() = 0; -}; + xnn_status status = xnn_initialize(nullptr /* allocator */); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to initialize XNNPACK"}; + } + return true; +} + +ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; } +} // namespace cpu } // namespace backend } // namespace onert - -#endif // __ONERT_BACKEND_I_OPTIMIZER_H__ diff --git a/runtime/onert/backend/xnnpack/Config.h b/runtime/onert/backend/xnnpack/Config.h new file mode 100644 index 0000000..2cf7406 --- /dev/null +++ b/runtime/onert/backend/xnnpack/Config.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
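xnnpack::Config above calls xnn_initialize(nullptr) once in initialize() and xnn_deinitialize() in its destructor, so the library stays initialized for as long as the config object lives. The same lifetime can be expressed as a small RAII guard; this is a sketch of the idea, not how the runtime itself wraps it:

#include <stdexcept>
#include <xnnpack.h>

// Ties XNNPACK's global init/deinit to an object lifetime, mirroring what
// xnnpack::Config does across initialize() and its destructor.
class XnnpackGuard
{
public:
  XnnpackGuard()
  {
    if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success)
      throw std::runtime_error{"failed to initialize XNNPACK"};
  }
  ~XnnpackGuard() { xnn_deinitialize(); }

  XnnpackGuard(const XnnpackGuard &) = delete;
  XnnpackGuard &operator=(const XnnpackGuard &) = delete;
};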
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_CONFIG_H__ +#define __ONERT_BACKEND_XNNPACK_CONFIG_H__ + +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class Config : public IConfig +{ +public: + virtual ~Config(); + +public: + std::string id() override { return "xnnpack"; } + bool initialize() override; + ir::Layout supportLayout(const ir::Operation &node, ir::Layout frontend_layout) override; + bool supportPermutation() override { return true; } + bool supportDynamicTensor() override { return true; } + bool supportFP16() override { return false; } + + std::unique_ptr timer() override { return std::make_unique(); } +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_CONFIG_H__ diff --git a/runtime/onert/backend/xnnpack/ConstantInitializer.h b/runtime/onert/backend/xnnpack/ConstantInitializer.h new file mode 100644 index 0000000..45cdd8c --- /dev/null +++ b/runtime/onert/backend/xnnpack/ConstantInitializer.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__ + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +using ConstantInitializer = cpu_common::ConstantInitializer; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/core/include/backend/IExternalContext.h b/runtime/onert/backend/xnnpack/ExternalContext.cc similarity index 74% rename from runtime/onert/core/include/backend/IExternalContext.h rename to runtime/onert/backend/xnnpack/ExternalContext.cc index 88ffb50..3a9fe1b 100644 --- a/runtime/onert/core/include/backend/IExternalContext.h +++ b/runtime/onert/backend/xnnpack/ExternalContext.cc @@ -14,21 +14,23 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_IEXTERNAL_CONTEXT_H__ -#define __ONERT_BACKEND_IEXTERNAL_CONTEXT_H__ +#include "ExternalContext.h" + +#include namespace onert { namespace backend { +namespace xnnpack +{ -struct IExternalContext +ExternalContext::ExternalContext(size_t num_threads) + : _threadpool(pthreadpool_create(num_threads), pthreadpool_destroy) { - virtual ~IExternalContext() = default; - virtual void setMaxNumThreads(int) = 0; -}; + assert(_threadpool); +} +} // namespace xnnpack } // namespace backend } // namespace onert - -#endif // __ONERT_BACKEND_IEXTERNAL_CONTEXT__ diff --git a/runtime/onert/backend/xnnpack/ExternalContext.h b/runtime/onert/backend/xnnpack/ExternalContext.h new file mode 100644 index 0000000..682fd2e --- /dev/null +++ b/runtime/onert/backend/xnnpack/ExternalContext.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_EXTERNAL_CONTEXT_H__ +#define __ONERT_BACKEND_XNNPACK_EXTERNAL_CONTEXT_H__ + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class ExternalContext +{ +public: + ExternalContext(size_t num_threads); + +public: + pthreadpool *getThreadPool() { return _threadpool.get(); } + +private: + std::unique_ptr _threadpool; +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_EXTERNAL_CONTEXT_H__ diff --git a/runtime/onert/backend/xnnpack/KernelGenerator.cc b/runtime/onert/backend/xnnpack/KernelGenerator.cc new file mode 100644 index 0000000..b7d3f60 --- /dev/null +++ b/runtime/onert/backend/xnnpack/KernelGenerator.cc @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "KernelGenerator.h" + +#include "ops/ConvolutionLayer.h" +#include "ops/DepthwiseConvolutionLayer.h" +#include "ops/FullyConnectedLayer.h" + +#include +#include +#include +#include +#include +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +KernelGenerator::KernelGenerator( + const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, + const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &kernel_builder, + const std::shared_ptr &external_context) + : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder), + _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder), + _current_layout(ir::Layout::UNKNOWN), _external_context(external_context) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const ir::OpSequence &op_seq) +{ + assert(!_return_fn_seq); + assert(_tensor_builder->dynamicTensorManager()); + assert(_tensor_reg); + + auto dyn_shape_inferer = std::make_shared(_ctx, _tensor_reg); + + _return_fn_seq = std::make_unique(); + + // Prepare to handle dynamic tensors later + auto dyn_ctx = std::make_shared(); + { + dyn_ctx->op_seq = &op_seq; + dyn_ctx->operations = &_operations_ctx; + dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); + dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager(); + + _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); + } + + _current_layout = op_seq.getLayout(); + for (const auto &operation_idx : op_seq.operations()) + { + const auto &node = _operations_ctx.at(operation_idx); + node.accept(*this); + _return_fn_seq->append(releaseFunction()); + + for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs()) + { + auto portable_tensor = _tensor_reg->getPortableTensor(ind); + if (portable_tensor) + { + assert(portable_tensor->layout() == ir::Layout::NHWC); + } + + auto tensor = _tensor_reg->getNativeTensor(ind); + if (tensor) + { + tensor->increase_ref(); + } + } + } +} + +void KernelGenerator::visit(const ir::operation::Conv2D &node) +{ + using ir::operation::Conv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); + + const auto stride = node.param().stride; + const auto activation = node.param().activation; + const auto param_padding = node.param().padding; + const auto dilation = node.param().dilation; + auto fn = std::make_unique(_external_context); + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
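+ // Kernel height/width come from dims 1 and 2 of that OHWI layout; the explicit padding below is derived from the input/output feature shapes, stride and dilation.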
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto padding = + ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height, + dilation.width_factor, dilation.height_factor); + + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + dilation.width_factor, dilation.height_factor, activation, ofm_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) +{ + using ir::operation::DepthwiseConv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; + + const auto stride = node.param().stride; + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. + const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + const auto dilation_width = node.param().dilation.width_factor; + const auto dilation_height = node.param().dilation.height_factor; + const auto param_padding = node.param().padding; + const auto padding = ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, + ker_height, dilation_width, dilation_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); + + auto fn = std::make_unique(_external_context); + + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + multiplier, dilation_width, dilation_height, activation, ofm_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + const auto activation = node.param().activation; + + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto weight_tensor = _tensor_reg->getPortableTensor(weight_index); + auto bias_tensor = bias_index.undefined() ? 
nullptr : _tensor_reg->getPortableTensor(bias_index); + + auto fn = std::make_unique(_external_context); + + fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor); + + _return_fn = std::move(fn); +} + +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/xnnpack/KernelGenerator.h b/runtime/onert/backend/xnnpack/KernelGenerator.h new file mode 100644 index 0000000..2658242 --- /dev/null +++ b/runtime/onert/backend/xnnpack/KernelGenerator.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_KERNEL_GENERATOR_H__ +#define __ONERT_BACKEND_XNNPACK_KERNEL_GENERATOR_H__ + +#include "ExternalContext.h" +#include "TensorBuilder.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "Tensor.h" + +#include +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class KernelGenerator : public cpu_common::KernelGeneratorBase +{ +public: + KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, + const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &kernel_builder, + const std::shared_ptr &external_context); + + void visit(const ir::OpSequence &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::FullyConnected &) override; + +private: + const ir::Operands &_ctx; + const ir::Operations &_operations_ctx; + std::shared_ptr _tensor_builder; + std::shared_ptr _tensor_reg; + std::shared_ptr _kernel_builder; + ir::Layout _current_layout; + const std::shared_ptr _external_context; +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_KERNEL_GENERATOR_H__ diff --git a/runtime/onert/backend/xnnpack/StaticTensorManager.h b/runtime/onert/backend/xnnpack/StaticTensorManager.h new file mode 100644 index 0000000..f7344e8 --- /dev/null +++ b/runtime/onert/backend/xnnpack/StaticTensorManager.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_STATICTENSOR_MANAGER_H__ +#define __ONERT_BACKEND_XNNPACK_STATICTENSOR_MANAGER_H__ + +#include "backend/cpu_common/StaticTensorManager.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +using StaticTensorManager = cpu_common::StaticTensorManager; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_STATICTENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/xnnpack/Tensor.h b/runtime/onert/backend/xnnpack/Tensor.h new file mode 100644 index 0000000..b39cbd2 --- /dev/null +++ b/runtime/onert/backend/xnnpack/Tensor.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_TENSOR_H__ +#define __ONERT_BACKEND_XNNPACK_TENSOR_H__ + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +using Tensor = cpu_common::Tensor; +using ExternalTensor = cpu_common::ExternalTensor; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_TENSOR_H__ diff --git a/runtime/onert/backend/xnnpack/TensorBuilder.cc b/runtime/onert/backend/xnnpack/TensorBuilder.cc new file mode 100644 index 0000000..b570144 --- /dev/null +++ b/runtime/onert/backend/xnnpack/TensorBuilder.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TensorBuilder.h" + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +TensorBuilder::TensorBuilder(const std::shared_ptr &tensor_reg) + : _tensor_reg{tensor_reg}, + _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}, + _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())} +{ + /* empty */ +} + +void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout layout) +{ + _tensor_info_map.emplace(ind, info); + + // XNNPACK backend supports only one layout as NHWC + assert(layout == ir::Layout::NHWC); + if (info.isDynamic()) + { + _dynamic_tensor_mgr->buildTensor(ind, info, layout); + } + else + { + _static_tensor_mgr->buildTensor(ind, info, layout, info.isConstant()); + } +} + +void TensorBuilder::notifyFirstUse(const ir::OperandIndex &ind) +{ + assert(_tensor_info_map.find(ind) != _tensor_info_map.end()); + const auto tensor_info = _tensor_info_map.at(ind); + + if (!_tensor_reg->getNativeTensor(ind)->is_dynamic()) + { + const auto size = tensor_info.total_size(); + _static_tensor_mgr->claimPlan(ind, size); + } +} + +void TensorBuilder::notifyLastUse(const ir::OperandIndex &ind) +{ + if (!_tensor_reg->getNativeTensor(ind)->is_dynamic()) + { + _static_tensor_mgr->releasePlan(ind); + } +} + +bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const +{ + return _tensor_info_map.find(ind) != _tensor_info_map.end(); +} + +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } + +void TensorBuilder::allocate() +{ + // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate + // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. +} + +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/xnnpack/TensorBuilder.h b/runtime/onert/backend/xnnpack/TensorBuilder.h new file mode 100644 index 0000000..dddfedb --- /dev/null +++ b/runtime/onert/backend/xnnpack/TensorBuilder.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_TENSOR_BUILDER_H__ +#define __ONERT_BACKEND_XNNPACK_TENSOR_BUILDER_H__ + +#include +#include + +#include + +#include "StaticTensorManager.h" +#include "Tensor.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ + +class TensorBuilder +{ +public: + TensorBuilder(const std::shared_ptr &tensor_reg); + + /** + * @brief Register tensor information to allocate on XNNPACK backend + * @param[in] ind Operand index + * @param[in] info Operand information + * @param[in] layout Operand data layout + */ + void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout backend_layout); + + void notifyFirstUse(const ir::OperandIndex &); + void notifyLastUse(const ir::OperandIndex &); + + bool isRegistered(const ir::OperandIndex &) const; + + void prepare(void); + void allocate(); + void postFunctionPrepare() { /* DO NOTHING */} + + IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); } + +private: + const std::shared_ptr _tensor_reg; + std::unique_ptr _dynamic_tensor_mgr; + std::unique_ptr _static_tensor_mgr; + ir::OperandIndexMap _tensor_info_map; +}; + +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_TENSOR_BUILDER_H__ diff --git a/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc new file mode 100644 index 0000000..0612995 --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConvolutionLayer.h" + +#include "ir/Padding.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ +ConvolutionLayer::ConvolutionLayer(const std::shared_ptr external_context) + : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr), + _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0), + _padding_right(0), _padding_bottom(0), _stride_width(0), _stride_height(0), + _dilation_width_factor(1), _dilation_height_factor(1), _activation(ir::Activation::NONE) +{ + // DO NOTHING +} + +void ConvolutionLayer::configure(const IPortableTensor *input, const IPortableTensor *kernel, + const IPortableTensor *bias, ir::PaddingType padding_type, + const uint32_t padding_left, const uint32_t padding_right, + const uint32_t padding_top, const uint32_t padding_bottom, + const uint32_t stride_width, const uint32_t stride_height, + const uint32_t dilation_width_factor, + const uint32_t dilation_height_factor, + const ir::Activation activation, IPortableTensor *output) +{ + _input = input; + _kernel = kernel; + _bias = bias; + _padding_type = padding_type; + _padding_left = padding_left; + _padding_right = padding_right; + _padding_top = padding_top; + _padding_bottom = padding_bottom; + _stride_width = stride_width; + _stride_height = stride_height; + _dilation_width_factor = dilation_width_factor; + _dilation_height_factor = dilation_height_factor; + _activation = activation; + _output = output; + + // TODO Support not nhwc layer + assert(_input->layout() == ir::Layout::NHWC); + + assert(_activation == ir::Activation::NONE || _activation == ir::Activation::RELU || + _activation == ir::Activation::RELU1 || _activation == ir::Activation::RELU6); +} + +void ConvolutionLayer::run() +{ + assert(_external_context && _external_context->getThreadPool()); + if (!_setup) + { + _setup = setup(); + assert(_setup); + } + + if (_input->data_type() == OperandType::FLOAT32) + { + enum xnn_status status = xnn_run_operator(_kernel_op, _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to run FP32 Convolution operator"}; + } + } + else + { + throw std::runtime_error{"XNNPACK Conv: unsupported data type"}; + } +} + +bool ConvolutionLayer::create() +{ + float output_activation_min = 0.f, output_activation_max = 0.f; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + // NHWC + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
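+ // With a single group, the group_input_channels/group_output_channels passed to xnn_create_convolution2d_nhwc_f32 below are simply the full input and output depths.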
+ const auto &kernel_shape = _kernel->getShape(); + uint32_t kernel_height = kernel_shape.dim(1); + uint32_t kernel_width = kernel_shape.dim(2); + uint32_t output_channels = kernel_shape.dim(0); + uint32_t input_channels = kernel_shape.dim(3); + assert(static_cast(_input->getShape().dim(3)) == input_channels); + assert(static_cast(_output->getShape().dim(3)) == output_channels); + + enum xnn_status status = xnn_create_convolution2d_nhwc_f32( + _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width, + _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor, + 1 /* groups */, input_channels /* group_input_channels */, + output_channels /* group_output_channels */, input_channels /* input_channel_stride */, + output_channels /* output_channel_stride */, + reinterpret_cast(_kernel->buffer()), + reinterpret_cast(_bias->buffer()), output_activation_min, + output_activation_max, 0, &_kernel_op); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 Convolution operator"}; + } + assert(_kernel_op != nullptr); + return true; +} + +bool ConvolutionLayer::setup() +{ + if (_input->buffer() == nullptr || _output->buffer() == nullptr) + { + // it could be models's input or output + return false; + } + + uint32_t input_width = _input->getShape().dim(2); + uint32_t input_height = _input->getShape().dim(1); + uint32_t batch_size = _input->getShape().dim(0); + enum xnn_status status = xnn_setup_convolution2d_nhwc_f32( + _kernel_op, batch_size, input_height, input_width, + reinterpret_cast(_input->buffer()), + reinterpret_cast(_output->buffer()), _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 Convolution operator"}; + } + return true; +} + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.h b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.h new file mode 100644 index 0000000..6cbaa9f --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_OPS_CONVOLUTION_LAYER_H__ +#define __ONERT_BACKEND_XNNPACK_OPS_CONVOLUTION_LAYER_H__ + +#include "Layer.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +class ConvolutionLayer : public Layer +{ +public: + ConvolutionLayer(const std::shared_ptr external_context); + +public: + void configure(const IPortableTensor *input, const IPortableTensor *kernel, + const IPortableTensor *bias, ir::PaddingType padding_type, + const uint32_t padding_left, const uint32_t padding_right, + const uint32_t padding_top, const uint32_t padding_bottom, + const uint32_t stride_width, const uint32_t stride_height, + const uint32_t dilation_width_factor, const uint32_t dilation_height_factor, + const ir::Activation activation, IPortableTensor *output); + + void run() override; + + bool create() override; + bool setup() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_kernel; + const IPortableTensor *_bias; + IPortableTensor *_output; + + ir::PaddingType _padding_type; + uint32_t _padding_left; + uint32_t _padding_top; + uint32_t _padding_right; + uint32_t _padding_bottom; + + uint32_t _stride_width; + uint32_t _stride_height; + uint32_t _dilation_width_factor; + uint32_t _dilation_height_factor; + + ir::Activation _activation; +}; + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_OPS_CONVOLUTION_LAYER_H__ diff --git a/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc new file mode 100644 index 0000000..947f041 --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DepthwiseConvolutionLayer.h" + +#include "ir/Padding.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +DepthwiseConvolutionLayer::DepthwiseConvolutionLayer( + const std::shared_ptr external_context) + : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr), + _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0), + _padding_right(0), _padding_bottom(0), _stride_width(0), _stride_height(0), _multiplier(1), + _dilation_width_factor(1), _dilation_height_factor(1), _activation(ir::Activation::NONE) +{ + // DO NOTHING +} + +void DepthwiseConvolutionLayer::configure( + const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias, + ir::PaddingType padding_type, const uint32_t padding_left, const uint32_t padding_right, + const uint32_t padding_top, const uint32_t padding_bottom, const uint32_t stride_width, + const uint32_t stride_height, const uint32_t multiplier, const uint32_t dilation_width_factor, + const uint32_t dilation_height_factor, const ir::Activation activation, IPortableTensor *output) +{ + _input = input; + _kernel = kernel; + _bias = bias; + _padding_type = padding_type; + _padding_left = padding_left; + _padding_right = padding_right; + _padding_top = padding_top; + _padding_bottom = padding_bottom; + _stride_width = stride_width; + _stride_height = stride_height; + _multiplier = multiplier; + _dilation_width_factor = dilation_width_factor; + _dilation_height_factor = dilation_height_factor; + _activation = activation; + _output = output; + + // TODO Support not nhwc layer + assert(_input->layout() == ir::Layout::NHWC); + + assert(_activation == ir::Activation::NONE || _activation == ir::Activation::RELU || + _activation == ir::Activation::RELU1 || _activation == ir::Activation::RELU6); +} + +void DepthwiseConvolutionLayer::run() +{ + assert(_external_context && _external_context->getThreadPool()); + if (!_setup) + { + _setup = setup(); + assert(_setup); + } + + if (_input->data_type() == OperandType::FLOAT32) + { + enum xnn_status status = xnn_run_operator(_kernel_op, _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to run FP32 DepthwiseConvolution operator"}; + } + } + else + { + throw std::runtime_error{"XNNPACK DepthwiseConv: unsupported data type"}; + } +} + +bool DepthwiseConvolutionLayer::create() +{ + float output_activation_min = 0.f, output_activation_max = 0.f; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + // NHWC + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
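+ // XNNPACK expresses depthwise convolution as a grouped convolution: groups == input channels, one input channel per group, _multiplier outputs per group, with XNN_FLAG_DEPTHWISE_CONVOLUTION set.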
+ const auto &kernel_shape = _kernel->getShape(); + uint32_t kernel_height = kernel_shape.dim(1); + uint32_t kernel_width = kernel_shape.dim(2); + uint32_t output_channels = kernel_shape.dim(3); + uint32_t input_channels = _input->getShape().dim(3); + assert(static_cast(_output->getShape().dim(3)) == output_channels); + assert(output_channels == input_channels * _multiplier); + + enum xnn_status status = xnn_create_convolution2d_nhwc_f32( + _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width, + _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor, + input_channels /* groups */, 1 /* group_input_channels */, + _multiplier /* group_output_channels */, input_channels /* input_channel_stride */, + output_channels /* output_channel_stride */, + reinterpret_cast(_kernel->buffer()), + reinterpret_cast(_bias->buffer()), output_activation_min, + output_activation_max, XNN_FLAG_DEPTHWISE_CONVOLUTION, &_kernel_op); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"}; + } + assert(_kernel_op != nullptr); + return true; +} + +bool DepthwiseConvolutionLayer::setup() +{ + if (_input->buffer() == nullptr || _output->buffer() == nullptr) + { + // it could be models's input or output + return false; + } + + uint32_t input_width = _input->getShape().dim(2); + uint32_t input_height = _input->getShape().dim(1); + uint32_t batch_size = _input->getShape().dim(0); + enum xnn_status status = xnn_setup_convolution2d_nhwc_f32( + _kernel_op, batch_size, input_height, input_width, + reinterpret_cast(_input->buffer()), + reinterpret_cast(_output->buffer()), _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"}; + } + return true; +} + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.h b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.h new file mode 100644 index 0000000..10f840a --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_OPS_DEPTHWISE_CONVOLUTION_LAYER_H__ +#define __ONERT_BACKEND_XNNPACK_OPS_DEPTHWISE_CONVOLUTION_LAYER_H__ + +#include "Layer.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +class DepthwiseConvolutionLayer : public Layer +{ +public: + DepthwiseConvolutionLayer(const std::shared_ptr external_context); + +public: + void configure(const IPortableTensor *input, const IPortableTensor *kernel, + const IPortableTensor *bias, ir::PaddingType padding_type, + const uint32_t padding_left, const uint32_t padding_right, + const uint32_t padding_top, const uint32_t padding_bottom, + const uint32_t stride_width, const uint32_t stride_height, + const uint32_t multiplier, const uint32_t dilation_width_factor, + const uint32_t dilation_height_factor, const ir::Activation activation, + IPortableTensor *output); + + void run() override; + + bool create() override; + bool setup() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_kernel; + const IPortableTensor *_bias; + IPortableTensor *_output; + + ir::PaddingType _padding_type; + uint32_t _padding_left; + uint32_t _padding_top; + uint32_t _padding_right; + uint32_t _padding_bottom; + + uint32_t _stride_width; + uint32_t _stride_height; + uint32_t _multiplier; + uint32_t _dilation_width_factor; + uint32_t _dilation_height_factor; + + ir::Activation _activation; +}; + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_OPS_DEPTHWISE_CONVOLUTION_LAYER_H__ diff --git a/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc new file mode 100644 index 0000000..d595fda --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "FullyConnectedLayer.h" + +#include "ir/Padding.h" + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +FullyConnectedLayer::FullyConnectedLayer(const std::shared_ptr external_context) + : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr), + _activation(ir::Activation::NONE) +{ + // DO NOTHING +} + +void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, + const IPortableTensor *bias, ir::Activation activation, + IPortableTensor *output) +{ + _input = input; + _kernel = weights; + _bias = bias; + _activation = activation; + _output = output; + + // TODO Support not nhwc layer + assert(_input->layout() == ir::Layout::NHWC); + + assert(_activation == ir::Activation::NONE || _activation == ir::Activation::RELU || + _activation == ir::Activation::RELU1 || _activation == ir::Activation::RELU6); +} + +void FullyConnectedLayer::run() +{ + assert(_external_context && _external_context->getThreadPool()); + if (!_setup) + { + _setup = setup(); + assert(_setup); + } + + if (_input->data_type() == OperandType::FLOAT32) + { + enum xnn_status status = xnn_run_operator(_kernel_op, _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to run FP32 FullyConnected operator"}; + } + } + else + { + throw std::runtime_error{"XNNPACK FC: unsupported data type"}; + } +} + +bool FullyConnectedLayer::create() +{ + float output_activation_min = 0.f, output_activation_max = 0.f; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + const auto &kernel_shape = _kernel->getShape(); + assert(kernel_shape.rank() == 2); + uint32_t output_channels = kernel_shape.dim(0); + uint32_t input_channels = kernel_shape.dim(1); + + const auto &input_shape = _input->getShape(); + const auto &output_shape = _output->getShape(); + uint32_t flag = 0; + if (input_shape.rank() != output_shape.rank()) + { + flag |= XNN_FLAG_TENSORFLOW_RESHAPE_2D; + assert(input_shape.num_elements() % input_channels == 0); + } + else + { + assert(static_cast(input_shape.dim(input_shape.rank() - 1)) == input_channels); + } + + assert(_kernel && _kernel->buffer()); + const float *kernel_buffer = reinterpret_cast(_kernel->buffer()); + const float *bias_buffer = (_bias) ? 
reinterpret_cast(_bias->buffer()) : nullptr; + + enum xnn_status status = xnn_create_fully_connected_nc_f32( + input_channels, output_channels, input_channels /* input stride */, + output_channels /* output stride */, kernel_buffer, bias_buffer, output_activation_min, + output_activation_max, flag, &_kernel_op); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 FullyConnected operator"}; + } + assert(_kernel_op != nullptr); + return true; +} + +bool FullyConnectedLayer::setup() +{ + if (_input->buffer() == nullptr || _output->buffer() == nullptr) + { + // it could be models's input or output + return false; + } + + uint32_t batch_size = _input->getShape().num_elements() / _kernel->getShape().dim(1); + enum xnn_status status = xnn_setup_fully_connected_nc_f32( + _kernel_op, batch_size, reinterpret_cast(_input->buffer()), + reinterpret_cast(_output->buffer()), _external_context->getThreadPool()); + if (status != xnn_status_success) + { + throw std::runtime_error{"failed to create FP32 FullyConnected operator"}; + } + return true; +} + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.h b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.h new file mode 100644 index 0000000..883607e --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_OPS_FULLY_CONNECTED_LAYER_H__ +#define __ONERT_BACKEND_XNNPACK_OPS_FULLY_CONNECTED_LAYER_H__ + +#include "Layer.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +class FullyConnectedLayer : public Layer +{ +public: + FullyConnectedLayer(const std::shared_ptr external_context); + +public: + void configure(const IPortableTensor *input, const IPortableTensor *_kernel, + const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output); + + void run() override; + + bool create() override; + bool setup() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_kernel; + const IPortableTensor *_bias; + IPortableTensor *_output; + + ir::Activation _activation; +}; + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_OPS_FULLY_CONNECTED_LAYER_H__ diff --git a/runtime/onert/backend/xnnpack/ops/Layer.h b/runtime/onert/backend/xnnpack/ops/Layer.h new file mode 100644 index 0000000..68b610f --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/Layer.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_XNNPACK_OPS_LAYER_H__ +#define __ONERT_BACKEND_XNNPACK_OPS_LAYER_H__ + +#include +#include +#include "OperationUtils.h" +#include "../ExternalContext.h" +#include "../Tensor.h" + +#include +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +class Layer : public ::onert::exec::IFunction +{ +public: + Layer(const std::shared_ptr external_context) + : _kernel_op{nullptr}, _create{false}, _setup{false}, _external_context{external_context} + { + // DO NOTHING + } + + ~Layer() + { + if (_kernel_op) + xnn_delete_operator(_kernel_op); + } + +public: + void prepare() override + { + if (_create) + return; + + _create = create(); + assert(_create); + + _setup = setup(); + } + virtual bool create() = 0; + virtual bool setup() = 0; + +protected: + xnn_operator_t _kernel_op; + bool _create; + bool _setup; + const std::shared_ptr _external_context; +}; + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_OPS_LAYER_H__ diff --git a/runtime/onert/backend/xnnpack/ops/OperationUtils.h b/runtime/onert/backend/xnnpack/ops/OperationUtils.h new file mode 100644 index 0000000..5102e32 --- /dev/null +++ b/runtime/onert/backend/xnnpack/ops/OperationUtils.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_XNNPACK_OPS_OPERATION_UTILS_H__ +#define __ONERT_BACKEND_XNNPACK_OPS_OPERATION_UTILS_H__ + +// duplicated from cpu/ops/OperationUtils.h +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace xnnpack +{ +namespace ops +{ + +using OperandType = ir::DataType; + +template +void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) +{ + if (activation == ir::Activation::RELU) + { + *activation_min = 0; + *activation_max = std::numeric_limits::max(); + } + else if (activation == ir::Activation::RELU6) + { + *activation_min = 0; + *activation_max = 6; + } + else if (activation == ir::Activation::RELU1) + { + *activation_min = -1; + *activation_max = 1; + } + else if (activation == ir::Activation::SIGMOID) + { + *activation_min = 0; + *activation_max = 1; + } + else if (activation == ir::Activation::NONE) + { + *activation_min = std::numeric_limits::lowest(); + *activation_max = std::numeric_limits::max(); + } + else + { + throw std::runtime_error{"Unsupported fused activation function"}; + } +} + +} // namespace ops +} // namespace xnnpack +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_XNNPACK_OPS_OPERATION_UTILS_H__ diff --git a/runtime/onert/backend/xnnpack/xnnpack.cc b/runtime/onert/backend/xnnpack/xnnpack.cc new file mode 100644 index 0000000..38a6c55 --- /dev/null +++ b/runtime/onert/backend/xnnpack/xnnpack.cc @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Backend.h" + +#include + +extern "C" { +onert::backend::Backend *onert_backend_create() +{ + VERBOSE(onert_backend_create) << "'xnnpack' loaded\n"; + return new onert::backend::xnnpack::Backend; +} + +void onert_backend_destroy(onert::backend::Backend *backend) +{ + VERBOSE(onert_backend_create) << "'xnnpack' unloaded\n"; + delete backend; +} +} diff --git a/runtime/onert/core/include/backend/BackendContext.h b/runtime/onert/core/include/backend/BackendContext.h index 1eba295..4d21215 100644 --- a/runtime/onert/core/include/backend/BackendContext.h +++ b/runtime/onert/core/include/backend/BackendContext.h @@ -19,6 +19,8 @@ #include #include "ir/Graph.h" +#include "ir/LowerInfoMap.h" +#include "exec/FunctionSequence.h" namespace onert { @@ -26,12 +28,10 @@ namespace backend { class Backend; -class IConstantInitializer; -class IKernelGenerator; -class ITensorRegister; struct ITensorRegistry; -struct ITensorBuilder; -struct IOptimizer; + +using FunctionMap = + std::vector>>; class BackendContext { @@ -46,15 +46,8 @@ public: public: BackendContext(const Backend *backend, const ir::Graph *graph, - std::shared_ptr tensor_registry = nullptr, - std::shared_ptr tensor_builder = nullptr, - std::shared_ptr constant_initializer = nullptr, - std::shared_ptr kernel_gen = nullptr, - std::shared_ptr tensor_register = nullptr, - std::shared_ptr optimizer = nullptr) - : _backend{backend}, _graph{graph}, tensor_registry{tensor_registry}, - tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, - kernel_gen{kernel_gen}, tensor_register{tensor_register}, optimizer{optimizer} + std::shared_ptr tensor_registry = nullptr) + : _backend{backend}, _graph{graph}, tensor_registry{tensor_registry} { } @@ -66,8 +59,19 @@ public: const Backend *backend() const { return _backend; } const ir::Graph *graph() const { return _graph; } - const std::vector &operation_list() { return _operation_list; } - const std::vector &operand_list() { return _operand_list; } + const std::vector &operation_list() const { return _operation_list; } + const std::vector &operand_list() const { return _operand_list; } + + virtual ITensorRegistry *genTensors(const std::vector &, + const ir::OpSequences &, const ir::LowerInfoMap &) + { + return nullptr; + } + virtual FunctionMap genKernels(const std::vector &, + const ir::OpSequences &) + { + return {}; + } private: const Backend *_backend{nullptr}; @@ -77,11 +81,6 @@ private: public: std::shared_ptr tensor_registry; - std::shared_ptr tensor_builder; - std::shared_ptr constant_initializer; - std::shared_ptr kernel_gen; - std::shared_ptr tensor_register; - std::shared_ptr optimizer; }; using BackendContexts = std::unordered_map>; diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h deleted file mode 100644 index 97721cf..0000000 --- a/runtime/onert/core/include/backend/ITensorBuilder.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_ITENSOR_BUILDER_H__ -#define __ONERT_BACKEND_ITENSOR_BUILDER_H__ - -#include - -#include "ir/Index.h" -#include "ir/OperandInfo.h" -#include "ir/Operation.h" -#include "ir/Layout.h" -#include "ITensor.h" -#include "ITensorManager.h" -#include "ITensorRegistry.h" -#include "IDynamicTensorManager.h" - -namespace onert -{ -namespace backend -{ - -struct ITensorBuilder -{ - using IterateFunction = std::function; - - virtual ~ITensorBuilder(void) = default; - - /** - * @brief Register tensor information to allocate on backend - * - * @param ind Index - * @param info Info - * @param backend_layout Backend layout - * @param as_const Whether this tensor is constant - */ - virtual void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, - ir::Layout backend_layout) = 0; - - /** - * @brief Check if the tensor has been registered with @c registerTensorInfo - * - * @return true If the tensor has been registered - * @return false Otherwise - */ - virtual bool isRegistered(const ir::OperandIndex &) const = 0; - -public: // methods for static tensor allocation - /** - * @brief Let the tensor builder know first use(start of lifetime) of a tensor - * Must be called before calling @c prepare - * Must be run up to once for each tensor before calling @c notifyLastUse - * NOTE: Useful only for static models - */ - virtual void notifyFirstUse(const ir::OperandIndex &) = 0; - /** - * @brief Let the tensor builder know last use(end of lifetime) of a tensor - * Must be run up to once for each tensor after calling @c notifyFirstUse - * NOTE: Useful only for static models - */ - virtual void notifyLastUse(const ir::OperandIndex &) = 0; - /** - * @brief Prepare the tensors - * Before calling this, all the tensors must be registered - */ - virtual void prepare(void) = 0; - /** - * @brief Allocate the tensors - * Before calling this, @c prepare must be called - */ - virtual void allocate() = 0; - /** - * @brief Some actions after functions' @c IFunction::prepare method. - * This is called right after each function's @c IFunction::prepare function has been - * called. - */ - virtual void postFunctionPrepare() = 0; - -public: // methods for dynamic tensor allocation - /** - * @brief Get dynamicTensorManager. If a backend does not support dynamic tensor, exception - * will be thrown. - * - * @return pointer of IDynamicTensorManager object - * - * @note Since it is a pointer, its life time is from the cration of TensorBuilder - * to the end of execution - */ - virtual IDynamicTensorManager *dynamicTensorManager(void) { return nullptr; } -}; - -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_ITENSOR_BUILDER_H__ diff --git a/runtime/onert/core/include/backend/ITensorRegister.h b/runtime/onert/core/include/backend/ITensorRegister.h deleted file mode 100644 index b8e521c..0000000 --- a/runtime/onert/core/include/backend/ITensorRegister.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_ITENSOR_REGISTER_H__ -#define __ONERT_BACKEND_ITENSOR_REGISTER_H__ - -#include "ir/LowerInfoMap.h" -#include "ITensorBuilder.h" -#include "ir/Layout.h" -#include "ir/OperandIndexSequence.h" -#include "ir/OperandInfo.h" -#include "ir/Operands.h" -#include "ir/OperationVisitor.h" - -namespace onert -{ -namespace backend -{ - -class ITensorRegister : public ir::OperationVisitor -{ -public: - virtual ~ITensorRegister() = default; - -public: - void registerTensors(const ir::OpSequence &op_seq, const ir::LowerInfoMap *lower_info_map) - { - _current_op_seq_layout = op_seq.getLayout(); - _lower_info_map = lower_info_map; - assert(_lower_info_map != nullptr); - assert(tensor_builder().get() != nullptr); - op_seq.accept(*this); - } - -protected: - virtual const ir::Operands &operands() const = 0; - virtual std::shared_ptr tensor_builder() const = 0; - -protected: -#define OP(InternalName) \ - void visit(const ir::operation::InternalName &node) override \ - { \ - for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs()) \ - { \ - defaultRegisterTensorInfo(ind); \ - } \ - } -#include "ir/Operations.lst" -#undef OP - -protected: - void defaultRegisterTensorInfo(const ir::OperandIndex &index) const - { - if (tensor_builder()->isRegistered(index)) - { - return; - } - - const auto &obj = operands().at(index); - const auto frontend_layout = frontendLayout(); - const auto backend_layout = backendLayout(index); - ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), - obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; - tensor_builder()->registerTensorInfo(index, backend_info, backend_layout); - } - -protected: - ir::Layout frontendLayout() const { return _current_op_seq_layout; } - ir::Layout backendLayout(const ir::OperandIndex &index) const - { - assert(_lower_info_map != nullptr); - const auto lower_info = _lower_info_map->operand.at(index).get(); - return lower_info->def_factors().getOnlyElement().layout(); - } - -private: - ir::Layout _current_op_seq_layout; - const ir::LowerInfoMap *_lower_info_map{nullptr}; -}; - -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_ITENSOR_REGISTER_H__ diff --git a/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h b/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h new file mode 100644 index 0000000..19e7b7c --- /dev/null +++ b/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__ +#define __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__ + +#include + +#include "ir/Index.h" +#include "ir/OpSequences.h" +#include "ir/LowerInfoMap.h" +#include "util/logging.h" + +namespace onert +{ +namespace backend +{ +namespace cpu_common +{ + +// TODO Remove the template param BackendContext once unification of cpu backend context is done +template +void planTensors(const T_BackendContext &ctx, const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info) +{ + auto graph = ctx.graph(); + auto tensor_builder = ctx.tensor_builder; + + ir::OperandIndexMap uses_map; + ir::OperandIndexMap def_map; + ir::OperandIndexSequence constants; + + auto model_io = + (graph->getInputs() + graph->getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED; + + // Prepare scanning + for (auto ind : ctx.operand_list()) + { + if (model_io.contains(ind)) + continue; + const auto &obj = graph->operands().at(ind); + const auto &li = lower_info.operand.at(ind); + if (li->def_factors().getOnlyElement().backend() != ctx.backend()) + continue; + + // Ignore unused tensor + if (li->def_factors().size() == 0 && li->use_factors().size() == 0) + { + VERBOSE_F() << "Operand #" << ind.value() << " will not be used. no more process." + << std::endl; + return; + } + + uses_map[ind] = obj.getUses().size(); + def_map[ind] = obj.getDef().valid() ? 1 : 0; + + if (obj.isConstant()) + constants.append(ind); + + auto factor = li->def_factors().getOnlyElement(); + if (!tensor_builder->isRegistered(ind)) + { + // These tensors do not exist in any op_seq (No use and def) + const auto info = obj.info(); + const auto backend_layout = factor.layout(); + // TODO Change tensor info to have permuted shape + tensor_builder->registerTensorInfo(ind, info, backend_layout); + } + } + + // Start scanning to do notify{First|Last}Use for each tensor + + // If a tensor is a constant, increase the use of the tensor and allocate it first. + // Increasing use count here makes the tensor never be deallocated, i.e it they will be + // deallocated last. + for (const auto &ind : constants) + { + uses_map[ind]++; + tensor_builder->notifyFirstUse(ind); + } + + // At each operation, + // 1. Scan DEF of outputs. If the DEF, allocate it + // 2. Scan DEF of inputs. If variable tensor, allocate it + // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 + for (const auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + for (const auto &op_idx : op_seq.operations()) + { + auto op_inputs = graph->operations().at(op_idx).getInputs() | ir::Remove::DUPLICATED | + ir::Remove::UNDEFINED; + auto op_outputs = graph->operations().at(op_idx).getOutputs() | ir::Remove::DUPLICATED | + ir::Remove::UNDEFINED; + + // Define outputs + for (const auto &ind : op_outputs) + { + if (model_io.contains(ind)) + continue; + if (!tensor_builder->isRegistered(ind)) + continue; + assert(def_map.find(ind) != def_map.end()); + if (def_map[ind]) + { + def_map[ind] = 0; + tensor_builder->notifyFirstUse(ind); + } + } + + // Scan variable tensors + // This tensor has features like constant. 
But OperandInfo and LowerInfo treat them as + // non-constant because of less memory usage by memory planning in here + for (const auto &ind : op_inputs) + { + if (model_io.contains(ind)) + continue; + if (!tensor_builder->isRegistered(ind)) + continue; + const auto &operand = graph->operands().at(ind); + if (operand.info().isVariable()) + { + // The variable tensor with buffer is not supported yet + assert(operand.data() == nullptr); + assert(operand.getUses().size() == 1 && !operand.getDef().valid()); + assert(lower_info.operand.at(ind)->def_factors().size() == 1 && + lower_info.operand.at(ind)->use_factors().size() == 1); + assert(uses_map[ind] == 1 && def_map[ind] == 0); + tensor_builder->notifyFirstUse(ind); + } + } + + for (const auto &ind : op_inputs) + { + if (model_io.contains(ind)) + continue; + if (!tensor_builder->isRegistered(ind)) + continue; + assert(uses_map.find(ind) != uses_map.end()); + assert(uses_map[ind] > 0); + uses_map[ind]--; + if (uses_map[ind] == 0) + { + // plan for deallocation of static tensornode + tensor_builder->notifyLastUse(ind); + + // plan for deallocation of dynamic tensor + auto dyn_tensor_manager = tensor_builder->dynamicTensorManager(); + auto *tensor = ctx.tensor_registry->getITensor(ind); + assert(tensor); + dyn_tensor_manager->planDealloc(op_idx, tensor); + } + } + } + } + + // Dispose and validate + for (const auto &ind : constants) + { + --uses_map[ind]; + if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice + { + tensor_builder->notifyLastUse(ind); + } + } + + assert( + std::all_of(uses_map.begin(), uses_map.end(), + [](std::pair it) { return it.second == 0; })); + + assert( + std::all_of(def_map.begin(), def_map.end(), + [](std::pair it) { return it.second == 0; })); +} + +} // namespace cpu_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__ diff --git a/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h b/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h new file mode 100644 index 0000000..6793555 --- /dev/null +++ b/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
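The planTensors() helper above drives the first/last-use notifications from simple reference counts: remaining uses of each operand are counted down as operations are walked in execution order, constants get one extra use so they are released only at the very end, and a tensor's lifetime closes when its count reaches zero. A stripped-down sketch of that counting scheme, with hypothetical names and plain ints instead of onert types:

// Hedged sketch of use-count driven lifetime planning, loosely mirroring
// cpu_common::planTensors(). The "firstUse"/"lastUse" prints stand in for
// tensor_builder->notifyFirstUse()/notifyLastUse().
#include <iostream>
#include <map>
#include <vector>

struct Op { std::vector<int> inputs, outputs; };

int main()
{
  // Two ops: t0 -> op0 -> t1, then {t1, t3 (constant)} -> op1 -> t2.
  std::vector<Op> order = {{{0}, {1}}, {{1, 3}, {2}}};
  std::map<int, int> uses = {{0, 1}, {1, 1}, {2, 0}, {3, 1}};
  std::vector<int> constants = {3};

  std::cout << "firstUse t0\n"; // graph input: assumed live from the start (simplified)

  // Constants get one extra use so they are only released at the very end.
  for (int c : constants) { ++uses[c]; std::cout << "firstUse t" << c << "\n"; }

  for (const auto &op : order)
  {
    for (int out : op.outputs) std::cout << "firstUse t" << out << "\n"; // DEF of outputs
    for (int in : op.inputs)
      if (--uses[in] == 0) std::cout << "lastUse t" << in << "\n"; // last USE drained
  }

  // Dispose the artificial reference that kept constants alive.
  for (int c : constants)
    if (--uses[c] == 0) std::cout << "lastUse t" << c << "\n";
}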
+ */ + +#ifndef __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__ +#define __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__ + +#include "TensorRegistry.h" + +#include "ConstantInitializerBase.h" +#include + +namespace onert +{ +namespace backend +{ +namespace cpu_common +{ + +class ConstantInitializer : public ConstantInitializerBase +{ +public: + ConstantInitializer(const ir::Operands &operands, + const std::shared_ptr &tensor_reg); + +public: + void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; + + // TODO: For now the only cpu backend supports constant tensor to use data from external + // If the other backend supports (to do this, + // ExternalTensor should be abstract such as IExternal, maybe), + // this can be an interface of cpu_common::ConstantInitializerBase + void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); + +private: + std::shared_ptr tensor_registry() const override { return _tensor_reg; } + +private: + std::shared_ptr _tensor_reg; +}; + +} // namespace cpu_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__ diff --git a/runtime/onert/core/include/backend/IConstantInitializer.h b/runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h similarity index 90% rename from runtime/onert/core/include/backend/IConstantInitializer.h rename to runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h index 149acec..d4c65de 100644 --- a/runtime/onert/core/include/backend/IConstantInitializer.h +++ b/runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h @@ -14,20 +14,21 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_ICONSTANT_INITIALIZER_H__ -#define __ONERT_BACKEND_ICONSTANT_INITIALIZER_H__ +#ifndef __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__ +#define __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__ #include #include -#include "ITensorBuilder.h" #include "ir/Coordinates.h" #include "ir/Layout.h" #include "ir/Operand.h" #include "ir/Operands.h" #include "ir/OperationVisitor.h" #include "ir/OpSequence.h" +#include "backend/ITensorRegistry.h" #include "util/logging.h" +#include "backend/ITensorRegistry.h" namespace { @@ -153,11 +154,13 @@ namespace onert { namespace backend { +namespace cpu_common +{ -class IConstantInitializer : public ir::OperationVisitor +class ConstantInitializerBase : public ir::OperationVisitor { public: - virtual ~IConstantInitializer() = default; + virtual ~ConstantInitializerBase() = default; public: void run() @@ -178,15 +181,15 @@ public: } public: - IConstantInitializer(const ir::Operands &operands) - : _operands{operands}, _current_op_seq_layout{ir::Layout::UNKNOWN} + ConstantInitializerBase(const ir::Operands &operands) + : _operands{operands}, _current_layout{ir::Layout::UNKNOWN} { } public: using Initializer = std::function; - void setLayout(ir::Layout layout) { _current_op_seq_layout = layout; } + void setLayout(ir::Layout layout) { _current_layout = layout; } protected: virtual std::shared_ptr tensor_registry() const = 0; @@ -221,10 +224,11 @@ public: protected: const ir::Operands &_operands; std::unordered_map _init_map; - ir::Layout _current_op_seq_layout; // TODO Rename this to _current_layout + ir::Layout _current_layout; }; +} // namespace cpu_common } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_ICONSTANT_INITIALIZER_H__ +#endif // __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__ diff 
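The ConstantInitializerBase rename above keeps the underlying pattern intact: an initializer closure is recorded per constant operand, and run() later applies each closure to the tensor found in the registry. A self-contained sketch of that map-of-initializers idea, with invented ToyOperand/ToyTensor types rather than ir::Operand and ITensor:

// Hedged sketch of the _init_map pattern behind ConstantInitializerBase.
#include <functional>
#include <iostream>
#include <map>
#include <vector>

struct ToyOperand { std::vector<float> data; };
struct ToyTensor { std::vector<float> buffer; };

class ToyConstantInitializer
{
public:
  // Record how to fill a given operand's tensor; nothing runs yet.
  void registerDefaultInitializer(int ind, const ToyOperand &obj)
  {
    _init_map[ind] = [&obj](ToyTensor &t) { t.buffer = obj.data; }; // plain copy here
  }

  // Apply every recorded initializer to the tensor found in the registry.
  void run(std::map<int, ToyTensor> &registry)
  {
    for (auto &it : _init_map)
      it.second(registry.at(it.first));
    _init_map.clear(); // drop the closures once applied (simplified)
  }

private:
  std::map<int, std::function<void(ToyTensor &)>> _init_map;
};

int main()
{
  ToyOperand weights{{1.f, 2.f, 3.f}};
  std::map<int, ToyTensor> registry;
  registry[7] = ToyTensor{};
  ToyConstantInitializer init;
  init.registerDefaultInitializer(7, weights);
  init.run(registry);
  std::cout << registry[7].buffer.size() << " values initialized\n";
}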
--git a/runtime/onert/core/include/backend/IKernelGenerator.h b/runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h similarity index 83% rename from runtime/onert/core/include/backend/IKernelGenerator.h rename to runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h index afc34ec..49a5897 100644 --- a/runtime/onert/core/include/backend/IKernelGenerator.h +++ b/runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h @@ -14,28 +14,30 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_IKERNEL_GENERATOR_H__ -#define __ONERT_BACKEND_IKERNEL_GENERATOR_H__ +#ifndef __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__ +#define __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__ #include #include #include -#include "ITensorBuilder.h" #include "ir/OperationVisitor.h" #include "ir/OpSequence.h" #include #include "exec/FunctionSequence.h" +#include "backend/ITensorRegistry.h" namespace onert { namespace backend { +namespace cpu_common +{ -class IKernelGenerator : public ir::OperationVisitor +class KernelGeneratorBase : public ir::OperationVisitor { public: - virtual ~IKernelGenerator() = default; + virtual ~KernelGeneratorBase() = default; std::unique_ptr releaseFunction() { @@ -70,7 +72,8 @@ protected: std::unique_ptr _return_fn_seq; // TODO Extract this out }; +} // namespace cpu_common } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_IKERNEL_GENERATOR_H__ +#endif // __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__ diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h index fa50b55..850bcf2 100644 --- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h @@ -17,9 +17,11 @@ #ifndef __ONERT_BACKEND_CPU_COMMON_STATICTENSOR_MANAGER_H__ #define __ONERT_BACKEND_CPU_COMMON_STATICTENSOR_MANAGER_H__ -#include "MemoryManager.h" - #include "backend/IStaticTensorManager.h" +#include "backend/cpu_common/DynamicTensorManager.h" +#include "backend/cpu_common/MemoryManager.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "backend/ITensorManager.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "TensorRegistry.h" @@ -37,12 +39,10 @@ class StaticTensorManager : public backend::IStaticTensorManager { public: StaticTensorManager(const std::shared_ptr ®, - DynamicMemoryManager *dynamic_mem_mgr); + DynamicTensorManager *dynamic_tensor_manager); virtual ~StaticTensorManager() = default; - void allocateConsts(void); void allocateNonconsts(void); - void deallocateConsts(void); void deallocateNonconsts(void); void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, @@ -54,11 +54,10 @@ public: void iterate(const std::function &fn); private: - std::unique_ptr _const_mgr; std::unique_ptr _nonconst_mgr; const std::shared_ptr _tensors; ir::OperandIndexMap _as_constants; - DynamicMemoryManager *_dynamic_mem_mgr; + DynamicTensorManager *_dynamic_tensor_manager; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/cpu_common/Tensor.h index 5fa20e1..5fbf4e7 100644 --- a/runtime/onert/core/include/backend/cpu_common/Tensor.h +++ b/runtime/onert/core/include/backend/cpu_common/Tensor.h @@ -21,6 +21,7 @@ #include #include +#include namespace onert { @@ -177,6 +178,91 @@ private: std::shared_ptr _allocator; }; +/** + * @brief Class that 
uses data from external memory that is not managed by a backend + * instead of allocating and copying the data. ExternalTensor's data pointer points to + * an address of memory such as where memory is already allocated, or mmapped area. + * This is meaning that ExternalTensor can take all of types' ir::Data. + * To support this, assume below things no padding, always NHWC layout, + * constant tensor and not dynamic. + */ +class ExternalTensor : public Tensor +{ +public: + ExternalTensor() = delete; + virtual ~ExternalTensor(); + +public: + ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) + : Tensor(info, layout, nullptr) + { + assert(_layout == ir::Layout::NHWC); + assert(_info.isConstant()); + assert(_info.isDynamic() == false); + } + +public: + /** + * @brief set Data to be shared from external so that this ExternalTensor will not be + * allocated on CPU backend + * @param[in] data data of Operand to be set + */ + void setData(const std::shared_ptr data) + { + assert(data != nullptr); + _data = data; + // Note. Some op such as cker::Conv could take buffer as nullptr. + // That's why _buffer also would be used + _buffer = const_cast(_data->base()); + } + +public: + uint8_t *buffer() const override { return _buffer; } + + bool is_constant() const override { return true; } + bool is_dynamic() const override { return false; } + void set_dynamic() override + { + throw std::runtime_error("This tensor does not support changing dynamic"); + } + + void setShape(const ir::Shape &) override + { + throw std::runtime_error("This tensor does not support changing shape"); + } + + void increase_ref() override { ++_num_references; } + + void decrease_ref() override + { + assert(_data != nullptr); + assert(_num_references > 0); + --_num_references; + if (_num_references == 0) + { + _data.reset(); + _buffer = nullptr; + } + } + + /** + * @brief Reset reference count to zero and release data + */ + void reset_ref() override + { + assert(_data != nullptr); + assert(_num_references > 0); + _num_references = 0; + + _data.reset(); + _buffer = nullptr; + } + + int32_t num_references() override { return _num_references; } + +private: + std::shared_ptr _data; +}; } // namespace cpu_common } // namespace backend } // namespace onert diff --git a/runtime/onert/core/include/compiler/BackendManager.h b/runtime/onert/core/include/compiler/BackendManager.h index af13d13..7850e21 100644 --- a/runtime/onert/core/include/compiler/BackendManager.h +++ b/runtime/onert/core/include/compiler/BackendManager.h @@ -34,7 +34,7 @@ class BackendManager public: using backend_create_t = backend::Backend *(*)(); using backend_destroy_t = void (*)(backend::Backend *); - using dlhandle_destroy_t = void (*)(void *); + using dlhandle_destroy_t = std::function; static BackendManager &get(); diff --git a/runtime/onert/core/include/compiler/Compiler.h b/runtime/onert/core/include/compiler/Compiler.h index 3098be7..68b862d 100644 --- a/runtime/onert/core/include/compiler/Compiler.h +++ b/runtime/onert/core/include/compiler/Compiler.h @@ -24,6 +24,7 @@ #include "ir/Graph.h" #include "exec/IExecutor.h" +#include "util/TracingCtx.h" namespace onert { @@ -48,7 +49,6 @@ struct CompilerOptions { // GENERAL OPTIONS std::vector backend_list; - bool is_primary_subgraph; // TODO Remove this out of this struct as it is not user-given option // OPTIONS ONLY FOR DEBUGGING/PROFILING std::string trace_filepath; //< File path to save trace records @@ -60,6 +60,8 @@ struct CompilerOptions bool he_profiling_mode; //< Whether HEScheduler 
profiling mode ON/OFF bool disable_compile; //< Run with Interpreter if true, try compilation otherwise bool fp16_enable; //< Whether fp16 mode ON/OFF + + util::TracingCtx *tracing_ctx; //< Profiling information }; CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs); @@ -73,8 +75,9 @@ public: /** * @brief Construct a new Compiler object * @param[in] subgs All subgraphs of a model + * @param[in] tracing_ctx Profiling information */ - Compiler(const std::shared_ptr &subgs); + Compiler(const std::shared_ptr &subgs, util::TracingCtx *tracing_ctx); public: /** diff --git a/runtime/onert/core/include/compiler/LoweredGraph.h b/runtime/onert/core/include/compiler/LoweredGraph.h index aadba68..f115ab9 100644 --- a/runtime/onert/core/include/compiler/LoweredGraph.h +++ b/runtime/onert/core/include/compiler/LoweredGraph.h @@ -67,8 +67,7 @@ private: const compiler::BackendResolver &backend_resolver); void manipulateLowerInfo( - ir::OperandIndexMap> &operands_lower_info, - bool is_primary); + ir::OperandIndexMap> &operands_lower_info); void dumpLowerInfo(); bool mergeable(const ir::OpSequenceIndex &op_seq_index, const ir::OperationIndex &node_index, ir::Layout layout, const compiler::BackendResolver &backend_resolver); diff --git a/runtime/onert/core/include/compiler/StaticShapeInferer.h b/runtime/onert/core/include/compiler/StaticShapeInferer.h index 05f2679..33a2f62 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInferer.h +++ b/runtime/onert/core/include/compiler/StaticShapeInferer.h @@ -68,7 +68,7 @@ private: private: // TODO Define visitors for operations. List them in alphabetic order. - void visit(const ir::operation::ArgMax &op) override; + void visit(const ir::operation::ArgMinMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; void visit(const ir::operation::BCQFullyConnected &op) override; void visit(const ir::operation::BCQGather &op) override; diff --git a/runtime/onert/core/include/exec/DynamicShapeInferer.h b/runtime/onert/core/include/exec/DynamicShapeInferer.h index d2eb831..1f3a13b 100644 --- a/runtime/onert/core/include/exec/DynamicShapeInferer.h +++ b/runtime/onert/core/include/exec/DynamicShapeInferer.h @@ -49,7 +49,7 @@ public: public: // TODO Define visitors for operations. List them in alphabetic order. 
// Remove TODO when any op starting from the alphabet is added - void visit(const ir::operation::ArgMax &op) override; + void visit(const ir::operation::ArgMinMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; void visit(const ir::operation::BCQFullyConnected &op) override; void visit(const ir::operation::BCQGather &op) override; diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h index 1d2831d..345bec8 100644 --- a/runtime/onert/core/include/exec/IExecutor.h +++ b/runtime/onert/core/include/exec/IExecutor.h @@ -18,17 +18,32 @@ * @file IExecutor.h * @brief This file defines interface of Executor */ -#ifndef __ONERT_EXEC_I_EXECUTOR_H_ -#define __ONERT_EXEC_I_EXECUTOR_H_ +#ifndef __ONERT_EXEC_I_EXECUTOR_H__ +#define __ONERT_EXEC_I_EXECUTOR_H__ #include "ir/Graph.h" #include "IFunction.h" #include "IODescription.h" +#include "ir/Index.h" #include "ir/OperationIndexMap.h" -#include "backend/IDynamicTensorManager.h" + +#include +#include +#include namespace onert { +namespace backend +{ +class IPortableTensor; +namespace controlflow +{ +class IOTensor; +} +} +} +namespace onert +{ namespace exec { class IExecutionObserver; @@ -60,11 +75,29 @@ struct IExecutor virtual void setIndexedRanks(std::shared_ptr>) = 0; /** - * @brief Start execution + * @brief Execute with user-given input/output description (for primary subgraph) * @param[in] desc Input and output description * @note This method should be thread-safe */ virtual void execute(const IODescription &desc) = 0; + + /** + * @brief Execute with given input/output tensors + * + * For non-primary subgraphs, input and output tensors must be given. + * + * @param[in] inputs tensors that are passed as inputs + * @param[in] outputs tensors that are passed as outputs + */ + virtual void execute(const std::vector &inputs, + const std::vector &outputs) = 0; + + /** + * @brief Get output tensor objects + * + * @return Vector of @c IOTensor + */ + virtual const std::vector &getOutputTensors() const = 0; }; using ExecutorMap = std::unordered_map>; @@ -72,4 +105,4 @@ using ExecutorMap = std::unordered_map &&source); +void config_source_ext(std::unique_ptr &&source); bool toBool(const std::string &val); int toInt(const std::string &val); diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h index 701b835..b11da90 100644 --- a/runtime/onert/core/include/util/ShapeInference.h +++ b/runtime/onert/core/include/util/ShapeInference.h @@ -42,7 +42,7 @@ using Shapes = std::vector; // Define shape calculation for operations. List them in alphabetic order. 
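Among the shape-calculation helpers listed just below, inferArgMaxShape is renamed to inferArgMinMaxShape and inferFillShape becomes a template over the shape buffer's element type. As a rough illustration of what such helpers compute (the exact onert behaviour, e.g. axis normalization and error handling, may differ), an ArgMin/ArgMax output shape is the input shape with the reduced axis removed, and Fill's output shape is read straight from its shape buffer:

// Hedged sketch only: std::vector<int> stands in for ir::Shape.
#include <cstdint>
#include <vector>

std::vector<int> sketchArgMinMaxShape(const std::vector<int> &input, int axis)
{
  if (axis < 0) axis += static_cast<int>(input.size()); // allow negative axis
  std::vector<int> out;
  for (int i = 0; i < static_cast<int>(input.size()); ++i)
    if (i != axis) out.push_back(input[i]); // the reduced axis disappears
  return out;
}

// Templated on the shape buffer's element type (e.g. int32_t or int64_t),
// which is the point of the "template <typename T>" change below.
template <typename T>
std::vector<int> sketchFillShape(const T *shape_buf, int rank)
{
  std::vector<int> out(rank);
  for (int i = 0; i < rank; ++i) out[i] = static_cast<int>(shape_buf[i]);
  return out;
}

// Example: ArgMax over axis 1 of a [2, 5, 3] tensor yields [2, 3];
// a Fill whose shape input holds {4, 4} yields [4, 4].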
-ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank); +ir::Shape inferArgMinMaxShape(const ir::Shape &input_shape, int axis, int rank); ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape, const ir::operation::BatchMatMul::Param ¶m); @@ -70,7 +70,7 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis); -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf); +template ir::Shape inferFillShape(const ir::Shape &fill_shape, const T *shape_buf); ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &ker_shape); diff --git a/runtime/onert/core/include/util/TracingCtx.h b/runtime/onert/core/include/util/TracingCtx.h new file mode 100644 index 0000000..a82704c --- /dev/null +++ b/runtime/onert/core/include/util/TracingCtx.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_UTIL_TRACING_CTX_H__ +#define __ONERT_UTIL_TRACING_CTX_H__ + +#include "ir/Graph.h" +#include "ir/Index.h" +#include "ir/Subgraphs.h" + +#include +#include + +namespace onert +{ +namespace util +{ + +/** + * @brief Class to maintain information about profiling per session + */ +class TracingCtx +{ +public: + /** + * @brief Create and store unique session id managed by this class + * Note that this constructor can be called by multiple sessions running in parallely. + * Use this constructor only when there is only one subgraph in a model. + */ + TracingCtx(const ir::Graph *primary_subgraph) + { + decideSessionID(); + _subgraph_indices.emplace(primary_subgraph, 0); + } + + /** + * @brief Create and store unique session id managed by this class + * Note that this constructor can be called by multiple sessions running in parallely. + */ + TracingCtx(const onert::ir::Subgraphs *subgraphs) + { + assert(subgraphs); + + decideSessionID(); + + auto count = subgraphs->count(); + for (size_t i = 0; i < count; i++) + _subgraph_indices.emplace(subgraphs->at(onert::ir::SubgraphIndex(i)).get(), i); + } + + uint32_t getSessionId() const { return _session_id; } + + /** + * @brief Set subgraph index of a graph + */ + void setSubgraphIndex(const ir::Graph *g, uint32_t index) { _subgraph_indices.emplace(g, index); } + + /** + * @brief Get subgraph index of a graph. 
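TracingCtx hands every session a unique id by bumping a function-local static counter under a mutex (decideSessionID() below), so sessions created concurrently still get distinct ids. The same guarded-counter idiom in isolation, with illustrative names rather than the onert ones:

// Hedged sketch of the session-id scheme used by util::TracingCtx.
#include <cstdint>
#include <iostream>
#include <mutex>

static std::mutex session_id_mutex;

uint32_t nextSessionId()
{
  std::lock_guard<std::mutex> lock{session_id_mutex};
  static uint32_t next = 0; // shared across all sessions
  return next++;
}

int main()
{
  std::cout << nextSessionId() << " " << nextSessionId() << "\n"; // 0 1
}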
+ */ + ir::SubgraphIndex getSubgraphIndex(const ir::Graph *g) const { return _subgraph_indices.at(g); } + +private: + void decideSessionID() + { + std::unique_lock lock{_session_id_mutex}; + + static uint32_t next_session_id = 0; + _session_id = next_session_id++; + } + +private: + std::unordered_map _subgraph_indices; + uint32_t _session_id; + static std::mutex _session_id_mutex; +}; + +} // namespace util +} // namespace onert + +#endif // __ONERT_UTIL_TRACING_CTX_H__ diff --git a/runtime/onert/core/include/util/logging.h b/runtime/onert/core/include/util/logging.h index 76cfb8d..65c3750 100644 --- a/runtime/onert/core/include/util/logging.h +++ b/runtime/onert/core/include/util/logging.h @@ -64,4 +64,11 @@ static Context &ctx = Context::get(); if (::onert::util::logging::ctx.enabled()) \ std::cout << "[" << __func__ << "] " +#define WHEN_LOG_ENABLED(METHOD) \ + if (::onert::util::logging::ctx.enabled()) \ + do \ + { \ + METHOD; \ + } while (0) + #endif // __ONERT_UTIL_LOGGING_H__ diff --git a/runtime/onert/core/src/backend/BackendContext.cc b/runtime/onert/core/src/backend/BackendContext.cc index bafa36d..404c3b1 100644 --- a/runtime/onert/core/src/backend/BackendContext.cc +++ b/runtime/onert/core/src/backend/BackendContext.cc @@ -17,7 +17,6 @@ #include "backend/BackendContext.h" #include "ir/Operation.h" -#include "backend/IConstantInitializer.h" namespace onert { @@ -31,25 +30,5 @@ void BackendContext::initialize(const std::vector &operation_list _operand_list = operand_list; } -void BackendContext::initConsts() -{ - for (auto &op : _operation_list) - { - constant_initializer->setLayout(op.layout); - _graph->operations().at(op.index).accept(*constant_initializer); - } - - for (auto ind : _operand_list) - { - const auto &obj = _graph->operands().at(ind); - if (obj.isConstant() && !constant_initializer->exist(ind)) - { - constant_initializer->registerDefaultInitializer(ind, obj); - } - } - - constant_initializer->run(); -} - } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/Backend.h b/runtime/onert/core/src/backend/controlflow/Backend.h index cc8346e..3323cf5 100644 --- a/runtime/onert/core/src/backend/controlflow/Backend.h +++ b/runtime/onert/core/src/backend/controlflow/Backend.h @@ -72,8 +72,6 @@ public: context->constant_initializer = std::make_shared(operands, tr); context->kernel_gen = std::make_shared(graph, tb->dynamicTensorManager(), tr, context->external_context()); - context->tensor_register = nullptr; - context->optimizer = nullptr; return context; } diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.cc b/runtime/onert/core/src/backend/controlflow/BackendContext.cc new file mode 100644 index 0000000..366377e --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/BackendContext.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BackendContext.h" + +#include "KernelGenerator.h" +#include "backend/cpu_common/BackendContextHelpers.h" + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +void BackendContext::initConsts() +{ + for (auto &op : operation_list()) + { + constant_initializer->setLayout(op.layout); + graph()->operations().at(op.index).accept(*constant_initializer); + } + + for (auto ind : operand_list()) + { + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + } + + constant_initializer->run(); +} + +ITensorRegistry *BackendContext::genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) +{ + auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED | + ir::Remove::DUPLICATED; + for (auto index : operand_list()) + { + if (model_io.contains(index)) + continue; + const auto &obj = graph()->operands().at(index); + const auto frontend_layout = [&]() { + if (obj.getUses().size() == 0) + return ir::Layout::UNKNOWN; + auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses? + for (auto &operation_info : operation_list()) + { + if (operation_info.index == use_op_ind) + return operation_info.layout; + } + return ir::Layout::UNKNOWN; + }(); + const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement(); + if (permute_factor.backend() != backend()) + continue; + const auto backend_layout = permute_factor.layout(); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + tensor_builder->registerTensorInfo(index, backend_info, backend_layout); + } + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if (util::getConfigString(util::config::EXECUTOR) == "Linear") + { + cpu_common::planTensors(*this, order, op_seqs, lower_info); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + for (auto ind : operand_list()) + { + if (tensor_builder->isRegistered(ind)) + tensor_builder->notifyFirstUse(ind); + } + } + + tensor_builder->prepare(); + + return tensor_registry.get(); +} + +FunctionMap BackendContext::genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) +{ + FunctionMap ret; + + for (auto op_seq_ind : order) + { + const auto &op_seq = op_seqs.at(op_seq_ind); + bool assigned = [&]() { + for (auto op_info : operation_list()) + if (op_seq.exist(op_info.index)) + return true; + return false; + }(); + if (!assigned) + continue; + auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind)); + ret.emplace_back(op_seq_ind, std::move(fn_seq)); + } + + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + for (auto ind : operand_list()) + { + // TODO Remove const_cast + auto &obj = const_cast(graph())->operands().at(ind); + obj.releaseData(); + } + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return ret; +} + +} // namespace controlflow +} // namespace backend +} // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.h b/runtime/onert/core/src/backend/controlflow/BackendContext.h index 3647338..a768d5d 
100644 --- a/runtime/onert/core/src/backend/controlflow/BackendContext.h +++ b/runtime/onert/core/src/backend/controlflow/BackendContext.h @@ -18,6 +18,9 @@ #define __ONERT_BACKEND_CONTROLFLOW_BACKEND_CONTEXT_H__ #include +#include "TensorBuilder.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" #include "ExternalContext.h" namespace onert @@ -32,21 +35,36 @@ class BackendContext : public onert::backend::BackendContext public: BackendContext(const Backend *backend, const ir::Graph *graph, std::shared_ptr tensor_registry = nullptr, - std::shared_ptr tensor_builder = nullptr, - std::shared_ptr constant_initializer = nullptr, - std::shared_ptr kernel_gen = nullptr, - std::shared_ptr tensor_register = nullptr, - std::shared_ptr optimizer = nullptr) - : onert::backend::BackendContext(backend, graph, tensor_registry, tensor_builder, - constant_initializer, kernel_gen, tensor_register, - optimizer), - _external_context(std::make_shared()) + std::shared_ptr tensor_builder = nullptr, + std::shared_ptr constant_initializer = nullptr, + std::shared_ptr kernel_gen = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, + kernel_gen{kernel_gen}, _external_context(std::make_shared()) { } + ITensorRegistry *genTensors(const std::vector &order, + const ir::OpSequences &op_seqs, + const ir::LowerInfoMap &lower_info) override; + + FunctionMap genKernels(const std::vector &order, + const ir::OpSequences &op_seqs) override; + std::shared_ptr external_context() { return _external_context; } private: + void initConsts(); + void planTensors(const std::vector &order, + const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info); + +public: + // TODO Make it private + std::shared_ptr tensor_builder; + std::shared_ptr constant_initializer; + std::shared_ptr kernel_gen; + +private: // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, // the thread pool is also created in duplicate // TODO Create one ruy context for session diff --git a/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h b/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h index e21a8f3..ac97ef9 100644 --- a/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h +++ b/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h @@ -17,10 +17,7 @@ #ifndef __ONERT_COMPILER_CONTROLFLOW_CONSTANT_INITIALIZER_H__ #define __ONERT_COMPILER_CONTROLFLOW_CONSTANT_INITIALIZER_H__ -#include "TensorRegistry.h" - -#include -#include +#include namespace onert { @@ -29,21 +26,7 @@ namespace backend namespace controlflow { -class ConstantInitializer : public IConstantInitializer -{ -public: - ConstantInitializer(const ir::Operands &operands, - const std::shared_ptr &tensor_reg) - : IConstantInitializer{operands}, _tensor_reg{tensor_reg} - { - } - -private: - std::shared_ptr tensor_registry() const override { return _tensor_reg; } - -private: - std::shared_ptr _tensor_reg; -}; +using ConstantInitializer = cpu_common::ConstantInitializer; } // namespace controlflow } // namespace backend diff --git a/runtime/onert/core/src/backend/controlflow/ExternalContext.h b/runtime/onert/core/src/backend/controlflow/ExternalContext.h index 3db6829..cfb9831 100644 --- a/runtime/onert/core/src/backend/controlflow/ExternalContext.h +++ b/runtime/onert/core/src/backend/controlflow/ExternalContext.h @@ -17,7 +17,6 @@ #ifndef __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ #define 
__ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ -#include #include #include @@ -38,7 +37,7 @@ namespace controlflow { // TODO Unify this with cpu::ExternalContext -class ExternalContext : public IExternalContext +class ExternalContext { public: ExternalContext() : _ruy_context(std::make_unique()) diff --git a/runtime/onert/core/src/backend/controlflow/IOTensor.cc b/runtime/onert/core/src/backend/controlflow/IOTensor.cc new file mode 100644 index 0000000..47405ac --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/IOTensor.cc @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "IOTensor.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +IOTensor::IOTensor(const ir::OperandInfo &info, ir::Layout layout) + : IPortableTensor{info}, _orig_info{info}, _orig_layout{layout} +{ + setUserTensor(nullptr, 0); +} + +void IOTensor::setTensor(IPortableTensor *tensor) +{ + assert(tensor); + assert(tensor != this); + // TODO Handle when layout was changed + assert(tensor->layout() == _orig_layout); // Changing layout is not considered yet + _user_tensor.reset(); + _tensor = tensor; +} + +void IOTensor::setUserTensor(uint8_t *buffer, size_t size) +{ + _user_tensor = std::make_unique(_orig_info, _orig_layout, buffer, size); + _tensor = _user_tensor.get(); +} + +} // namespace controlflow +} // namespace backend +} // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/IOTensor.h b/runtime/onert/core/src/backend/controlflow/IOTensor.h new file mode 100644 index 0000000..a7ed84b --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/IOTensor.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__ +#define __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__ + +#include "backend/IPortableTensor.h" +#include "UserTensor.h" + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +/** + * @brief Tensor object that indirects to the tensor it is pointing to. + * + * A model I/O tensor could be two types. + * + * 1. @c UserTensor, if it is the primary graph + * 2. Any other derivative of @c IPortableTensor from another backend, otherwise + * + * To support these, this object indirects everything to the actual tensor pointer. 
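IOTensor, declared below, forwards every accessor to whichever tensor it currently points at: either a UserTensor it owns (primary-graph I/O backed by a user buffer) or a tensor owned by another backend (nested-subgraph I/O). The forwarding idea in miniature, with invented BufferTensor/IndirectTensor types that are not onert classes:

// Hedged sketch of the indirection pattern behind controlflow::IOTensor.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

// A plain tensor with storage of its own.
struct BufferTensor
{
  explicit BufferTensor(size_t size) : data(size) {}
  uint8_t *buffer() { return data.data(); }
  size_t total_size() const { return data.size(); }
  std::vector<uint8_t> data;
};

class IndirectTensor
{
public:
  // Point at a tensor owned elsewhere (the "another backend" case).
  void setTensor(BufferTensor *t)
  {
    assert(t != nullptr);
    _owned.reset();
    _target = t;
  }

  // Or own a freshly wrapped buffer, the UserTensor-like case.
  void setUserBuffer(size_t size)
  {
    _owned = std::make_unique<BufferTensor>(size);
    _target = _owned.get();
  }

  // Every accessor just forwards to whatever we currently point at.
  uint8_t *buffer() { return _target->buffer(); }
  size_t total_size() const { return _target->total_size(); }

private:
  BufferTensor *_target = nullptr;
  std::unique_ptr<BufferTensor> _owned;
};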
+ * Exceptionally if it is UserTensor, this class creates and manages it. + */ +class IOTensor : public IPortableTensor +{ +public: + IOTensor(const ir::OperandInfo &info, ir::Layout layout); + +public: + void setTensor(IPortableTensor *tensor); + void setUserTensor(uint8_t *buffer, size_t size); + ir::OperandInfo orig_info() const { return _orig_info; } + ir::Layout orig_layout() const { return _orig_layout; } + +public: + uint8_t *buffer() const override { return _tensor->buffer(); } + size_t total_size() const override { return _tensor->total_size(); } + size_t dimension(size_t index) const override { return _tensor->dimension(index); } + size_t num_dimensions() const override { return _tensor->num_dimensions(); } + size_t calcOffset(const ir::Coordinates &coords) const override + { + return _tensor->calcOffset(coords); + } + ir::Layout layout() const override { return _tensor->layout(); } + ir::DataType data_type() const override { return _tensor->data_type(); } + float data_scale() const override { return _tensor->data_scale(); } + int32_t data_offset() const override { return _tensor->data_offset(); } + bool is_dynamic() const override { return _is_dynamic || (_tensor && _tensor->is_dynamic()); } + void set_dynamic() override { _is_dynamic = true; } + ir::Shape getShape() const override { return _tensor->getShape(); } + void setShape(const ir::Shape &shape) override + { + // Workaround for IPortableTensor holds _info as its member + _info.shape(shape); + _tensor->setShape(shape); + } + bool is_constant() const override { return _tensor->is_constant(); } + bool applyShape(const ir::Shape &shape) override + { + // Workaround for IPortableTensor holds _info as its member + _info.shape(shape); + return _tensor->applyShape(shape); + } + +private: + const ir::OperandInfo _orig_info; + const ir::Layout _orig_layout; + bool _is_dynamic{false}; + IPortableTensor *_tensor{nullptr}; //< The actual tensor that is indirected + std::unique_ptr _user_tensor; //< If it is a user tensor, it is managed by this object +}; + +} // namespace controlflow +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__ diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc index 8e39ee5..2606f04 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc @@ -31,7 +31,7 @@ namespace backend namespace controlflow { -KernelGenerator::KernelGenerator(const ir::Graph &graph, IDynamicTensorManager *dyn_tensor_manager, +KernelGenerator::KernelGenerator(const ir::Graph &graph, DynamicTensorManager *dyn_tensor_manager, const std::shared_ptr &tensor_reg, const std::shared_ptr &external_context) : _graph{graph}, _dyn_tensor_manager{dyn_tensor_manager}, _tensor_reg{tensor_reg}, @@ -77,18 +77,17 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto then_subg_index = node.param().then_subg_index; const auto else_subg_index = node.param().else_subg_index; - std::vector input_tensors; + std::vector input_tensors; for (const auto input_index : node.getInputs()) { - auto input_tensor = getTensor(input_index); - + auto input_tensor = getPortableTensor(input_index); input_tensors.emplace_back(input_tensor); } - std::vector output_tensors; + std::vector output_tensors; for (const auto output_index : node.getOutputs()) { - auto output_tensor = getTensor(output_index); + auto output_tensor = 
getPortableTensor(output_index); output_tensors.emplace_back(output_tensor); } @@ -97,8 +96,8 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto cond_tensor = input_tensors.front(); input_tensors.erase(input_tensors.begin()); auto fn = std::make_unique<::onert::backend::controlflow::kernel::IfLayer>( - cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, then_subg_index, - else_subg_index, _executor_map, _external_context); + cond_tensor, input_tensors, output_tensors, then_subg_index, else_subg_index, _executor_map, + _external_context); _return_fn = std::move(fn); } @@ -124,33 +123,40 @@ void KernelGenerator::visit(const ir::operation::While &node) // This op does not support input as a constant, because controlflow backend does not have // TensorBuilder - std::vector input_tensors; + std::vector input_tensors; for (const auto input_index : node.getInputs()) { - auto input_tensor = getTensor(input_index); - + auto input_tensor = getPortableTensor(input_index); input_tensors.emplace_back(input_tensor); } - std::vector output_tensors; + std::vector output_tensors; for (const auto output_index : node.getOutputs()) { - auto output_tensor = getTensor(output_index); + auto output_tensor = getPortableTensor(output_index); output_tensors.emplace_back(output_tensor); } // WhileLayer just set ExecutorMap instead of cond and body executor to avoid complexity of // creating executor recusively auto fn = std::make_unique<::onert::backend::controlflow::kernel::WhileLayer>( - input_tensors, output_tensors, node.getOutputs(), _graph, cond_subg_index, body_subg_index, - _executor_map, _external_context); + input_tensors, output_tensors, cond_subg_index, body_subg_index, _executor_map, + _dyn_tensor_manager->dynamic_mem_mgr().get(), _external_context); _return_fn = std::move(fn); } backend::ITensor *KernelGenerator::getTensor(const ir::OperandIndex &index) { - backend::ITensor *ret = _tensor_registries.getITensor(index); + // get Tensor from all tensor registries (for Permute op) + auto ret = _tensor_registries.getITensor(index); + assert(ret != nullptr); + return ret; +} + +backend::IPortableTensor *KernelGenerator::getPortableTensor(const ir::OperandIndex &index) +{ + auto ret = _tensor_reg->getPortableTensor(index); assert(ret != nullptr); return ret; } diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h index c2c1243..7b395d1 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h @@ -17,13 +17,12 @@ #ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_GENERATOR_H__ #define __ONERT_BACKEND_CONTROLFLOW_KERNEL_GENERATOR_H__ -#include -#include #include #include "ExternalContext.h" #include #include "TensorBuilder.h" #include "compiler/TensorRegistries.h" +#include "backend/cpu_common/KernelGeneratorBase.h" #include "TensorRegistry.h" namespace onert @@ -33,10 +32,10 @@ namespace backend namespace controlflow { -class KernelGenerator : public IKernelGenerator +class KernelGenerator : public cpu_common::KernelGeneratorBase { public: - KernelGenerator(const ir::Graph &graph, IDynamicTensorManager *dyn_tensor_manager, + KernelGenerator(const ir::Graph &graph, DynamicTensorManager *dyn_tensor_manager, const std::shared_ptr &tensor_reg, const std::shared_ptr &external_context); @@ -50,8 +49,6 @@ public: _executor_map = executor_map.get(); } - using IKernelGenerator::visit; - void visit(const ir::OpSequence &) 
override; void visit(const ir::operation::If &) override; void visit(const ir::operation::Permute &) override; @@ -59,10 +56,11 @@ public: private: backend::ITensor *getTensor(const ir::OperandIndex &index); + backend::IPortableTensor *getPortableTensor(const ir::OperandIndex &index); private: const ir::Graph &_graph; - IDynamicTensorManager *_dyn_tensor_manager; + DynamicTensorManager *_dyn_tensor_manager; std::shared_ptr _tensor_reg; compiler::TensorRegistries _tensor_registries; exec::ExecutorMap *_executor_map; diff --git a/runtime/onert/core/src/backend/controlflow/Tensor.h b/runtime/onert/core/src/backend/controlflow/Tensor.h index ba5bafd..87951a9 100644 --- a/runtime/onert/core/src/backend/controlflow/Tensor.h +++ b/runtime/onert/core/src/backend/controlflow/Tensor.h @@ -27,6 +27,7 @@ namespace controlflow { using Tensor = cpu_common::Tensor; +using ExternalTensor = cpu_common::ExternalTensor; } // namespace controlflow } // namespace backend diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc index e4b0388..a767f0e 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc @@ -30,8 +30,8 @@ namespace controlflow TensorBuilder::TensorBuilder(const std::shared_ptr &tensor_reg) : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new DynamicTensorManager(_tensor_reg->base_reg())}, - _static_tensor_mgr{new cpu_common::StaticTensorManager( - _tensor_reg->base_reg(), _dynamic_tensor_mgr->dynamic_mem_mgr().get())} + _static_tensor_mgr{ + new cpu_common::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())} { /* empty */ } @@ -90,11 +90,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const return _tensor_info_map.find(ind) != _tensor_info_map.end(); } -void TensorBuilder::prepare(void) -{ - _static_tensor_mgr->allocateConsts(); - _static_tensor_mgr->allocateNonconsts(); -} +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } void TensorBuilder::allocate() { @@ -102,7 +98,7 @@ void TensorBuilder::allocate() // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. 
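The TensorBuilder change in this hunk drops the separate constant allocation step: with constants served straight from externally owned model data (the ExternalTensor path earlier in this patch), prepare() only needs allocateNonconsts(). A rough sketch of that split, with invented names and no onert types:

// Hedged sketch of why only non-constant tensors need backing storage here.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct SketchTensor
{
  const uint8_t *buffer = nullptr; // constants: alias externally owned bytes
  std::vector<uint8_t> storage;    // non-constants: backing storage, filled at prepare()
  bool constant = false;
  size_t size = 0;
};

// Only non-constant tensors get memory of their own.
void prepareNonconsts(std::vector<SketchTensor> &tensors)
{
  for (auto &t : tensors)
  {
    if (t.constant)
      continue; // already points into the model data, nothing to allocate
    t.storage.resize(t.size);
    t.buffer = t.storage.data();
  }
}

int main()
{
  static const uint8_t model_blob[16] = {};
  std::vector<SketchTensor> tensors(2);
  tensors[0].buffer = model_blob; // "ExternalTensor"-style constant
  tensors[0].constant = true;
  tensors[0].size = sizeof(model_blob);
  tensors[1].size = 64;           // ordinary non-constant tensor
  prepareNonconsts(tensors);
  std::cout << (tensors[0].buffer == model_blob) << " " << (tensors[1].buffer != nullptr) << "\n";
}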
} -IDynamicTensorManager *TensorBuilder::dynamicTensorManager(void) +DynamicTensorManager *TensorBuilder::dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); } diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h index 6959947..d2e3076 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h @@ -21,7 +21,6 @@ #include #include -#include #include #include @@ -35,7 +34,7 @@ namespace backend namespace controlflow { -class TensorBuilder : public ITensorBuilder +class TensorBuilder { public: TensorBuilder(const std::shared_ptr &tensor_reg); @@ -47,18 +46,18 @@ public: * @param[in] layout Operand data layout */ void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, - ir::Layout backend_layout) override; + ir::Layout backend_layout); - void notifyFirstUse(const ir::OperandIndex &) override; - void notifyLastUse(const ir::OperandIndex &) override; + void notifyFirstUse(const ir::OperandIndex &); + void notifyLastUse(const ir::OperandIndex &); - bool isRegistered(const ir::OperandIndex &) const override; + bool isRegistered(const ir::OperandIndex &) const; - void prepare(void) override; - void allocate() override; - void postFunctionPrepare() override { /* DO NOTHING */} + void prepare(void); + void allocate(); + void postFunctionPrepare() { /* DO NOTHING */} - IDynamicTensorManager *dynamicTensorManager(void) override; + DynamicTensorManager *dynamicTensorManager(void); /** * @brief Get tensor with a specific OperandIndex. diff --git a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h index 94f71bb..901f0ae 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h +++ b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h @@ -20,7 +20,7 @@ #include "backend/cpu_common/TensorRegistry.h" #include "backend/ITensorRegistry.h" #include "Tensor.h" -#include "UserTensor.h" +#include "IOTensor.h" #include namespace onert @@ -36,9 +36,10 @@ namespace controlflow * This class contains three types of tensors. Two native tensors(tensors that are managed by this * backend) and the other is migrant tensor. 
* - * - NativeUserTensor - @c UserTensor managed by this backend, buffer is user-given - * - NativeOwnTensor - @c cpu_common::Tensor managed by this backend ( in @c _base_reg ) - * - MigrantTensor - @c IPortableTensor managed by other backends ( in @c _base_reg ) + * - NativeIOTensor - @c IOTensor managed by this backend ( in @c _base_reg ) + * - NOTE The tensor it actually points to can be from another backend + * - NativeOwnTensor - @c cpu_common::Tensor managed by this backend ( in @c _base_reg ) + * - MigrantTensor - @c IPortableTensor managed by other backends * * @note @c _base_reg is used in implementation to reuse @c cpu_common::StaticTensorManager * @@ -53,7 +54,7 @@ public: auto base_tensor = _base_reg->getITensor(ind); if (base_tensor) return base_tensor; - return getNativeUserTensor(ind); + return getNativeIOTensor(ind); } ITensor *getNativeITensor(const ir::OperandIndex &ind) override @@ -61,7 +62,7 @@ public: auto base_tensor = _base_reg->getNativeITensor(ind); if (base_tensor) return base_tensor; - return getNativeUserTensor(ind); + return getNativeIOTensor(ind); } IPortableTensor *getPortableTensor(const ir::OperandIndex &ind) @@ -69,7 +70,7 @@ public: auto base_tensor = _base_reg->getPortableTensor(ind); if (base_tensor) return base_tensor; - return getNativeUserTensor(ind); + return getNativeIOTensor(ind); } IPortableTensor *getNativeTensor(const ir::OperandIndex &ind) @@ -77,7 +78,7 @@ public: auto base_tensor = _base_reg->getNativeTensor(ind); if (base_tensor) return base_tensor; - return getNativeUserTensor(ind); + return getNativeIOTensor(ind); } Tensor *getNativeOwnTensor(const ir::OperandIndex &ind) @@ -85,10 +86,10 @@ public: return _base_reg->getNativeTensor(ind); } - UserTensor *getNativeUserTensor(const ir::OperandIndex &ind) + IOTensor *getNativeIOTensor(const ir::OperandIndex &ind) { - auto tensor = _native_user_tensors.find(ind); - if (tensor != _native_user_tensors.end()) + auto tensor = _native_io_tensors.find(ind); + if (tensor != _native_io_tensors.end()) return tensor->second.get(); return nullptr; } @@ -108,22 +109,22 @@ public: _base_reg->setNativeTensor(ind, std::move(tensor)); } - void setNativeUserTensor(ir::OperandIndex ind, std::unique_ptr &&tensor) + void setNativeIOTensor(ir::OperandIndex ind, std::unique_ptr &&tensor) { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet - _native_user_tensors[ind] = std::move(tensor); + _native_io_tensors[ind] = std::move(tensor); } - const ir::OperandIndexMap> &native_user_tensors() + const ir::OperandIndexMap> &native_io_tensors() { - return _native_user_tensors; + return _native_io_tensors; } std::shared_ptr base_reg() { return _base_reg; } private: std::shared_ptr _base_reg; - ir::OperandIndexMap> _native_user_tensors; + ir::OperandIndexMap> _native_io_tensors; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc index de91b85..1d786c4 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc @@ -18,7 +18,6 @@ #include #include "exec/ExecutorBase.h" -#include #include "PermuteLayer.h" namespace onert @@ -30,16 +29,15 @@ namespace controlflow namespace kernel { -IfLayer::IfLayer(backend::ITensor *cond_tensor, const std::vector input_tensors, - const std::vector output_tensors, - const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, 
+IfLayer::IfLayer(backend::IPortableTensor *cond_tensor, + const std::vector input_tensors, + const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map, const std::shared_ptr &external_context) : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, - _output_indices{output_indices}, _graph{graph}, _then_subg_index{then_subg_index}, - _else_subg_index{else_subg_index}, _executor_map{executor_map}, - _external_context{external_context} + _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index}, + _executor_map{executor_map}, _external_context{external_context} { // At this point, executor_map may not have executors of then subg and else subg } @@ -48,79 +46,34 @@ void IfLayer::run() { // Check condition // // If true - // // // Copy _input_tensors -> then subg's inputs - // // // Run then subg - // // // Copy outputs of then subg -> _output_tensors + // // // Set _input_tensors -> then-subg's inputs + // // // Set outputs of then-subg -> _output_tensors + // // // Run then-subg // // Else - // // // Copy _input_tensors -> else subg's inputs if false - // // // Run else subg - // // // Copy outputs of else subg -> _output_tensors - auto getResultCond = [](backend::ITensor *tensor) -> bool { + // // // Set _input_tensors -> else-subg's inputs + // // // Set outputs of else-subg -> _output_tensors + // // // Run else-subg + + auto getResultCond = [](backend::IPortableTensor *tensor) -> bool { bool ret = false; tensor->access([&](ITensor &tensor) { ret = *reinterpret_cast(tensor.buffer()); }); return ret; }; - exec::ExecutorBase *subg_exec = nullptr; + exec::IExecutor *subg_exec = nullptr; bool cond_result = getResultCond(_cond_tensor); if (cond_result) { VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl; - subg_exec = nnfw::misc::polymorphic_downcast( - _executor_map->at(_then_subg_index).get()); + subg_exec = _executor_map->at(_then_subg_index).get(); } else { VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl; - subg_exec = nnfw::misc::polymorphic_downcast( - _executor_map->at(_else_subg_index).get()); - } - - const auto &subg_graph = subg_exec->graph(); - - std::vector src_tensors; - std::vector dst_tensors; - // Add tensors used in subgraph or contained in outputs of subgraph - assert(subg_graph.getInputs().size() == _input_tensors.size()); - assert(subg_graph.getInputs().size() == subg_exec->getInputTensors().size()); - for (uint32_t i = 0; i < subg_graph.getInputs().size(); ++i) - { - const auto &subg_input_index = subg_graph.getInputs().at(i); - const auto &subg_input = subg_graph.operands().at(subg_input_index); - if (subg_input.getUses().size() > 0 || subg_graph.getOutputs().contains(subg_input_index)) - { - src_tensors.emplace_back(_input_tensors.at(i)); - dst_tensors.emplace_back(subg_exec->getInputTensors().at(i)); - } + subg_exec = _executor_map->at(_else_subg_index).get(); } - const auto permute_op_input_to_subg_input = - std::make_shared(src_tensors, dst_tensors, _external_context); - - // Add tensors used as output of operation or contained in outputs of operation - src_tensors.clear(); - dst_tensors.clear(); - assert(_output_indices.size() == subg_exec->getOutputTensors().size()); - assert(_output_indices.size() == _output_tensors.size()); - for (uint32_t i = 0; i < _output_indices.size(); ++i) - { - const auto &output_index = _output_indices.at(i); - const auto &output = 
_graph.operands().at(output_index); - if (output.getUses().size() > 0 || _graph.getOutputs().contains(output_index)) - { - src_tensors.emplace_back(subg_exec->getOutputTensors().at(i)); - dst_tensors.emplace_back(_output_tensors.at(i)); - } - } - const auto permute_subg_output_to_op_output = - std::make_shared(src_tensors, dst_tensors, _external_context); - - // Remove copying of unused tensor - permute_op_input_to_subg_input->prepare(); - permute_subg_output_to_op_output->prepare(); - // Copy & run - subg_exec->execute(_input_tensors, permute_op_input_to_subg_input); - permute_subg_output_to_op_output->run(); + subg_exec->execute(_input_tensors, _output_tensors); VERBOSE(If) << "Return from $" << (cond_result ? _then_subg_index : _else_subg_index) << std::endl; } diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h index 9e944bc..967552f 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h @@ -17,7 +17,7 @@ #ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_IF_LAYER_H__ #define __ONERT_BACKEND_CONTROLFLOW_KERNEL_IF_LAYER_H__ -#include +#include #include #include "../ExternalContext.h" @@ -33,9 +33,9 @@ namespace kernel class IfLayer : public ::onert::exec::IFunction { public: - IfLayer(backend::ITensor *cond_tensor, const std::vector input_tensors, - const std::vector output_tensors, - const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, + IfLayer(backend::IPortableTensor *cond_tensor, + const std::vector input_tensors, + const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map, const std::shared_ptr &external_context); @@ -44,11 +44,9 @@ public: void run() override; private: - backend::ITensor *_cond_tensor; - const std::vector _input_tensors; - const std::vector _output_tensors; - const ir::OperandIndexSequence &_output_indices; - const ir::Graph &_graph; + backend::IPortableTensor *_cond_tensor; + const std::vector _input_tensors; + const std::vector _output_tensors; const ir::SubgraphIndex _then_subg_index; const ir::SubgraphIndex _else_subg_index; exec::ExecutorMap *_executor_map; diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h index 5d0f191..6fb69b6 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h @@ -17,7 +17,6 @@ #ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_PERMUTELAYER_H__ #define __ONERT_BACKEND_CONTROLFLOW_KERNEL_PERMUTELAYER_H__ -#include "backend/ITensorBuilder.h" #include "exec/IPermuteFunction.h" #include "exec/IExecutor.h" #include "../ExternalContext.h" diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc index a0d4786..a4b5aa5 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc @@ -16,6 +16,7 @@ #include "WhileLayer.h" +#include #include #include "exec/ExecutorBase.h" #include @@ -30,16 +31,15 @@ namespace controlflow namespace kernel { -WhileLayer::WhileLayer(const std::vector input_tensors, - const std::vector output_tensors, - const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, +WhileLayer::WhileLayer(const 
std::vector input_tensors, + const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map, + cpu_common::DynamicMemoryManager *dyn_memory_manager, const std::shared_ptr &external_context) : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index}, - _output_indices{output_indices}, _graph{graph}, _input_tensors{input_tensors}, - _output_tensors{output_tensors}, _executor_map{executor_map}, - _external_context{external_context} + _input_tensors{input_tensors}, _output_tensors{output_tensors}, _executor_map{executor_map}, + _dyn_memory_manager{dyn_memory_manager}, _external_context{external_context} { // At this point, executor_map may not have executors of cond subg and body subg } @@ -56,164 +56,90 @@ void WhileLayer::run() // // Run cond subg // If there is no loop copy "_input_tensors" -> "_dst_tensors", else copy "cond subg inputs" -> // "_dst_tensors" - auto cond_exec = nnfw::misc::polymorphic_downcast( - _executor_map->at(_cond_subg_index).get()); - auto body_exec = nnfw::misc::polymorphic_downcast( - _executor_map->at(_body_subg_index).get()); - - const auto &cond_graph = cond_exec->graph(); - const auto &body_graph = body_exec->graph(); - - std::vector input_tensors; - std::vector cond_input_tensors; - std::vector body_input_tensors; - std::vector body_output_tensors; - std::vector output_tensors; - - // Add only used tensors in cond subgraph - assert(cond_graph.getInputs().size() == _input_tensors.size()); - assert(cond_graph.getInputs().size() == cond_exec->getInputTensors().size()); - for (uint32_t i = 0; i < cond_graph.getInputs().size(); ++i) - { - const auto &cond_input = cond_graph.operands().at(cond_graph.getInputs().at(i)); - if (cond_input.getUses().size() > 0) - { - input_tensors.emplace_back(_input_tensors.at(i)); - cond_input_tensors.emplace_back(cond_exec->getInputTensors().at(i)); - } - } - const auto permute_op_input_to_cond_input = - std::make_shared(input_tensors, cond_input_tensors, _external_context); - - // Add only used tensors among outputs of while operation - assert(_output_indices.size() == _input_tensors.size()); - assert(_output_indices.size() == _output_tensors.size()); - input_tensors.clear(); - output_tensors.clear(); - for (size_t i = 0; i < _output_indices.size(); ++i) - { - const auto &output_index = _output_indices.at(i); - const auto &output = _graph.operands().at(output_index); - if (output.getUses().size() > 0 || _graph.getOutputs().contains(output_index)) - { - input_tensors.emplace_back(_input_tensors.at(i)); - output_tensors.emplace_back(_output_tensors.at(i)); - } - } - const auto permute_op_input_to_op_output = - std::make_shared(input_tensors, output_tensors, _external_context); - - // Add all tensors with unused tensors in body subgraph because unused input tensors will be - // copied output tensors in body subgraph - assert(_input_tensors.size() == body_exec->getInputTensors().size()); - input_tensors = _input_tensors; - body_input_tensors = body_exec->getInputTensors(); - const auto permute_op_input_to_body_input = - std::make_shared(input_tensors, body_input_tensors, _external_context); - - // Add only used tensors in cond subgraph - assert(cond_graph.getInputs().size() == body_exec->getOutputTensors().size()); - assert(cond_graph.getInputs().size() == cond_exec->getInputTensors().size()); - body_output_tensors.clear(); - cond_input_tensors.clear(); - for (uint32_t i = 0; i < cond_graph.getInputs().size(); ++i) - { - const auto 
&cond_input = cond_graph.operands().at(cond_graph.getInputs().at(i)); - if (cond_input.getUses().size() > 0) - { - body_output_tensors.emplace_back(body_exec->getOutputTensors().at(i)); - cond_input_tensors.emplace_back(cond_exec->getInputTensors().at(i)); - } - } - const auto permute_body_output_to_cond_input = - std::make_shared(body_output_tensors, cond_input_tensors, _external_context); - - // Add only used tensors in body subgraph - assert(body_graph.getInputs().size() == body_exec->getOutputTensors().size()); - assert(body_graph.getInputs().size() == body_exec->getInputTensors().size()); - body_output_tensors.clear(); - body_input_tensors.clear(); - for (uint32_t i = 0; i < body_graph.getInputs().size(); ++i) - { - const auto &body_input_index = body_graph.getInputs().at(i); - const auto &body_input = body_graph.operands().at(body_input_index); - if (body_input.getUses().size() > 0 && - !body_exec->graph().getOutputs().contains(body_input_index)) - { - body_output_tensors.emplace_back(body_exec->getOutputTensors().at(i)); - body_input_tensors.emplace_back(body_exec->getInputTensors().at(i)); - } - } - const auto permute_body_output_to_body_input = - std::make_shared(body_output_tensors, body_input_tensors, _external_context); - - // Add only used tensors among outputs of while operation - assert(_output_indices.size() == body_exec->getOutputTensors().size()); - assert(_output_indices.size() == _output_tensors.size()); - body_output_tensors.clear(); - output_tensors.clear(); - for (size_t i = 0; i < _output_indices.size(); ++i) - { - const auto &output_index = _output_indices.at(i); - const auto &output = _graph.operands().at(output_index); - if (output.getUses().size() > 0 || _graph.getOutputs().contains(output_index)) - { - body_output_tensors.emplace_back(body_exec->getOutputTensors().at(i)); - output_tensors.emplace_back(_output_tensors.at(i)); - } - } - const auto permute_body_output_to_op_output = - std::make_shared(body_output_tensors, output_tensors, _external_context); + auto cond_exec = _executor_map->at(_cond_subg_index).get(); + auto body_exec = _executor_map->at(_body_subg_index).get(); - // Remove copying of unused tensor - permute_op_input_to_cond_input->prepare(); - permute_op_input_to_op_output->prepare(); - permute_op_input_to_body_input->prepare(); - permute_body_output_to_cond_input->prepare(); - permute_body_output_to_body_input->prepare(); - permute_body_output_to_op_output->prepare(); + // Need a temp tensor to hold the cond subgraph output + assert(cond_exec->getOutputTensors().size() == 1); + auto cond_output_tensor = [&]() { + auto cond_output = cond_exec->getOutputTensors().at(0); + auto tensor = std::make_unique(cond_output->orig_info(), cond_output->orig_layout(), + _dyn_memory_manager); + tensor->set_dynamic(); + tensor->setBuffer(_dyn_memory_manager->allocate(tensor.get(), tensor->total_size())); + return tensor; + }(); VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; - cond_exec->execute(_input_tensors, permute_op_input_to_cond_input); + cond_exec->execute(_input_tensors, {cond_output_tensor.get()}); VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; - assert(cond_exec->getOutputTensors().size() == 1); - auto &cond_output_tensor = cond_exec->getOutputTensors().at(0); auto getResultCond = [](backend::ITensor *tensor) -> bool { bool ret = false; tensor->access([&](ITensor &tensor) { ret = *reinterpret_cast(tensor.buffer()); }); return ret; }; + std::vector op_inputs(_input_tensors.begin(), 
_input_tensors.end()); + std::vector op_outputs(_output_tensors.begin(), _output_tensors.end()); + // Copying body inputs to outputs when the loop body is never executed + if (!getResultCond(cond_output_tensor.get())) + { + PermuteLayer copy_body_inputs_to_op_outputs{op_inputs, op_outputs, _external_context}; + copy_body_inputs_to_op_outputs.run(); + return; + } + + // Need some temp tensors to hold the body subgraph output + std::vector> temp_outputs_o; + std::vector temp_outputs; + for (auto io_tensor : body_exec->getOutputTensors()) + { + auto tensor = std::make_unique(io_tensor->orig_info(), io_tensor->orig_layout(), + _dyn_memory_manager); + tensor->set_dynamic(); + tensor->setBuffer(_dyn_memory_manager->allocate(tensor.get(), tensor->total_size())); + temp_outputs.push_back(tensor.get()); + temp_outputs_o.push_back(std::move(tensor)); + } + + std::vector body_outputs(temp_outputs.begin(), temp_outputs.end()); + PermuteLayer copy_body_outputs_to_op_outputs{body_outputs, op_outputs, _external_context}; + const auto body_execute_with_op_inputs = [&]() { VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; - body_exec->execute(_input_tensors, permute_op_input_to_body_input); + body_exec->execute(_input_tensors, temp_outputs); VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; const auto body_execute_with_body_outputs = [&]() { VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; - body_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_body_input); + body_exec->execute(_output_tensors, temp_outputs); VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; std::function body_execute = body_execute_with_op_inputs; const auto cond_execute = [&]() { VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; - cond_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_cond_input); + cond_exec->execute(_output_tensors, {cond_output_tensor.get()}); VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; }; - auto permute_to_outputs_fn = permute_op_input_to_op_output; // Loop while Cond subgraph's output is true - while (getResultCond(cond_output_tensor)) + while (getResultCond(cond_output_tensor.get())) { body_execute(); + copy_body_outputs_to_op_outputs.run(); cond_execute(); body_execute = body_execute_with_body_outputs; - permute_to_outputs_fn = permute_body_output_to_op_output; } - permute_to_outputs_fn->run(); + + // Clean-up the temp tensors + _dyn_memory_manager->deallocate(cond_output_tensor.get()); + for (auto tensor : temp_outputs) + { + _dyn_memory_manager->deallocate(tensor); + } } } // namespace kernel diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h index 8f82bd9..d3924c8 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h @@ -17,13 +17,15 @@ #ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_WHILE_LAYER_H__ #define __ONERT_BACKEND_CONTROLFLOW_KERNEL_WHILE_LAYER_H__ -#include +#include #include #include #include #include #include "../ExternalContext.h" +#include "backend/cpu_common/MemoryManager.h" + namespace onert { namespace backend @@ -36,11 +38,10 @@ namespace kernel class WhileLayer : public ::onert::exec::IFunction { public: - WhileLayer(const std::vector input_tensors, - const std::vector output_tensors, - const ir::OperandIndexSequence &output_indices, 
const ir::Graph &graph, + WhileLayer(const std::vector input_tensors, + const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, - exec::ExecutorMap *executor_map, + exec::ExecutorMap *executor_map, cpu_common::DynamicMemoryManager *dyn_memory_manager, const std::shared_ptr &external_context); public: @@ -49,11 +50,10 @@ public: private: const ir::SubgraphIndex _cond_subg_index; const ir::SubgraphIndex _body_subg_index; - const ir::OperandIndexSequence &_output_indices; - const ir::Graph &_graph; - const std::vector _input_tensors; - const std::vector _output_tensors; + const std::vector _input_tensors; + const std::vector _output_tensors; exec::ExecutorMap *_executor_map; + cpu_common::DynamicMemoryManager *_dyn_memory_manager; // For generating temp tensors const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc b/runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc new file mode 100644 index 0000000..732b03c --- /dev/null +++ b/runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/cpu_common/BackendContextHelpers.h" diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc similarity index 51% rename from runtime/onert/backend/cpu/ConstantInitializer.cc rename to runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc index 6f6eb77..610ba5f 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.cc +++ b/runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc @@ -14,19 +14,19 @@ * limitations under the License. 
*/ -#include "ConstantInitializer.h" -#include "Tensor.h" +#include "backend/cpu_common/ConstantInitializer.h" +#include "backend/cpu_common/Tensor.h" namespace onert { namespace backend { -namespace cpu +namespace cpu_common { ConstantInitializer::ConstantInitializer(const ir::Operands &operands, const std::shared_ptr &tensor_reg) - : IConstantInitializer{operands}, _tensor_reg{tensor_reg} + : ConstantInitializerBase{operands}, _tensor_reg{tensor_reg} { // DO NOTHING } @@ -53,42 +53,6 @@ void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &in }; } -void ConstantInitializer::visit(const ir::operation::Conv2D &node) -{ - const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); - const auto &kernel_obj = _operands.at(kernel_index); - registerExternalInitializer(kernel_index, kernel_obj); - - const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); - const auto &bias_obj = _operands.at(bias_index); - registerExternalInitializer(bias_index, bias_obj); -} - -void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) -{ - const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); - const auto &kernel_obj = _operands.at(kernel_index); - registerExternalInitializer(kernel_index, kernel_obj); - - const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); - const auto &bias_obj = _operands.at(bias_index); - registerExternalInitializer(bias_index, bias_obj); -} - -void ConstantInitializer::visit(const ir::operation::FullyConnected &node) -{ - const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); - const auto &weight_obj = _operands.at(weight_index); - registerExternalInitializer(weight_index, weight_obj); - - const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); - if (!bias_index.undefined()) - { - const auto &bias_obj = _operands.at(bias_index); - registerExternalInitializer(bias_index, bias_obj); - } -} - -} // namespace cpu +} // namespace cpu_common } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/backend/IConstantInitializer.cc b/runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc similarity index 86% rename from runtime/onert/core/src/backend/IConstantInitializer.cc rename to runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc index 6fb9757..15c2dfe 100644 --- a/runtime/onert/core/src/backend/IConstantInitializer.cc +++ b/runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "backend/IConstantInitializer.h" +#include "backend/cpu_common/ConstantInitializerBase.h" #include @@ -24,9 +24,11 @@ namespace onert { namespace backend { +namespace cpu_common +{ -void IConstantInitializer::registerCopyInitializer(const ir::OperandIndex &index, - const ir::Operand &obj) +void ConstantInitializerBase::registerCopyInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) { // For only CONSTANTS // TODO Add to check if tensor has been allocated @@ -67,8 +69,8 @@ void IConstantInitializer::registerCopyInitializer(const ir::OperandIndex &index } } -void IConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &index, - const ir::Operand &obj) +void ConstantInitializerBase::registerPermuteInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) { // For only CONSTANTS // TODO Add to check if tensor has been allocated @@ -82,27 +84,27 @@ void IConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &in switch (type) { case DataType::FLOAT32: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::INT32: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::UINT32: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::BOOL8: case DataType::QUANT_UINT8_ASYMM: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::QUANT_INT8_SYMM: case DataType::QUANT_INT8_ASYMM: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::FLOAT16: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; case DataType::INT64: - _init_map[index] = std::bind(permuteInit, _1, _2, _current_op_seq_layout); + _init_map[index] = std::bind(permuteInit, _1, _2, _current_layout); break; default: throw std::runtime_error("Not supported, yet"); @@ -110,5 +112,6 @@ void IConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &in } } +} // namespace cpu_common } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc index cac43ba..8c5c46a 100644 --- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc @@ -17,6 +17,7 @@ #include "backend/cpu_common/StaticTensorManager.h" #include "backend/cpu_common/DynamicTensorManager.h" +#include "backend/cpu_common/Tensor.h" #include namespace onert @@ -27,31 +28,13 @@ namespace cpu_common { StaticTensorManager::StaticTensorManager(const std::shared_ptr ®, - DynamicMemoryManager *dynamic_mem_mgr) - : _const_mgr{new DynamicMemoryManager()}, _nonconst_mgr{new MemoryManager()}, _tensors{reg}, - _dynamic_mem_mgr{dynamic_mem_mgr} + DynamicTensorManager *dynamic_tensor_manager) + : _nonconst_mgr{new MemoryManager()}, _tensors{reg}, + _dynamic_tensor_manager{dynamic_tensor_manager} { // DO NOTHING } -void 
StaticTensorManager::allocateConsts(void) -{ - for (auto &pair : _tensors->native_tensors()) - { - const auto &ind = pair.first; - auto tensor = pair.second.get(); - if (_as_constants[ind]) - { - auto mem_alloc = _const_mgr->allocate(_tensors->getITensor(ind), tensor->total_size()); - tensor->setBuffer(mem_alloc); - auto buffer = mem_alloc->base(); - VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() - << "): " << static_cast(buffer) - << "size : " << tensor->total_size() << std::endl; - } - } -} - void StaticTensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); @@ -65,14 +48,12 @@ void StaticTensorManager::allocateNonconsts(void) auto *buffer = _nonconst_mgr->getBuffer(ind); tensor->setBuffer(buffer); - VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value() - << "): " << static_cast(buffer) << std::endl; + VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast(buffer) << std::endl; } } } -void StaticTensorManager::deallocateConsts(void) { _const_mgr->deallocate(); } - void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, @@ -80,8 +61,17 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, bool as_const) { assert(!_tensors->getNativeTensor(ind)); - auto tensor = std::make_unique(tensor_info, backend_layout, _dynamic_mem_mgr); - _tensors->setNativeTensor(ind, std::move(tensor)); + if (as_const) + { + auto tensor = std::make_unique(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, std::move(tensor)); + } + else + { + auto tensor = std::make_unique(tensor_info, backend_layout, + _dynamic_tensor_manager->dynamic_mem_mgr().get()); + _tensors->setNativeTensor(ind, std::move(tensor)); + } _as_constants[ind] = as_const; } diff --git a/runtime/onert/core/src/backend/cpu_common/Tensor.cc b/runtime/onert/core/src/backend/cpu_common/Tensor.cc index d3dcf9a..e412cb7 100644 --- a/runtime/onert/core/src/backend/cpu_common/Tensor.cc +++ b/runtime/onert/core/src/backend/cpu_common/Tensor.cc @@ -95,3 +95,20 @@ bool Tensor::applyShape(const ir::Shape &new_shape) } // namespace cpu_common } // namespace backend } // namespace onert + +// ExternalTensor + +namespace onert +{ +namespace backend +{ +namespace cpu_common +{ + +// `dynamic_cast` not working across library boundaries on NDK +// With this as a key function, `dynamic_cast` works across dl +ExternalTensor::~ExternalTensor() {} + +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/core/src/compiler/BackendManager.cc b/runtime/onert/core/src/compiler/BackendManager.cc index 0093f50..ea45cbe 100644 --- a/runtime/onert/core/src/compiler/BackendManager.cc +++ b/runtime/onert/core/src/compiler/BackendManager.cc @@ -69,55 +69,73 @@ void BackendManager::loadBackend(const std::string &backend) return; } - // TODO Remove indentation + const std::string backend_so = "libbackend_" + backend + SHARED_LIB_EXT; + void *handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (handle == nullptr) { - const std::string backend_so = "libbackend_" + backend + SHARED_LIB_EXT; - void *handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); + VERBOSE(BackendManager) << "Failed to load backend '" << backend << "' - " << dlerror() << "\n"; + return; + } - if (handle == nullptr) + VERBOSE(BackendManager) << "Successfully loaded '" << backend << "'(" << backend_so << ")\n"; + + { + // load object creator 
function + auto backend_create = (backend_create_t)dlsym(handle, "onert_backend_create"); + if (backend_create == nullptr) { - VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl; + // TODO replace `fprintf` with `VERBOSE` + fprintf(stderr, "BackendManager: unable to find function `onert_backend_create` : %s\n", + dlerror()); + dlclose(handle); return; } - VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n"; - + // load object creator function + auto backend_destroy = (backend_destroy_t)dlsym(handle, "onert_backend_destroy"); + if (backend_destroy == nullptr) { - // load object creator function - auto backend_create = (backend_create_t)dlsym(handle, "onert_backend_create"); - if (backend_create == nullptr) - { - fprintf(stderr, "BackendManager: unable to open function onert_backend_create : %s\n", - dlerror()); - abort(); - } - - // load object creator function - auto backend_destroy = (backend_destroy_t)dlsym(handle, "onert_backend_destroy"); - if (backend_destroy == nullptr) - { - fprintf(stderr, "BackendManager: unable to open function onert_backend_destroy : %s\n", - dlerror()); - abort(); - } - - auto backend_object = - std::unique_ptr(backend_create(), backend_destroy); - bool initialized = backend_object->config()->initialize(); // Call initialize here? - if (!initialized) - { - VERBOSE_F() << backend.c_str() << " backend initialization failed. Don't use this backend" - << std::endl; - dlclose(handle); - return; - } - _gen_map.emplace(backend_object->config()->id(), std::move(backend_object)); + // TODO replace `fprintf` with `VERBOSE` + fprintf(stderr, "BackendManager: unable to find `function onert_backend_destroy` : %s\n", + dlerror()); + dlclose(handle); + return; } - // Save backend handle (avoid warning by handle lost without dlclose()) - auto u_handle = std::unique_ptr{handle, [](void *h) { dlclose(h); }}; - _handle_map.emplace(backend, std::move(u_handle)); + auto backend_object = + std::unique_ptr(backend_create(), backend_destroy); + bool initialized = backend_object->config()->initialize(); // Call initialize here? + if (!initialized) + { + VERBOSE(BackendManager) << backend.c_str() + << " backend initialization failed. 
Don't use this backend" + << std::endl; + dlclose(handle); + return; + } + _gen_map.emplace(backend_object->config()->id(), std::move(backend_object)); } + + // Save backend handle (avoid warning by handle lost without dlclose()) + + // NOTE This is a workaround for clang-format3.9 (seems like it does not understand + // "by-copy capture with an initializer" + // clang-format off + auto u_handle = std::unique_ptr{ + handle, [id = backend, filename = backend_so](void *h) { + if (dlclose(h) == 0) + { + VERBOSE(BackendManager) << "Successfully unloaded '" << id << "'(" << filename << ")\n"; + } + else + { + VERBOSE(BackendManager) + << "Failed to unload backend '" << id << "'- " << dlerror() << "\n"; + } + }}; +// clang-format on +_handle_map.emplace(backend, std::move(u_handle)); } backend::Backend *BackendManager::get(const std::string &key) diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc index c2844bd..7eeb14a 100644 --- a/runtime/onert/core/src/compiler/Compiler.cc +++ b/runtime/onert/core/src/compiler/Compiler.cc @@ -41,6 +41,30 @@ #include "ir/OperationDumper.h" #include "misc/string_helpers.h" +namespace +{ + +using namespace onert; + +std::string getOpBackends(std::unordered_map &opcode_to_backend) +{ + std::unordered_map::iterator it; + std::string opbackends; + + for (it = opcode_to_backend.begin(); it != opcode_to_backend.end(); ++it) + { + if (!opbackends.empty()) + opbackends = opbackends + ", "; + + auto opcode = it->first; + const std::string opname = ir::toString(opcode); + opbackends += opname + "=" + it->second; + } + return opbackends; +} + +} // namespace + namespace onert { @@ -51,7 +75,6 @@ CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs) { CompilerOptions options; options.backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';'); - options.is_primary_subgraph = false; options.trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH); options.graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP); options.op_seq_max_node = util::getConfigInt(util::config::OP_SEQ_MAX_NODE); @@ -108,13 +131,15 @@ CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs) return options; } -Compiler::Compiler(const std::shared_ptr &subgs) +Compiler::Compiler(const std::shared_ptr &subgs, util::TracingCtx *tracing_ctx) : _subgraphs{subgs}, _state{State::CREATED} { // Set default values for CompilerOptions // All these default values should not be fetched from Env, when we stop supporting Android NN // API. 
_options = fetchCompilerOptionsFromGlobalConfig(*subgs); + + _options.tracing_ctx = tracing_ctx; } void Compiler::enableToFp16() { _options.fp16_enable = true; } @@ -132,12 +157,10 @@ std::shared_ptr Compiler::compile(void) { // Set control flow backend for control flow operators { - _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = - backend::controlflow::Config::ID; - _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = - backend::controlflow::Config::ID; - _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = - backend::controlflow::Config::ID; + auto &cfid = backend::controlflow::Config::ID; + _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = cfid; + _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = cfid; + _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = cfid; } // FIXME This is a workaround for bcq operations, should remove it @@ -157,7 +180,11 @@ std::shared_ptr Compiler::compile(void) VERBOSE(Compiler) << "graph_dump_level : " << _options.graph_dump_level << std::endl; VERBOSE(Compiler) << "op_seq_max_node : " << _options.op_seq_max_node << std::endl; VERBOSE(Compiler) << "executor : " << _options.executor << std::endl; - VERBOSE(Compiler) << "manual_scheduler_options : (Too many things to print)" << std::endl; + VERBOSE(Compiler) << "manual backend_for_all : " + << _options.manual_scheduler_options.backend_for_all << std::endl; + VERBOSE(Compiler) << "manual_scheduler_options : " + << getOpBackends(_options.manual_scheduler_options.opcode_to_backend) + << std::endl; VERBOSE(Compiler) << "he_scheduler : " << _options.he_scheduler << std::endl; VERBOSE(Compiler) << "he_profiling_mode : " << _options.he_profiling_mode << std::endl; VERBOSE(Compiler) << "disable_compile : " << _options.disable_compile << std::endl; @@ -202,7 +229,6 @@ std::shared_ptr Compiler::compile(void) // Lower: Assign backend std::unordered_map> lowered_subgs; _subgraphs->iterate([&](const ir::SubgraphIndex &index, ir::Graph &subg) { - _options.is_primary_subgraph = (index == ir::SubgraphIndex{0}); onert::dumper::dot::DotDumper dot_dumper(subg, dump_level); dot_dumper.dump(nnfw::misc::str("before_lower_subg-", index.value())); @@ -230,6 +256,14 @@ std::shared_ptr Compiler::compile(void) _subgraphs.reset(); + for (auto &pair : lowered_subgs) + { + const auto &subg_index = pair.first; + auto &lowered_subg = pair.second; + onert::dumper::dot::DotDumper dot_dumper_lowered(lowered_subg.get(), dump_level); + dot_dumper_lowered.dump("after_lower_subg-" + std::to_string(subg_index.value())); + } + // Shape inference. 
{ const auto primary_subg_idx = ir::SubgraphIndex{0}; @@ -266,12 +300,8 @@ std::shared_ptr Compiler::compile(void) auto &lowered_subg = pair.second; auto indexed_ranks = lowered_subg->indexed_ranks(); - _options.is_primary_subgraph = (subg_index == ir::SubgraphIndex{0}); - - onert::dumper::dot::DotDumper dot_dumper_lowered(lowered_subg.get(), dump_level); - dot_dumper_lowered.dump("after_lower_subg-" + std::to_string(subg_index.value())); - - ir::OperationDumper dumper("START SUBGRAPH " + std::to_string(subg_index.value())); + ir::OperationDumper dumper("Executor generation of Subgraph " + + std::to_string(subg_index.value())); lowered_subg->graph().operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); auto executor = std::unique_ptr{ diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index bb325ff..356feed 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -16,6 +16,7 @@ #include "ExecutorFactory.h" +#include #include #include "exec/ExecutionObservers.h" #include "exec/LinearExecutor.h" @@ -25,16 +26,13 @@ #include "compiler/ExecutionBuilder.h" #include "exec/ExecTime.h" #include "compiler/Linear.h" -#include "compiler/TensorBuilders.h" -#include "backend/IConstantInitializer.h" -#include "backend/IKernelGenerator.h" -#include "backend/IOptimizer.h" #include "backend/IPortableTensor.h" -#include "backend/ITensorRegister.h" #include "backend/controlflow/Config.h" #include "backend/controlflow/KernelGenerator.h" #include "backend/controlflow/UserTensor.h" #include "backend/controlflow/TensorBuilder.h" +#include "util/TracingCtx.h" + #include namespace onert @@ -66,6 +64,36 @@ private: std::shared_ptr _config; }; +void initializeSubgraphIOTensors(compiler::LoweredGraph &lowered_graph, + const ir::OperandIndexSequence &indices) +{ + // TODO Store controlflow backend in BackendContext + std::shared_ptr cf_tensor_reg; + for (const auto &e : lowered_graph.backend_contexts()) + { + auto backend = e.first; + auto &context = e.second; + if (backend->config()->id() == backend::controlflow::Config::ID) + { + cf_tensor_reg = + std::dynamic_pointer_cast(context->tensor_registry); + } + } + assert(cf_tensor_reg); + + for (auto ind : indices) + { + const auto &operand = lowered_graph.graph().operands().at(ind); + auto tensor = std::make_unique( + operand.info(), + ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */ + ); + + // Add tensor to controlflow TensorRegistry. 
+ cf_tensor_reg->setNativeIOTensor(ind, std::move(tensor)); + } +} + } // namespace } // namespace onert @@ -134,97 +162,6 @@ void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_g } } -void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph, - const std::vector &order) -{ - for (const auto index : order) - { - const auto &op_seq = lowered_graph->op_seqs().at(index); - const auto backend = lowered_graph->getLowerInfo(index)->backend(); - const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register; - auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder; - auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs(); - - if (tensor_register) - { - // Custom registration - tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo()); - } - else - { - // Default registration - for (const auto op_idx : op_seq) - { - const auto &op = lowered_graph->graph().operations().at(op_idx); - for (const auto &index : - (op.getInputs() | ir::Remove::UNDEFINED) + (op.getOutputs() | ir::Remove::UNDEFINED)) - { - if (!tensor_builder->isRegistered(index) && !model_io.contains(index)) - { - const auto &operand_lower_info = - lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement(); - - // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl) - // op.getOutputs() of permute (CPU) returns tensor A - // but tensor A belongs to the backend of acl_cl. - // So, we have to make this tensor NOT registered for CPU. - if (operand_lower_info.backend() != backend) - continue; - - const auto &obj = lowered_graph->graph().operands().at(index); - const auto frontend_layout = op_seq.getLayout(); - const auto backend_layout = operand_lower_info.layout(); - ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), - obj.typeInfo(), obj.info().memAllocType(), - obj.isConstant()}; - tensor_builder->registerTensorInfo(index, backend_info, backend_layout); - } - } - } - } - } -} - -std::vector -ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, - const ir::OperandIndexSequence &indices) -{ - std::vector ret; - - // TODO Store controlflow backend in BackendContext - std::shared_ptr cf_tensor_builder; - std::shared_ptr cf_tensor_reg; - for (const auto &e : lowered_graph.backend_contexts()) - { - auto backend = e.first; - auto &context = e.second; - if (backend->config()->id() == backend::controlflow::Config::ID) - { - cf_tensor_builder = - std::dynamic_pointer_cast(context->tensor_builder); - cf_tensor_reg = - std::dynamic_pointer_cast(context->tensor_registry); - } - } - assert(cf_tensor_builder); - assert(cf_tensor_reg); - - for (auto ind : indices) - { - const auto &operand = lowered_graph.graph().operands().at(ind); - auto tensor = std::make_unique( - operand.info(), - ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */ - ); - - // Add tensor to controlflow TensorRegistry. 
- cf_tensor_reg->setNativeUserTensor(ind, std::move(tensor)); - auto *itensor = cf_tensor_reg->getITensor(ind); - ret.push_back(itensor); - } - return ret; -} - void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph) { TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true}; @@ -260,110 +197,78 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr lo initializeBackendContext(lowered_graph.get()); - // linearize - assert(!lowered_graph->graph().isBuildingPhase()); - - /************************************************* - * Backend dependent analysis & optimization phase - *************************************************/ - - for (auto &pair : backend_contexts) - { - auto &optimizer = pair.second->optimizer; - if (optimizer) - optimizer->optimize(); - } + TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true}; - /********************************************************** - * Backend dependent analysis & optimization phase finished - **********************************************************/ + assert(!lowered_graph->graph().isBuildingPhase()); - /*********************** - * Code generation phase - ***********************/ + initializeSubgraphIOTensors( + *lowered_graph, (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) | + ir::Remove::DUPLICATED | ir::Remove::UNDEFINED); + // linearize auto order = Linear::linearize(*lowered_graph); - runTensorRegistration(lowered_graph.get(), order); - - std::vector input_tensors; - std::vector output_tensors; - if (options.is_primary_subgraph) - { - input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); - output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs()); - } - Linear::dump(*lowered_graph, order); - Linear::planTensors(*lowered_graph, order); - TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true}; - TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true}; - - for (auto &tensor_builder : tensor_builders) + for (auto &pair : backend_contexts) { - tensor_builder->prepare(); + pair.second->genTensors(order, lowered_graph->op_seqs(), *lowered_graph->getLowerInfo()); } prepareMigrantTensors(*lowered_graph); - ExecutionBuilder builder; - - // Generate kernels - lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index, - const ir::OpSequence &op_seq) { - auto lower_info = lowered_graph->getLowerInfo(op_seq_index); - auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen; - // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow - auto cf_kernel_gen = dynamic_cast(kernel_gen.get()); - if (cf_kernel_gen != nullptr) + // Give some runtime objects to controlflow KernelGenerator + for (auto &pair : backend_contexts) + { + auto cf_context = dynamic_cast(pair.second.get()); + if (cf_context != nullptr) { + auto cf_kernel_gen = cf_context->kernel_gen; cf_kernel_gen->setTensorRegistries(tensor_regs); cf_kernel_gen->setExecutorMap(executor_map); } - auto fn_seq = kernel_gen->generate(op_seq); - if (options.he_profiling_mode) - { - fn_seq->wrap(lower_info->backend()->config()); - } - builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)}); - }); - - for (auto &tensor_builder : tensor_builders) - { - tensor_builder->allocate(); } + ExecutionBuilder builder; + + // Adjust the order of backends for the upcoming iteration + std::deque> ordered_contexts; for (auto &pair : backend_contexts) { - 
pair.second->initConsts(); + // NOTE controlflow backend must be processed lastly. + // This is because of Permute layer's specialty which is the only operation that could have + // different ITensor objects for the input and the output. And it requires all other backends' + // tensors are ready to use. + if (pair.first->config()->id() == "controlflow") + ordered_contexts.emplace_back(pair.first, pair.second.get()); + else + ordered_contexts.emplace_front(pair.first, pair.second.get()); } - lowered_graph->graph().operands().iterate( - [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - - auto code_map = builder.releaseCodeMap(); - - for (auto &it : code_map) + // Generate kernels + for (auto &pair : ordered_contexts) { - auto op_seq_index = it.first; - auto &fn_seq = it.second.fn_seq; - - fn_seq->iterate([&](exec::IFunction &ifunc) { - ifunc.prepare(); - auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend(); - auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder; - tensor_builder->postFunctionPrepare(); - }); + auto codes = pair.second->genKernels(order, lowered_graph->op_seqs()); + for (auto &pair : codes) + { + auto &op_seq_ind = pair.first; + auto &fn_seq = pair.second; + auto &op_seq = lowered_graph->op_seqs().at(op_seq_ind); + auto lower_info = lowered_graph->getLowerInfo(op_seq_ind); + if (options.he_profiling_mode) + fn_seq->wrap(lower_info->backend()->config()); + builder.append(op_seq_ind, {&op_seq, lower_info, std::move(fn_seq)}); + } } - auto exec = - new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(code_map), order}; + auto code_map = builder.releaseCodeMap(); + + auto exec = new exec::LinearExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map), + order, options.tracing_ctx}; if (!options.trace_filepath.empty()) { - std::unique_ptr ctp = - std::make_unique(options.trace_filepath, exec->graph()); + std::unique_ptr ctp = std::make_unique( + options.trace_filepath, exec->graph(), options.tracing_ctx); exec->addObserver(std::move(ctp)); } @@ -378,100 +283,81 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( initializeBackendContext(lowered_graph.get()); - auto order = Linear::linearize(*lowered_graph); - runTensorRegistration(lowered_graph.get(), order); - - std::vector input_tensors; - std::vector output_tensors; - if (options.is_primary_subgraph) - { - input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); - output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs()); - } - - TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true}; TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true}; - // To make tensors never be deallocated, this is a workaround to use static memory planner - for (auto &tensor_builder : tensor_builders) - { - lowered_graph->graph().operands().iterate( - [&](const ir::OperandIndex &ind, const ir::Operand &) { - if (tensor_builder->isRegistered(ind)) - { - tensor_builder->notifyFirstUse(ind); - } - }); - } + assert(!lowered_graph->graph().isBuildingPhase()); + + initializeSubgraphIOTensors( + *lowered_graph, (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) | + ir::Remove::DUPLICATED | ir::Remove::UNDEFINED); - for (auto &tensor_builder : tensor_builders) + // linearize + // This order is just for giving topological order info to the backens + // TODO When we pass a partial graph to a 
backend, we can remove this + auto order = Linear::linearize(*lowered_graph); + for (auto &pair : backend_contexts) { - tensor_builder->prepare(); + pair.second->genTensors(order, lowered_graph->op_seqs(), *lowered_graph->getLowerInfo()); } prepareMigrantTensors(*lowered_graph); - ExecutionBuilder builder; - - // Generate kernels - lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index, - const ir::OpSequence &op_seq) { - auto lower_info = lowered_graph->getLowerInfo(op_seq_index); - auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen; - // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow - auto cf_kernel_gen = dynamic_cast(kernel_gen.get()); - if (cf_kernel_gen != nullptr) + // Give some runtime objects to controlflow KernelGenerator + for (auto &pair : backend_contexts) + { + auto cf_context = dynamic_cast(pair.second.get()); + if (cf_context != nullptr) { - assert(cf_kernel_gen != nullptr); + auto cf_kernel_gen = cf_context->kernel_gen; cf_kernel_gen->setTensorRegistries(tensor_regs); cf_kernel_gen->setExecutorMap(executor_map); } - auto fn_seq = kernel_gen->generate(op_seq); - if (options.he_profiling_mode) - { - fn_seq->wrap(lower_info->backend()->config()); - } - builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)}); - }); - - for (const auto &tensor_builder : tensor_builders) - { - tensor_builder->allocate(); } + ExecutionBuilder builder; + + // Adjust the order of backends for the upcoming iteration + std::deque> ordered_contexts; for (auto &pair : backend_contexts) { - pair.second->initConsts(); + // NOTE controlflow backend must be processed lastly. + // This is because of Permute layer's specialty which is the only operation that could have + // different ITensor objects for the input and the output. And it requires all other backends' + // tensors are ready to use. 
+ if (pair.first->config()->id() == "controlflow") + ordered_contexts.emplace_back(pair.first, pair.second.get()); + else + ordered_contexts.emplace_front(pair.first, pair.second.get()); } - lowered_graph->graph().operands().iterate( - [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - - auto code_map = builder.releaseCodeMap(); - - for (auto &it : code_map) + // Generate kernels + for (auto &pair : ordered_contexts) { - auto op_seq_index = it.first; - auto &fn_seq = it.second.fn_seq; - - fn_seq->iterate([&](exec::IFunction &ifunc) { - ifunc.prepare(); - auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend(); - auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder; - tensor_builder->postFunctionPrepare(); - }); + auto codes = pair.second->genKernels(order, lowered_graph->op_seqs()); + for (auto &pair : codes) + { + auto &op_seq_ind = pair.first; + auto &fn_seq = pair.second; + auto &op_seq = lowered_graph->op_seqs().at(op_seq_ind); + auto lower_info = lowered_graph->getLowerInfo(op_seq_ind); + if (options.he_profiling_mode) + fn_seq->wrap(lower_info->backend()->config()); + builder.append(op_seq_ind, {&op_seq, lower_info, std::move(fn_seq)}); + } } + auto code_map = builder.releaseCodeMap(); + exec::ExecutorBase *exec = nullptr; if (parallel) { - exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors, - tensor_regs, std::move(code_map)}; + exec = new exec::ParallelExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map), + options.tracing_ctx}; } else { - auto dataflow_exec = new exec::DataflowExecutor{ - std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, std::move(code_map)}; + auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), tensor_regs, + std::move(code_map), options.tracing_ctx}; if (options.he_profiling_mode) { std::vector backends; @@ -489,8 +375,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( if (!options.trace_filepath.empty()) { - std::unique_ptr ctp = - std::make_unique(options.trace_filepath, exec->graph()); + std::unique_ptr ctp = std::make_unique( + options.trace_filepath, exec->graph(), options.tracing_ctx); exec->addObserver(std::move(ctp)); } diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index e76b721..06dc691 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -46,9 +46,6 @@ private: static void initializeBackendContext(compiler::LoweredGraph *lowered_graph); static void runTensorRegistration(compiler::LoweredGraph *lowered_graph, const std::vector &order); - static std::vector - initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, - const ir::OperandIndexSequence &indices); static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph); static exec::IExecutor * createLinearExecutor(std::unique_ptr lowered_graph, diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc index 30c8f72..fdd2a76 100644 --- a/runtime/onert/core/src/compiler/Linear.cc +++ b/runtime/onert/core/src/compiler/Linear.cc @@ -19,8 +19,6 @@ #include "Linear.h" #include "backend/IConfig.h" -#include "backend/IConstantInitializer.h" -#include "backend/ITensorRegister.h" #include "backend/Backend.h" #include "util/logging.h" @@ -62,190 +60,5 @@ void Linear::dump(const compiler::LoweredGraph &lowered_graph, } } -void 
Linear::planTensors(const compiler::LoweredGraph &lowered_graph, - const std::vector &order) -{ - const auto &graph = lowered_graph.graph(); - ir::OperandIndexMap> tensor_builder_map; - - ir::OperandIndexMap uses_map; - ir::OperandIndexMap def_map; - ir::OperandIndexSequence constants; - - // Prepare scanning - graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - const auto lower_info = lowered_graph.getLowerInfo(ind); - // TODO Remove if onert doesn't support anymore such as - // GeneratedTests.reshape_quant8_weights_as_inputs - if (lower_info->def_factors().size() == 0 && lower_info->use_factors().size() == 0 && - !graph.getInputs().contains(ind)) - { - VERBOSE(LINEAR) << "Operand #" << ind.value() << " will not be used. no more process." - << std::endl; - return; - } - - // Unused input of subgraph - // TODO Register unused input as nullptr in tensor_builder - if (lower_info->def_factors().size() == 0 && lower_info->use_factors().size() == 0 && - graph.getInputs().contains(ind)) - { - VERBOSE(LINEAR) << "Operand #" << ind.value() << " will not be used. no more process." - << std::endl; - return; - } - - uses_map[ind] = obj.getUses().size(); - def_map[ind] = obj.getDef().valid() ? 1 : 0; - - bool is_const = obj.isConstant(); - if (is_const) - { - constants.append(ind); - } - - auto factor = lower_info->def_factors().getOnlyElement(); - auto backend = factor.backend(); - auto tensor_builder = lowered_graph.backend_contexts().at(backend)->tensor_builder; - if (!tensor_builder->isRegistered(ind)) - { - // These tensors do not exist in any op_seq (No use and def) - const auto info = obj.info(); - const auto backend_layout = factor.layout(); - // TODO Change tensor info to have permuted shape - tensor_builder->registerTensorInfo(ind, info, backend_layout); - } - - tensor_builder_map[ind] = tensor_builder; - }); - - const auto io_tensors = - (graph.getInputs() + graph.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - - // If a tensor is model output, increase the use of the tensor. - // This aim is same to above one. - for (const auto &ind : io_tensors) - { - uses_map[ind]++; - } - - // Start scanning to do notify{First|Last}Use for each tensor - - // If a tensor is a constant, increase the use of the tensor. - // It makes the tensor not be dealloced. It means these will be deallocated last. - // And allocate constant operands first - VERBOSE(LINEAR) << "TENSORS as CONSTANT" << std::endl; - for (const auto &ind : constants) - { - uses_map[ind]++; - tensor_builder_map[ind]->notifyFirstUse(ind); - } - - // Allocate Model's inputs - VERBOSE(LINEAR) << "TENSORS as MODEL INPUT" << std::endl; - for (const auto &ind : graph.getInputs() | ir::Remove::DUPLICATED) - { - auto tensor_builder = tensor_builder_map[ind]; - if (!tensor_builder) // for GeneratedTests.xxx_weights_as_inputs - continue; - tensor_builder->notifyFirstUse(ind); - } - - // At each operation, - // 1. Scan DEF of outputs. If the DEF, allocate it - // 2. Scan DEF of inputs. If variable tensor, allocate it - // 3. Scan USE of inputs. 
Decrease the USE and deallocate if the USE is 0 - VERBOSE(LINEAR) << "TENSORS" << std::endl; - for (const auto op_seq_ind : order) - { - const auto &op_seq = lowered_graph.op_seqs().at(op_seq_ind); - for (const auto &op_idx : op_seq.operations()) - { - for (const auto &ind : graph.operations().at(op_idx).getOutputs() | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED) - { - assert(def_map.find(ind) != def_map.end()); - if (def_map[ind]) - { - def_map[ind] = 0; - tensor_builder_map[ind]->notifyFirstUse(ind); - } - } - - // Scan variable tensors - // This tensor has features like constant. But OperandInfo and LowerInfo treat them as - // non-constant because of less memory usage by memory planning in here - for (const auto &ind : graph.operations().at(op_idx).getInputs() | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED) - { - const auto &operand = graph.operands().at(ind); - if (operand.info().isVariable()) - { - // The variable tensor with buffer is not supported yet - assert(operand.data() == nullptr); - assert(operand.getUses().size() == 1 && !operand.getDef().valid()); - assert(lowered_graph.getLowerInfo(ind)->def_factors().size() == 1 && - lowered_graph.getLowerInfo(ind)->use_factors().size() == 1); - assert(uses_map[ind] == 1 && def_map[ind] == 0); - tensor_builder_map[ind]->notifyFirstUse(ind); - } - } - - for (const auto &ind : graph.operations().at(op_idx).getInputs() | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED) - { - assert(uses_map.find(ind) != uses_map.end()); - assert(uses_map[ind] > 0); - uses_map[ind]--; - if (uses_map[ind] == 0) - { - // plan for deallocation of static tensornode - tensor_builder_map[ind]->notifyLastUse(ind); - - // plan for deallocation of dynamic tensor - auto dyn_tensor_manager = tensor_builder_map[ind]->dynamicTensorManager(); - if (dyn_tensor_manager) - { - const auto *backend = - lowered_graph.getLowerInfo(ind)->def_factors().getOnlyElement().backend(); - auto &tensor_registry = lowered_graph.backend_contexts().at(backend)->tensor_registry; - auto *tensor = tensor_registry->getITensor(ind); - assert(tensor); - if (!io_tensors.contains(ind)) // I/O tensors cannot be deallocated - dyn_tensor_manager->planDealloc(op_idx, tensor); - } - } - } - } - } - - // Dispose and validate - for (const auto &ind : io_tensors) - { - --uses_map[ind]; - if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice - { - tensor_builder_map[ind]->notifyLastUse(ind); - } - } - - for (const auto &ind : constants) - { - --uses_map[ind]; - if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice - { - tensor_builder_map[ind]->notifyLastUse(ind); - } - } - - assert( - std::all_of(uses_map.begin(), uses_map.end(), - [](std::pair it) { return it.second == 0; })); - - assert( - std::all_of(def_map.begin(), def_map.end(), - [](std::pair it) { return it.second == 0; })); -} - } // namespace compiler } // namespace onert diff --git a/runtime/onert/core/src/compiler/Linear.h b/runtime/onert/core/src/compiler/Linear.h index 1e24cf9..56b42cc 100644 --- a/runtime/onert/core/src/compiler/Linear.h +++ b/runtime/onert/core/src/compiler/Linear.h @@ -22,7 +22,6 @@ #include "ir/OpSequences.h" #include "ir/Index.h" -#include "backend/ITensorBuilder.h" #include "compiler/LoweredGraph.h" namespace onert @@ -44,8 +43,6 @@ public: static std::vector linearize(const compiler::LoweredGraph &lowered_graph); static void dump(const compiler::LoweredGraph &lowered_graph, const std::vector &order); - static void planTensors(const compiler::LoweredGraph 
&lowered_graph, - const std::vector &order); }; } // namespace compiler diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc index 673d7d3..6d5210d 100644 --- a/runtime/onert/core/src/compiler/LoweredGraph.cc +++ b/runtime/onert/core/src/compiler/LoweredGraph.cc @@ -32,6 +32,7 @@ #include "compiler/BackendResolver.h" #include "compiler/ManualScheduler.h" #include "compiler/HEScheduler.h" +#include "util/TracingCtx.h" namespace onert { @@ -40,6 +41,13 @@ namespace compiler LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &options) : _graph{graph} { + // set tracing_ctx for copied graph + if (options.tracing_ctx) + { + auto subgraph_index = options.tracing_ctx->getSubgraphIndex(&graph); + options.tracing_ctx->setSubgraphIndex(&_graph, subgraph_index.value()); + } + bool linear_executor = (options.executor == "Linear"); // Build backend contexts @@ -112,7 +120,7 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option .run(); // Set LowerInfo for each operand from the operand::LowerInfo holder - manipulateLowerInfo(operands_lower_info, options.is_primary_subgraph); + manipulateLowerInfo(operands_lower_info); dumpLowerInfo(); } @@ -126,7 +134,11 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option // Optimization passes pass::PassRunner{}.append(std::make_unique(*this)).run(); - VERBOSE(OpSequences) << "Dump after permutation insertion" << std::endl; + VERBOSE(LoweredGraph) << "Dump after permutation insertion" << std::endl; + for (auto operand : _graph.getInputs()) + VERBOSE(LoweredGraph) << "Graph Input : " << operand << std::endl; + for (auto operand : _graph.getOutputs()) + VERBOSE(LoweredGraph) << "Graph Output : " << operand << std::endl; dumpOpSequences(_op_seqs, _graph.operations()); // Graph verifications @@ -322,50 +334,22 @@ void LoweredGraph::makeOpSequences( } void LoweredGraph::manipulateLowerInfo( - ir::OperandIndexMap> &operands_lower_info, - bool is_primary) + ir::OperandIndexMap> &operands_lower_info) { const auto controlflow_backend = BackendManager::get().getControlflow(); - // TODO Rather than handling primary graph specially, - // let the permute inserted and remove it later - if (is_primary) + // TODO Rather than using NHWC Get frontend layout of this node from IR + auto factor = ir::operand::PermuteFactor{controlflow_backend, ir::Layout::NHWC}; + for (auto index : _graph.getInputs() | ir::Remove::UNDEFINED) { - // TODO Rather than using NHWC Get frontend layout of this node from IR - auto factor = ir::operand::PermuteFactor{controlflow_backend, ir::Layout::NHWC}; - for (auto index : _graph.getInputs() | ir::Remove::UNDEFINED) - { - auto &&lower_info = operands_lower_info.at(index); - assert(lower_info->def_factors().empty()); - lower_info->addDefPermuteFactor(factor); - } - for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) - { - auto &&lower_info = operands_lower_info.at(index); - lower_info->addUsePermuteFactor(factor); - } + auto &&lower_info = operands_lower_info.at(index); + assert(lower_info->def_factors().empty()); + lower_info->addDefPermuteFactor(factor); } - else + for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { - for (auto index : _graph.getInputs() | ir::Remove::UNDEFINED) - { - auto &&lower_info = operands_lower_info.at(index); - if (!(lower_info->def_factors().size() == 0 && lower_info->use_factors().size() == 0)) - { - // In case of not that Graph's input is not used in any operation and 
not the graph's - // output. - // In other words, it is not unused input in Graph. - lower_info->addDefPermuteFactor(*lower_info->use_factors().begin()); - } - else - { - // In case of that an operand is Graph's input and not input or output of any operation - lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{ - controlflow_backend, - ir::Layout::NHWC // TODO Get frontend layout of this node from IR - }); - } - } + auto &&lower_info = operands_lower_info.at(index); + lower_info->addUsePermuteFactor(factor); } for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { @@ -446,8 +430,11 @@ void LoweredGraph::dumpLowerInfo() sstream << (shape.dim(i)) << " "; } sstream << "}" << std::endl; - sstream << " - Def ir::Operations : " << def_ops << std::endl; - sstream << " - Use ir::Operations : " << use_ops << std::endl; + sstream << " - Def Operations : " << def_ops << std::endl; + sstream << " - Use Operations : " << use_ops << std::endl; + sstream << " - Data : " + << (object.data() ? (std::to_string(object.data()->size()) + " bytes") : "N/A") + << std::endl; sstream << " - Lower Info" << std::endl; sstream << " - Def Backends : " << def_layouts << std::endl; sstream << " - Use Backends : " << use_layouts << std::endl; diff --git a/runtime/onert/core/src/compiler/ManualScheduler.cc b/runtime/onert/core/src/compiler/ManualScheduler.cc index ed49ee5..1f4a478 100644 --- a/runtime/onert/core/src/compiler/ManualScheduler.cc +++ b/runtime/onert/core/src/compiler/ManualScheduler.cc @@ -100,10 +100,11 @@ std::unique_ptr ManualScheduler::schedule(const ir::Graph &grap } // Dump final assignment - backend_resolver->iterate([&](const ir::OperationIndex &index, const backend::Backend &backend) { - VERBOSE(ManualScheduler) << "backend for operation #" << index.value() << ": " - << backend.config()->id() << std::endl; - }); + WHEN_LOG_ENABLED(backend_resolver->iterate( + [&](const ir::OperationIndex &index, const backend::Backend &backend) { + VERBOSE(ManualScheduler) << "backend for operation #" << index.value() << ": " + << backend.config()->id() << std::endl; + })); return backend_resolver; } diff --git a/runtime/onert/core/src/compiler/ShapeValidator.cc b/runtime/onert/core/src/compiler/ShapeValidator.cc index c18178d..e0c9f52 100644 --- a/runtime/onert/core/src/compiler/ShapeValidator.cc +++ b/runtime/onert/core/src/compiler/ShapeValidator.cc @@ -37,7 +37,7 @@ namespace compiler { ShapeValidator::ShapeValidator(const ir::Graph &graph) - : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN} + : _graph{graph}, _ctx{graph.operands()}, _current_layout{ir::Layout::UNKNOWN} { } @@ -59,7 +59,7 @@ void ShapeValidator::operator()() // creating Compiler assert(_graph.subgraphs() == nullptr); - _current_op_seq_layout = _graph.layout(); + _current_layout = _graph.layout(); _graph.operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); @@ -90,7 +90,7 @@ void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); @@ -101,6 +101,14 @@ void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node) OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) 
== 2); + if (node.getInputs().size() != 2) + { + const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)}; + OP_REQUIRES(_ctx.at(crops_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(crops_index).shape().dim(0) == (_ctx.at(ifm_index).shape().rank() - 2)); + OP_REQUIRES(_ctx.at(crops_index).shape().dim(1) == 2); + } + OP_REQUIRES(input_shape.C == output_shape.C); } @@ -330,7 +338,7 @@ void ShapeValidator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); @@ -355,7 +363,7 @@ void ShapeValidator::visit(const ir::operation::SpaceToDepth &node) const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); const auto block_size = node.param().block_size; @@ -471,7 +479,7 @@ void ShapeValidator::visit(const ir::operation::TransposeConv &node) OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank()); OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank()); - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); // The kernel has only IHWO layout on frontend @@ -516,7 +524,7 @@ void ShapeValidator::visit(const ir::operation::DepthToSpace &node) const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; - const auto frontend_layout = _current_op_seq_layout; + const auto frontend_layout = _current_layout; const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout); const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout); diff --git a/runtime/onert/core/src/compiler/ShapeValidator.h b/runtime/onert/core/src/compiler/ShapeValidator.h index f40c098..763cf7c 100644 --- a/runtime/onert/core/src/compiler/ShapeValidator.h +++ b/runtime/onert/core/src/compiler/ShapeValidator.h @@ -93,7 +93,7 @@ private: // TODO Remove _ctx field const ir::Graph &_graph; const ir::Operands &_ctx; - ir::Layout _current_op_seq_layout; + ir::Layout _current_layout; }; } // namespace compiler diff --git a/runtime/onert/core/src/compiler/StaticShapeInferer.cc b/runtime/onert/core/src/compiler/StaticShapeInferer.cc index d3b083b..1f2c6f3 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInferer.cc +++ b/runtime/onert/core/src/compiler/StaticShapeInferer.cc @@ -142,12 +142,12 @@ void StaticShapeInferer::dump() } } -void StaticShapeInferer::visit(const ir::operation::ArgMax &op) +void StaticShapeInferer::visit(const ir::operation::ArgMinMax &op) { - const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto input_idx{op.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)}; const auto &input = _operands.at(input_idx); - 
const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto axis_idx{op.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)}; const auto &axis = _operands.at(axis_idx); // get mutable output operand @@ -166,7 +166,8 @@ void StaticShapeInferer::visit(const ir::operation::ArgMax &op) axis_value = axis_value < 0 ? axis_value + rank : axis_value; // re-sizing output shape - ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis_value, rank); + ir::Shape new_shape = + shape_inference::inferArgMinMaxShape(input.info().shape(), axis_value, rank); output.info().shape(new_shape); } @@ -335,35 +336,47 @@ void StaticShapeInferer::visit(const ir::operation::ExpandDims &op) // even when axis is constant, output shape should be recalculated since user might call // nnfw_set_input_tensorinfo(input, some_new_shape) - auto axis_buf = reinterpret_cast(axis.data()->base()); - assert(axis_buf); + auto axis_type = axis.typeInfo().type(); + assert(axis_type == ir::DataType::INT32 || axis_type == ir::DataType::INT64); + + assert(axis.data()->base()); + int32_t axis_value = + (axis_type == ir::DataType::INT32) + ? reinterpret_cast(axis.data()->base())[0] + : static_cast(reinterpret_cast(axis.data()->base())[0]); // re-sizing output shape - ir::Shape new_shape = shape_inference::inferExpandDimsShape(input.info().shape(), axis_buf[0]); + ir::Shape new_shape = shape_inference::inferExpandDimsShape(input.info().shape(), axis_value); output.info().shape(new_shape); } void StaticShapeInferer::visit(const ir::operation::Fill &op) { - const auto input_idx{op.getInputs().at(ir::operation::Fill::Input::INPUT)}; - const auto &input = _operands.at(input_idx); + const auto shape_idx{op.getInputs().at(ir::operation::Fill::Input::SHAPE)}; + const auto &shape = _operands.at(shape_idx); const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); - if (!input.isConstant()) + if (!shape.isConstant()) { output.info().setDynamic(); _return_has_dynamic_tensor = true; return; } - assert(input.typeInfo().type() == ir::DataType::INT32); + const auto dims_type = shape.typeInfo().type(); + assert(dims_type == ir::DataType::INT32 || dims_type == ir::DataType::INT64); - auto input_buf = reinterpret_cast(input.data()->base()); - assert(input_buf); + auto dims_buf = shape.data()->base(); + assert(dims_buf); + + const auto &dims_shape = shape.info().shape(); + auto new_shape = ((dims_type == ir::DataType::INT32) + ? shape_inference::inferFillShape( + dims_shape, reinterpret_cast(dims_buf)) + : shape_inference::inferFillShape( + dims_shape, reinterpret_cast(dims_buf))); - // re-sizing output shape - ir::Shape new_shape = shape_inference::inferFillShape(input.info().shape(), input_buf); output.info().shape(new_shape); } diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h deleted file mode 100644 index 3b0360b..0000000 --- a/runtime/onert/core/src/compiler/TensorBuilders.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_COMPILER_TENSOR_BUILDERS_H__ -#define __ONERT_COMPILER_TENSOR_BUILDERS_H__ - -#include -#include -#include "backend/BackendContext.h" -#include "backend/Backend.h" -#include "backend/controlflow/Config.h" -#include "backend/controlflow/TensorBuilder.h" -#include "util/logging.h" - -namespace onert -{ -namespace compiler -{ - -class TensorBuilders -{ -public: - TensorBuilders() = default; - - TensorBuilders(const onert::backend::BackendContexts &backend_contexts, bool include_controlflow) - { - for (const auto &e : backend_contexts) - { - if (e.first->config()->id() == backend::controlflow::Config::ID) - { - _cf_tensor_builder = std::dynamic_pointer_cast( - e.second->tensor_builder); - if (include_controlflow) - _tensor_builders.insert(e.second->tensor_builder); - } - else - { - _tensor_builders.insert(e.second->tensor_builder); - } - } - } - - std::unordered_set>::const_iterator begin() const - { - return _tensor_builders.cbegin(); - } - std::unordered_set>::const_iterator end() const - { - return _tensor_builders.cend(); - } - - std::shared_ptr getControlflowTensorBuilder() const - { - return _cf_tensor_builder; - } - -private: - std::unordered_set> _tensor_builders; - std::shared_ptr _cf_tensor_builder; -}; - -} // namespace compiler -} // namespace onert - -#endif // __ONERT_COMPILER_TENSOR_BUILDERS_H__ diff --git a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc index c83a72a..8467d51 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc @@ -130,9 +130,11 @@ ir::OperationIndex PermutationInsertionPass::insertPermute(const ir::OperandInde // Generate output operand and permute operation auto out_operand_index = _graph.addOperand(operand.shape(), operand.typeInfo()); - // change model output if operand_index is model output index + // change model output if operand_index is model output index and the out operand is controlflow + // backend auto &model_outputs = _graph.getOutputs(); - if (model_outputs.contains(operand_index)) + const backend::Backend *cf_backend = compiler::BackendManager::get().getControlflow(); + if (model_outputs.contains(operand_index) && factor.backend() == cf_backend) { model_outputs.replace(operand_index, out_operand_index); } @@ -191,8 +193,10 @@ ir::OperationIndex PermutationInsertionPass::insertPermute(const ir::OperandInde const auto &node = _graph.operations().at(node_index); VERBOSE_F() << "Permute Op inserted, node index : " << node_index << std::endl; - VERBOSE_F() << " - Input (original) Operand : " << operand_index << std::endl; - VERBOSE_F() << " - Output(inserted) Operand : " << out_operand_index << std::endl; + VERBOSE_F() << " - Input (original) Operand : " << operand_index << "(" + << input_factor.backend()->config()->id() << ")" << std::endl; + VERBOSE_F() << " - Output(inserted) Operand : " << out_operand_index << "(" + << factor.backend()->config()->id() << ")" << std::endl; // OpSequence { diff --git 
a/runtime/onert/core/src/exec/DataflowExecutor.cc b/runtime/onert/core/src/exec/DataflowExecutor.cc index 53bc3c2..b81a757 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.cc +++ b/runtime/onert/core/src/exec/DataflowExecutor.cc @@ -78,11 +78,10 @@ bool DataflowExecutor::noWaitingJobs() } DataflowExecutor::DataflowExecutor(std::unique_ptr lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, const compiler::TensorRegistries &tensor_regs, - compiler::CodeMap &&code_map) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs}, + compiler::CodeMap &&code_map, + const util::TracingCtx *tracing_ctx) + : ExecutorBase{std::move(lowered_graph), tensor_regs, tracing_ctx}, _code_map{std::move(code_map)} { VERBOSE(DataflowExecutor) << "Constructing Dataflow Executor" << std::endl; @@ -143,7 +142,9 @@ void DataflowExecutor::executeImpl() } assert(!_ready_jobs.empty()); // Cannot begin if there is no initial jobs - _subject.notifyModelBegin(this); + auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph); + + _subject.notifySubgraphBegin(profiling_subg_index); while (!_ready_jobs.empty()) { @@ -157,7 +158,7 @@ void DataflowExecutor::executeImpl() const backend::Backend *backend = _lowered_graph->getLowerInfo()->op_seq.at(op_seq_index)->backend(); - _subject.notifyJobBegin(this, op_seq, backend); + _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend); job->fn_seq()->initRunning(); @@ -167,13 +168,13 @@ void DataflowExecutor::executeImpl() job->run(); - _subject.notifyJobEnd(this, op_seq, backend); + _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend); notify(job_index); _finished_jobs[job_index] = std::move(job); } assert(noWaitingJobs()); - _subject.notifyModelEnd(this); + _subject.notifySubgraphEnd(profiling_subg_index); // Reset input info for the next execution _input_info = _initial_input_info; diff --git a/runtime/onert/core/src/exec/DataflowExecutor.h b/runtime/onert/core/src/exec/DataflowExecutor.h index 69dfda1..b72c0d0 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.h +++ b/runtime/onert/core/src/exec/DataflowExecutor.h @@ -28,6 +28,7 @@ #include #include "exec/ExecutorBase.h" #include "compiler/CodeMap.h" +#include "util/TracingCtx.h" namespace onert { @@ -50,9 +51,8 @@ public: * @param code_map OpSequence and its code map */ DataflowExecutor(std::unique_ptr lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, - const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map, + const util::TracingCtx *tracing_ctx); void executeImpl() override; diff --git a/runtime/onert/core/src/exec/DynamicShapeInferer.cc b/runtime/onert/core/src/exec/DynamicShapeInferer.cc index 1666d3f..2d9d534 100644 --- a/runtime/onert/core/src/exec/DynamicShapeInferer.cc +++ b/runtime/onert/core/src/exec/DynamicShapeInferer.cc @@ -92,12 +92,12 @@ void DynamicShapeInferer::handleSimpleUnaryOp(const ir::Operation &op, assert(output->buffer() != nullptr); } -void DynamicShapeInferer::visit(const ir::operation::ArgMax &op) +void DynamicShapeInferer::visit(const ir::operation::ArgMinMax &op) { - const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto input_idx{op.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)}; const auto input = _tensor_registry->getITensor(input_idx); - const auto 
axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto axis_idx{op.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)}; const auto axis = _tensor_registry->getITensor(axis_idx); auto output_ind = op.getOutputs().at(0); @@ -111,7 +111,7 @@ void DynamicShapeInferer::visit(const ir::operation::ArgMax &op) const auto rank = input_shape.rank(); axis_value = axis_value < 0 ? axis_value + rank : axis_value; - ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis_value, rank); + ir::Shape new_shape = shape_inference::inferArgMinMaxShape(input_shape, axis_value, rank); output->applyShape(new_shape); assert(output->buffer() != nullptr); @@ -388,10 +388,16 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op) auto axis_ind = op.getInputs().at(ir::operation::ExpandDims::AXIS); auto axis = _tensor_registry->getITensor(axis_ind); - auto axis_buf = reinterpret_cast(axis->buffer()); - assert(axis_buf); + auto axis_type = axis->data_type(); + assert(axis_type == ir::DataType::INT32 || axis_type == ir::DataType::INT64); - auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_buf[0]); + assert(axis->buffer()); + int32_t axis_value = + (axis_type == ir::DataType::INT32) + ? reinterpret_cast(axis->buffer())[0] + : static_cast(reinterpret_cast(axis->buffer())[0]); + + auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_value); output->applyShape(output_shape); assert(output->buffer() != nullptr); @@ -402,19 +408,24 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op) // check if output is not dynamic auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - auto input_ind = op.getInputs().at(ir::operation::Fill::Input::INPUT); - auto input = _tensor_registry->getITensor(input_ind); - ir::Shape input_shape = input->getShape(); + auto shape_ind = op.getInputs().at(ir::operation::Fill::Input::SHAPE); + auto shape = _tensor_registry->getITensor(shape_ind); - if ((!input->is_dynamic()) && (!output->is_dynamic())) + if ((!shape->is_dynamic()) && (!output->is_dynamic())) return; - assert(input->data_type() == ir::DataType::INT32); + const auto dims_type = shape->data_type(); + assert(dims_type == ir::DataType::INT32 || dims_type == ir::DataType::INT64); - auto input_buf = reinterpret_cast(input->buffer()); - assert(input_buf); + auto dims_buf = shape->buffer(); + assert(dims_buf); - auto output_shape = shape_inference::inferFillShape(input_shape, input_buf); + const auto &dims_shape = shape->getShape(); + auto output_shape = ((dims_type == ir::DataType::INT32) + ? shape_inference::inferFillShape( + dims_shape, reinterpret_cast(dims_buf)) + : shape_inference::inferFillShape( + dims_shape, reinterpret_cast(dims_buf))); output->applyShape(output_shape); assert(output->buffer() != nullptr); diff --git a/runtime/onert/core/src/exec/ExecTime.h b/runtime/onert/core/src/exec/ExecTime.h index 846d093..d2ddbad 100644 --- a/runtime/onert/core/src/exec/ExecTime.h +++ b/runtime/onert/core/src/exec/ExecTime.h @@ -94,7 +94,7 @@ public: /** * @brief Update metrics file with new data. 
*/ - void uploadOperationsExecTime() const { _json.uploadOperationsExecTime(); } + void storeOperationsExecTime() const { _json.storeOperationsExecTime(); } static const int64_t NOT_FOUND = -1; private: diff --git a/runtime/onert/core/src/exec/ExecutionObservee.cc b/runtime/onert/core/src/exec/ExecutionObservee.cc index ddb1fb6..d5003b1 100644 --- a/runtime/onert/core/src/exec/ExecutionObservee.cc +++ b/runtime/onert/core/src/exec/ExecutionObservee.cc @@ -26,37 +26,38 @@ void ExecutionObservee::add(std::unique_ptr observer) _observers.emplace_back(std::move(observer)); } -void ExecutionObservee::notifyModelBegin(IExecutor *executor) +void ExecutionObservee::notifySubgraphBegin(ir::SubgraphIndex ind) { for (auto &o : _observers) { - o->handleBegin(executor); + o->handleSubgraphBegin(ind); } } -void ExecutionObservee::notifyModelEnd(IExecutor *executor) +void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind) { for (auto &o : _observers) { - o->handleEnd(executor); + o->handleSubgraphEnd(ind); } } -void ExecutionObservee::notifyJobBegin(IExecutor *executor, const ir::OpSequence *op_seq, +void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex index, + const ir::OpSequence *op_seq, const backend::Backend *backend) { for (auto &o : _observers) { - o->handleBegin(executor, op_seq, backend); + o->handleJobBegin(executor, index, op_seq, backend); } } -void ExecutionObservee::notifyJobEnd(IExecutor *executor, const ir::OpSequence *op_seq, - const backend::Backend *backend) +void ExecutionObservee::notifyJobEnd(IExecutor *executor, ir::SubgraphIndex index, + const ir::OpSequence *op_seq, const backend::Backend *backend) { for (auto &o : _observers) { - o->handleEnd(executor, op_seq, backend); + o->handleJobEnd(executor, index, op_seq, backend); } } diff --git a/runtime/onert/core/src/exec/ExecutionObservee.h b/runtime/onert/core/src/exec/ExecutionObservee.h index 49d409a..62b3f62 100644 --- a/runtime/onert/core/src/exec/ExecutionObservee.h +++ b/runtime/onert/core/src/exec/ExecutionObservee.h @@ -20,6 +20,7 @@ #include #include "exec/ExecutionObservers.h" +#include "ir/Index.h" namespace onert { @@ -39,11 +40,11 @@ public: * @param observer Observer to be added */ void add(std::unique_ptr observer); - void notifyModelBegin(IExecutor *executor); - void notifyModelEnd(IExecutor *executor); - void notifyJobBegin(IExecutor *executor, const ir::OpSequence *op_seq, + void notifySubgraphBegin(ir::SubgraphIndex ind); + void notifySubgraphEnd(ir::SubgraphIndex ind); + void notifyJobBegin(IExecutor *executor, ir::SubgraphIndex index, const ir::OpSequence *op_seq, const backend::Backend *backend); - void notifyJobEnd(IExecutor *executor, const ir::OpSequence *op_seq, + void notifyJobEnd(IExecutor *executor, ir::SubgraphIndex index, const ir::OpSequence *op_seq, const backend::Backend *backend); private: diff --git a/runtime/onert/core/src/exec/ExecutionObservers.cc b/runtime/onert/core/src/exec/ExecutionObservers.cc index 066b52e..18c0c1d 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.cc +++ b/runtime/onert/core/src/exec/ExecutionObservers.cc @@ -17,12 +17,62 @@ #include "exec/ExecutionObservers.h" #include +#include #include "util/logging.h" #include "exec/IExecutor.h" #include "misc/polymorphic_downcast.h" #include "ir/OpSequence.h" #include "util/EventWriter.h" +#include "util/Utils.h" + +namespace +{ + +void setUserData(const onert::ir::Graph &g, const onert::ir::OpSequence *op_seq, + decltype(EventCollector::Event::userData) &data) +{ + if (op_seq->size() == 0) 
+ return; + + // From a tensor of shape [a, b, c], this will return a string "shape(a b c)". + // String like "[1, 2, 3]" looks better but this will be considered as a list in Json + // so text search (e.g., Ctrl-F in Chrome Tracing) could be difficult + auto build_shape_str = [&](onert::ir::OperandIndex operand_idx) { + std::string shape_str; + auto &shape = g.operands().at(operand_idx).info().shape(); + for (int i = 0; i < shape.rank(); i++) + { + if (i == 0) + shape_str = "shape(" + std::to_string(shape.dim(i)); + else + shape_str += " " + std::to_string(shape.dim(i)); + } + shape_str += ")"; + + return shape_str; + }; + + const auto &first_op_idx = op_seq->operations().at(0); + const auto &first_op_node = g.operations().at(first_op_idx); + + auto &inputs = first_op_node.getInputs(); + auto size = inputs.size(); + for (size_t i = 0; i < size; i++) + { + auto operand_idx = inputs.at(i); + if (operand_idx.undefined()) + continue; + + std::string key("input_shape_" + std::to_string(i)); + std::string value = build_shape_str(operand_idx); + data.emplace_back(std::make_pair(key, value)); + } + + // add other userData as needed +} + +} // namespace namespace onert { @@ -30,8 +80,8 @@ namespace onert namespace exec { -void ProfileObserver::handleBegin(onert::exec::IExecutor *, const ir::OpSequence *, - const onert::backend::Backend *backend) +void ProfileObserver::handleJobBegin(onert::exec::IExecutor *, ir::SubgraphIndex, + const ir::OpSequence *, const onert::backend::Backend *backend) { _timer = backend->config()->timer(); if (_timer == nullptr) @@ -39,8 +89,8 @@ void ProfileObserver::handleBegin(onert::exec::IExecutor *, const ir::OpSequence _timer->handleBegin(); } -void ProfileObserver::handleEnd(IExecutor *exec, const ir::OpSequence *op_seq, - const backend::Backend *backend) +void ProfileObserver::handleJobEnd(IExecutor *exec, ir::SubgraphIndex, const ir::OpSequence *op_seq, + const backend::Backend *backend) { _timer->handleEnd(); const auto timer_res = _timer->getTime(); @@ -70,51 +120,74 @@ void ProfileObserver::handleEnd(IExecutor *exec, const ir::OpSequence *op_seq, } }; -ChromeTracingObserver::ChromeTracingObserver(const std::string &filepath, const ir::Graph &graph) - : _base_filepath(filepath), _recorder{}, _collector{&_recorder}, _graph{graph} +TracingObserver::TracingObserver(const std::string &filepath, const ir::Graph &graph, + const util::TracingCtx *tracing_ctx) + : _recorder{std::make_unique()}, _collector{_recorder.get()}, _graph{graph}, + _tracing_ctx{tracing_ctx} { + // TODO Remove below after using _tracing_ctx + UNUSED_RELEASE(_tracing_ctx); + + _event_writer = EventWriter::get(filepath); + _event_writer->startToUse(); } -ChromeTracingObserver::~ChromeTracingObserver() +TracingObserver::~TracingObserver() { try { - EventWriter{_recorder}.writeToFiles(_base_filepath); + _event_writer->readyToFlush(std::move(_recorder)); } catch (const std::exception &e) { - std::cerr << "E: Fail to record event in ChromeTracingObserver: " << e.what() << std::endl; + std::cerr << "E: Fail to record event in TracingObserver: " << e.what() << std::endl; } } -void ChromeTracingObserver::handleBegin(IExecutor *) +void TracingObserver::handleSubgraphBegin(ir::SubgraphIndex subg_ind) { + // TODO Write subg_ind into profling result + UNUSED_RELEASE(subg_ind); _collector.onEvent(EventCollector::Event{EventCollector::Edge::BEGIN, "runtime", "Graph"}); } -void ChromeTracingObserver::handleBegin(IExecutor *, const ir::OpSequence *op_seq, - const backend::Backend *backend) +void 
TracingObserver::handleJobBegin(IExecutor *, ir::SubgraphIndex subg_ind, + const ir::OpSequence *op_seq, const backend::Backend *backend) { + // TODO Write subg_ind into profling result + UNUSED_RELEASE(subg_ind); + std::string backend_id = backend->config()->id(); - _collector.onEvent(EventCollector::Event{EventCollector::Edge::BEGIN, backend_id, - opSequenceTag(op_seq, _graph.operations())}); + + auto ev = EventCollector::Event{EventCollector::Edge::BEGIN, backend_id, + opSequenceTag(op_seq, _graph.operations())}; + // add shape of inputs + setUserData(_graph, op_seq, ev.userData); + + _collector.onEvent(ev); } -void ChromeTracingObserver::handleEnd(IExecutor *, const ir::OpSequence *op_seq, - const backend::Backend *backend) +void TracingObserver::handleJobEnd(IExecutor *, ir::SubgraphIndex subg_ind, + const ir::OpSequence *op_seq, const backend::Backend *backend) { + // TODO Write subg_ind into profling result + UNUSED_RELEASE(subg_ind); + std::string backend_id = backend->config()->id(); _collector.onEvent(EventCollector::Event{EventCollector::Edge::END, backend_id, opSequenceTag(op_seq, _graph.operations())}); } -void ChromeTracingObserver::handleEnd(IExecutor *) +void TracingObserver::handleSubgraphEnd(ir::SubgraphIndex subg_ind) { + // TODO Write subg_ind into profling result + UNUSED_RELEASE(subg_ind); + _collector.onEvent(EventCollector::Event{EventCollector::Edge::END, "runtime", "Graph"}); } -std::string ChromeTracingObserver::opSequenceTag(const ir::OpSequence *op_seq, - const ir::Operations &operations) +std::string TracingObserver::opSequenceTag(const ir::OpSequence *op_seq, + const ir::Operations &operations) { if (op_seq->size() == 0) return "Empty OpSequence"; diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h index f8c2acc..a9eebfe 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.h +++ b/runtime/onert/core/src/exec/ExecutionObservers.h @@ -18,12 +18,16 @@ #define __ONERT_EXEC_OBSREVERS_H__ #include "exec/IFunction.h" +#include "ir/Index.h" #include "ir/OpSequence.h" #include "ExecTime.h" #include "util/ITimer.h" #include "exec/IExecutor.h" #include "util/EventCollector.h" #include "util/EventRecorder.h" +#include "util/EventWriter.h" +#include "util/TracingCtx.h" +#include "util/EventWriter.h" namespace onert { @@ -33,13 +37,15 @@ class IExecutionObserver { public: /// @brief Invoked just before model (not individual operation) execution begins - virtual void handleBegin(IExecutor *) { return; } + virtual void handleSubgraphBegin(ir::SubgraphIndex) { return; } - virtual void handleBegin(IExecutor *, const ir::OpSequence *, const backend::Backend *) = 0; - virtual void handleEnd(IExecutor *, const ir::OpSequence *, const backend::Backend *) = 0; + virtual void handleJobBegin(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) = 0; + virtual void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) = 0; /// @brief Invoked just after model (not individual operation) execution ends - virtual void handleEnd(IExecutor *) { return; } + virtual void handleSubgraphEnd(ir::SubgraphIndex) { return; } virtual ~IExecutionObserver() = default; }; @@ -51,10 +57,12 @@ public: : _et(std::move(et)), _graph(graph) { } - void handleBegin(IExecutor *, const ir::OpSequence *, const backend::Backend *) override; - void handleEnd(IExecutor *, const ir::OpSequence *, const backend::Backend *) override; + void handleJobBegin(IExecutor *, 
ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) override; + void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) override; - void handleEnd(IExecutor *) override { _et->uploadOperationsExecTime(); } + void handleSubgraphEnd(ir::SubgraphIndex) override { _et->storeOperationsExecTime(); } private: std::unique_ptr _timer; @@ -62,24 +70,28 @@ private: const ir::Graph &_graph; }; -class ChromeTracingObserver : public IExecutionObserver +class TracingObserver : public IExecutionObserver { public: - ChromeTracingObserver(const std::string &filepath, const ir::Graph &graph); - ~ChromeTracingObserver(); - void handleBegin(IExecutor *) override; - void handleBegin(IExecutor *, const ir::OpSequence *, const backend::Backend *) override; - void handleEnd(IExecutor *, const ir::OpSequence *, const backend::Backend *) override; - void handleEnd(IExecutor *) override; + TracingObserver(const std::string &filepath, const ir::Graph &graph, + const util::TracingCtx *tracing_ctx); + ~TracingObserver(); + void handleSubgraphBegin(ir::SubgraphIndex) override; + void handleJobBegin(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) override; + void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *, + const backend::Backend *) override; + void handleSubgraphEnd(ir::SubgraphIndex) override; private: static std::string opSequenceTag(const ir::OpSequence *op_seq, const ir::Operations &operations); private: - const std::string &_base_filepath; - EventRecorder _recorder; + std::unique_ptr _recorder; EventCollector _collector; const ir::Graph &_graph; + EventWriter *_event_writer; + const util::TracingCtx *_tracing_ctx; }; } // namespace exec diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index 018a0bb..588a325 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -15,11 +15,11 @@ */ #include "ExecutorBase.h" +#include "ShapeConverter.h" -#include "backend/ITensor.h" #include "backend/controlflow/UserTensor.h" -#include "backend/cpu_common/Tensor.h" #include "util/logging.h" +#include "misc/polymorphic_downcast.h" namespace onert { @@ -27,43 +27,27 @@ namespace exec { ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, - const compiler::TensorRegistries &tensor_regs) - : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()}, - _input_tensors{input_tensors}, _output_tensors{output_tensors}, _mutex() + const compiler::TensorRegistries &tensor_regs, + const util::TracingCtx *tracing_ctx) + : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()}, _mutex(), + _tracing_ctx(tracing_ctx) { - // TODO Fix the way of knowing whether it is primary or not - bool primary_executor = !(_input_tensors.empty() && _output_tensors.empty()); - if (!primary_executor) - { - auto build_input_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector list; - for (auto ind : ind_seq) - { - backend::ITensor *tensor = tensor_regs.getITensor(ind); - assert(tensor != nullptr); - list.push_back(tensor); - } - return list; - }; - auto build_output_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector list; - for (auto ind : ind_seq) - { - backend::ITensor *tensor = tensor_regs.getITensor(ind); - assert(tensor != nullptr); - list.push_back(tensor); - 
} - return list; - }; - _input_tensors = build_input_tensor_list(_graph.getInputs()); - _output_tensors = build_output_tensor_list(_graph.getOutputs()); - } + auto build_tensor_list = [&](const auto &ind_seq, auto &tensors) { + assert(tensors.empty()); + for (auto ind : ind_seq) + { + backend::ITensor *tensor = tensor_regs.getITensor(ind); + assert(tensor != nullptr); + auto io_tensor = nnfw::misc::polymorphic_downcast(tensor); + tensors.push_back(io_tensor); + } + }; + build_tensor_list(_graph.getInputs(), _input_tensors); + build_tensor_list(_graph.getOutputs(), _output_tensors); } -void ExecutorBase::execute(const std::vector &src_tensors, - const std::shared_ptr &pre_fn) +void ExecutorBase::execute(const std::vector &inputs, + const std::vector &outputs) { // For thread-safe, use mutex // TODO: if all used backends on this executor are thread-safe, @@ -71,31 +55,37 @@ void ExecutorBase::execute(const std::vector &src_tensors, // Deadlock occurs when an Executor is called recursively. std::lock_guard lock(_mutex); - assert(src_tensors.size() == _graph.getInputs().size()); - assert(src_tensors.size() == _input_tensors.size()); - for (uint32_t n = 0; n < _graph.getInputs().size(); ++n) + assert(inputs.size() == _graph.getInputs().size()); + assert(inputs.size() == _input_tensors.size()); + for (uint32_t n = 0; n < inputs.size(); ++n) { - // when user changes input shape, the input tensor is dynamic and its memory is not allocated. - // This code find the info to allocate dynamic tensor, and allocate memory based on the source - // tensor's shape set by caller. - const auto src_tensor = src_tensors[n]; + const auto input = inputs[n]; + assert(input->buffer() != nullptr); auto input_tensor = _input_tensors[n]; - // If src_tensor or input_tensor is nullptr, pre_fn does not copy the tensors - if (src_tensor != nullptr && input_tensor != nullptr) + assert(input_tensor != nullptr); + if (input != nullptr) { - const auto orig_input_shape = input_tensor->getShape(); + const auto orig_input_shape = input_tensor->orig_info().shape(); const auto changed_input_shape = - convertShape(src_tensor->getShape(), src_tensor->layout(), input_tensor->layout()); + convertShape(input->getShape(), input->layout(), input_tensor->orig_layout()); if (orig_input_shape != changed_input_shape) { input_tensor->set_dynamic(); } } + input_tensor->setTensor(input); } - // TODO Move calling permute_fn.run() into executeImpl() - assert(pre_fn); - pre_fn->run(); + assert(outputs.size() == _graph.getOutputs().size()); + assert(outputs.size() == _output_tensors.size()); + for (uint32_t n = 0; n < outputs.size(); ++n) + { + const auto output = outputs[n]; + // assert(dst_tensor->buffer() != nullptr); + auto output_tensor = _output_tensors[n]; + assert(output_tensor != nullptr); + output_tensor->setTensor(output); + } executeImpl(); } @@ -111,19 +101,19 @@ void ExecutorBase::execute(const IODescription &desc) assert(_input_tensors.size() == desc.inputs.size()); for (uint32_t i = 0; i < _input_tensors.size(); ++i) { - // TODO Remove dynamic_cast - auto *tensor = dynamic_cast(_input_tensors[i]); - assert(tensor); + auto tensor = _input_tensors[i]; + + // TODO Check if (desc.inputs[i] == nullptr) + // TODO Better design for ITensor? 
(we need const_cast as ITensor is writable) + tensor->setUserTensor(static_cast(const_cast(desc.inputs[i]->buffer)), + desc.inputs[i]->size); + auto input_shape = desc.dynamic_input_shapes.find(ir::IOIndex{i}); if (input_shape != desc.dynamic_input_shapes.end()) { tensor->set_dynamic(); tensor->setShape(input_shape->second); } - // TODO Check if (desc.inputs[i] == nullptr) - // TODO Better design for ITensor? (we need const_cast as ITensor is writable) - tensor->setBuffer(static_cast(const_cast(desc.inputs[i]->buffer)), - desc.inputs[i]->size); handleDynamicInputTensor(ir::IOIndex{i}, desc); } @@ -131,13 +121,12 @@ void ExecutorBase::execute(const IODescription &desc) assert(_output_tensors.size() == desc.outputs.size()); for (uint32_t i = 0; i < _output_tensors.size(); ++i) { - // TODO Remove dynamic_cast - auto *tensor = dynamic_cast(_output_tensors[i]); - assert(tensor); - tensor->set_dynamic(); // It can't be resized but shape could change + auto tensor = _output_tensors[i]; + if (desc.outputs[i] == nullptr) throw std::runtime_error{"Output " + std::to_string(i) + "'s buffer is not set."}; - tensor->setBuffer(static_cast(desc.outputs[i]->buffer), desc.outputs[i]->size); + tensor->setUserTensor(static_cast(desc.outputs[i]->buffer), desc.outputs[i]->size); + tensor->set_dynamic(); // It can't be resized but shape could change } executeImpl(); diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h index 8a6ec91..5d95c10 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.h +++ b/runtime/onert/core/src/exec/ExecutorBase.h @@ -17,23 +17,25 @@ #ifndef __ONERT_EXEC_EXECUTOR_BASE_H__ #define __ONERT_EXEC_EXECUTOR_BASE_H__ -#include - #include "IPermuteFunction.h" -#include "exec/ExecutionObservers.h" -#include "ShapeConverter.h" #include "exec/IExecutor.h" -#include "compiler/LoweredGraph.h" -#include "ir/LowerInfoMap.h" -#include "backend/IConfig.h" -#include "backend/Backend.h" #include "exec/ExecTime.h" -#include "exec/IFunction.h" -#include "backend/IDynamicTensorManager.h" -#include "backend/ITensorManager.h" #include "exec/ExecutionObservee.h" +#include "exec/IFunction.h" +#include "exec/IODescription.h" +#include "ir/Graph.h" +#include "ir/Index.h" +#include "ir/LowerInfoMap.h" +#include "ir/OperationIndexMap.h" +#include "compiler/LoweredGraph.h" #include "compiler/TensorRegistries.h" -#include +#include "backend/controlflow/IOTensor.h" +#include "util/TracingCtx.h" + +#include +#include +#include +#include namespace onert { @@ -49,25 +51,17 @@ public: * @param tensor_builders Tensor builders that are currently used */ ExecutorBase(std::unique_ptr &&lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, - const compiler::TensorRegistries &tensor_regs); + const compiler::TensorRegistries &tensor_regs, const util::TracingCtx *tracing_ctx); virtual ~ExecutorBase() = default; const ir::Graph &graph() final { return _graph; } - /** - * @brief Execute without IODescription - * - * @param src_tensor Tensor list that will be copied to input tensors of this - * @param pre_fn The permutation function that copy from src_tensor to input tensors of this - */ - void execute(const std::vector &src_tensors, - const std::shared_ptr &pre_fn); - void execute(const IODescription &desc) final; + void execute(const std::vector &inputs, + const std::vector &outputs) override; + // Used only in Dataflow and Parallel Executors void setIndexedRanks(std::shared_ptr> ranks) final { @@ -78,9 +72,10 @@ public: void 
addObserver(std::unique_ptr ref) { _subject.add(std::move(ref)); }; - const std::vector &getInputTensors() const { return _input_tensors; } - - const std::vector &getOutputTensors() const { return _output_tensors; } + const std::vector &getOutputTensors() const override + { + return _output_tensors; + } protected: /** @@ -93,9 +88,10 @@ protected: std::shared_ptr> _indexed_ranks; std::unique_ptr _lowered_graph; const ir::Graph &_graph; - std::vector _input_tensors; - std::vector _output_tensors; + std::vector _input_tensors; + std::vector _output_tensors; std::mutex _mutex; + const util::TracingCtx *_tracing_ctx; private: void handleDynamicInputTensor(ir::IOIndex input_index, const IODescription &desc); diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h index 11017ed..8f62156 100644 --- a/runtime/onert/core/src/exec/IPermuteFunction.h +++ b/runtime/onert/core/src/exec/IPermuteFunction.h @@ -120,7 +120,8 @@ protected: } assert(src_tensor != dst_tensor); - assert(underlying_type(src_tensor->data_type()) == underlying_type(dst_tensor->data_type())); + if (underlying_type(src_tensor->data_type()) != underlying_type(dst_tensor->data_type())) + throw std::runtime_error("data type does not match"); switch (src_tensor->data_type()) { case ir::DataType::FLOAT32: diff --git a/runtime/onert/core/src/exec/JSONExecTime.cc b/runtime/onert/core/src/exec/JSONExecTime.cc index 72a18de..b29216a 100644 --- a/runtime/onert/core/src/exec/JSONExecTime.cc +++ b/runtime/onert/core/src/exec/JSONExecTime.cc @@ -135,7 +135,7 @@ void JSON::printOperation(const std::map &operation_info, stream.seekp(-2, std::ofstream::end); } -void JSON::uploadOperationsExecTime() const +void JSON::storeOperationsExecTime() const { std::ofstream stream(_measurement_file); if (!stream.is_open()) diff --git a/runtime/onert/core/src/exec/JSONExecTime.h b/runtime/onert/core/src/exec/JSONExecTime.h index a64cb31..8987d72 100644 --- a/runtime/onert/core/src/exec/JSONExecTime.h +++ b/runtime/onert/core/src/exec/JSONExecTime.h @@ -54,18 +54,16 @@ public: loadOperationsExecTime(); }; /** - * @brief Update _operations_exec_time_file with new data. + * @brief Update _measurement_file with new data. */ - void uploadOperationsExecTime() const; + void storeOperationsExecTime() const; private: ///@brief file containing measurements std::string _measurement_file; std::unordered_map _backends; - std::unordered_map< - const backend::Backend *, - std::unordered_map>>> - &_measurements; + MeasurementData &_measurements; + /** * @brief Helper function for inserting data to OperationExecTimes * @@ -86,7 +84,7 @@ private: void printOperation(const std::map &operation_info, std::ofstream &stream) const; /** - * @brief Parse and load operations_exec_time from _operations_exec_time_file. + * @brief Parse and load _measurements from _measurement_file. 
*/ void loadOperationsExecTime(); }; diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc index 6e6ca11..a6d4473 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.cc +++ b/runtime/onert/core/src/exec/LinearExecutor.cc @@ -39,7 +39,9 @@ char *seq_to_label(const onert::ir::OpSequence *op_seq, const onert::ir::Operati void LinearExecutor::executeImpl() { - _subject.notifyModelBegin(this); + auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph); + + _subject.notifySubgraphBegin(profiling_subg_index); for (auto &&code : _code) { const auto op_seq = code.op_seq; @@ -48,7 +50,7 @@ void LinearExecutor::executeImpl() #ifdef RUY_PROFILER ruy::profiler::ScopeLabel label(seq_to_label(op_seq, _graph.operations())); #endif - _subject.notifyJobBegin(this, op_seq, backend); + _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend); auto &fn_seq = code.fn_seq; @@ -58,9 +60,9 @@ void LinearExecutor::executeImpl() fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor); fn_seq->run(); - _subject.notifyJobEnd(this, op_seq, backend); + _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend); } - _subject.notifyModelEnd(this); + _subject.notifySubgraphEnd(profiling_subg_index); } } // namespace exec diff --git a/runtime/onert/core/src/exec/LinearExecutor.h b/runtime/onert/core/src/exec/LinearExecutor.h index 22d00ec..d43c970 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.h +++ b/runtime/onert/core/src/exec/LinearExecutor.h @@ -27,6 +27,7 @@ #include "compiler/Linear.h" #include "exec/FunctionSequence.h" #include "compiler/CodeMap.h" +#include "util/TracingCtx.h" namespace onert { @@ -47,11 +48,9 @@ public: * @param code_map OpSequence and its code map */ LinearExecutor(std::unique_ptr lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map, - const std::vector &order) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs} + const std::vector &order, const util::TracingCtx *tracing_ctx) + : ExecutorBase{std::move(lowered_graph), tensor_regs, tracing_ctx} { for (auto index : order) { diff --git a/runtime/onert/core/src/exec/ParallelExecutor.cc b/runtime/onert/core/src/exec/ParallelExecutor.cc index 676bdb5..e9e576c 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.cc +++ b/runtime/onert/core/src/exec/ParallelExecutor.cc @@ -60,12 +60,10 @@ void ParallelExecutor::notify(uint32_t finished_job_id) } ParallelExecutor::ParallelExecutor(std::unique_ptr lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, const compiler::TensorRegistries &tensor_regs, - compiler::CodeMap &&code_map) - : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(code_map)} + compiler::CodeMap &&code_map, + const util::TracingCtx *tracing_ctx) + : DataflowExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map), tracing_ctx} { VERBOSE(ParallelExecutor) << "Constructing Parallel Executor" << std::endl; } @@ -100,7 +98,10 @@ void ParallelExecutor::executeImpl() VERBOSE(ParallelExecutor) << "INITIAL JOBS : " << _ready_jobs.size() << std::endl; - _subject.notifyModelBegin(this); + auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph); + + _subject.notifySubgraphBegin(profiling_subg_index); + while (true) { std::unique_lock lock{_mu_jobs}; @@ -126,9 +127,11 @@ void 
ParallelExecutor::executeImpl() auto op_sequence_index = _job_to_op_seq[job_index]; auto op_seq = &_lowered_graph->op_seqs().at(op_sequence_index); auto backend = _lowered_graph->getLowerInfo()->op_seq.at(op_sequence_index)->backend(); - auto setup = [&, op_seq, backend]() { _subject.notifyJobBegin(this, op_seq, backend); }; + auto setup = [&, op_seq, backend]() { + _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend); + }; auto teardown = [&, job_index, op_seq, backend]() { - _subject.notifyJobEnd(this, op_seq, backend); + _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend); notify(job_index); }; @@ -146,7 +149,7 @@ void ParallelExecutor::executeImpl() // Wait for all the jobs done _scheduler->finish(); - _subject.notifyModelEnd(this); + _subject.notifySubgraphEnd(profiling_subg_index); // Reset input info for the next execution _input_info = _initial_input_info; diff --git a/runtime/onert/core/src/exec/ParallelExecutor.h b/runtime/onert/core/src/exec/ParallelExecutor.h index 111c20c..fd9db42 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.h +++ b/runtime/onert/core/src/exec/ParallelExecutor.h @@ -28,6 +28,7 @@ #include #include "exec/DataflowExecutor.h" #include "ParallelScheduler.h" +#include "util/TracingCtx.h" namespace onert { @@ -51,9 +52,8 @@ public: * @param code_map OpSequence and its code map */ ParallelExecutor(std::unique_ptr lowered_graph, - const std::vector &input_tensors, - const std::vector &output_tensors, - const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map, + const util::TracingCtx *tracing_ctx); void executeImpl() override; diff --git a/runtime/onert/core/src/interp/InterpExecutor.h b/runtime/onert/core/src/interp/InterpExecutor.h index 2e3f3ca..99d7b3a 100644 --- a/runtime/onert/core/src/interp/InterpExecutor.h +++ b/runtime/onert/core/src/interp/InterpExecutor.h @@ -58,6 +58,15 @@ public: * @note It should be called after setting input and output buffer */ void execute(const exec::IODescription &desc) final; + void execute(const std::vector &, + const std::vector &) final + { + throw new std::runtime_error{"Interpreter does not support subgraph calls(control flow ops)"}; + } + const std::vector &getOutputTensors() const final + { + throw new std::runtime_error{"Interpreter does not support this function."}; + } private: const ir::Graph &_graph; diff --git a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc index 0473855..e1fb767 100644 --- a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc +++ b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc @@ -116,7 +116,7 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); nnfw::cker::DepthwiseConv(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr); + cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr, nullptr); } void invokeDepthwiseConv(const ExecEnv *env, const ir::Operation &node) diff --git a/runtime/onert/core/src/ir/DataType.cc b/runtime/onert/core/src/ir/DataType.cc index 9eedcd2..8e75c4f 100644 --- a/runtime/onert/core/src/ir/DataType.cc +++ b/runtime/onert/core/src/ir/DataType.cc @@ -42,6 +42,7 @@ size_t sizeOfDataType(DataType data_type) return sizeof(uint8_t); case DataType::QUANT_INT8_SYMM: case DataType::QUANT_INT8_ASYMM: + case 
DataType::QUANT_INT8_SYMM_PER_CHANNEL: return sizeof(int8_t); case DataType::FLOAT16: return sizeof(float16); diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc index eecfe81..a8578b4 100644 --- a/runtime/onert/core/src/ir/OperationDumper.cc +++ b/runtime/onert/core/src/ir/OperationDumper.cc @@ -72,7 +72,14 @@ OperationDumper::OperationDumper(const std::string &start_msg) VERBOSE(LIR) << start_msg << std::endl; } -void OperationDumper::visit(const ArgMax &node) { dumpBinaryInputOp(node); } +void OperationDumper::visit(const ArgMinMax &node) +{ + std::string min_max = node.param().is_arg_max ? "(Max)" : "(Min)"; + VERBOSE(LIR) << "* " << node.name() << min_max << std::endl; + VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(ArgMinMax::INPUT) << ") Axis(" + << node.getInputs().at(ArgMinMax::AXIS) << ") " << std::endl; + VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl; +} void OperationDumper::visit(const BatchToSpaceND &node) { @@ -159,6 +166,14 @@ void OperationDumper::visit(const ExpandDims &node) dumpUnaryInputOp(node, axis); } +void OperationDumper::visit(const Fill &node) +{ + VERBOSE(LIR) << "* " << node.name() << std::endl; + VERBOSE(LIR) << " - Inputs : Shape(" << node.getInputs().at(Fill::Input::SHAPE) << ") Value(" + << node.getInputs().at(Fill::Input::VALUE) << ")" << std::endl; + VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl; +} + void OperationDumper::visit(const FullyConnected &node) { std::string inputs = @@ -505,7 +520,7 @@ void OperationDumper::visit(const While &node) } VERBOSE(LIR) << " - Inputs : " << "Cond subgraph (" << node.param().cond_subg_index << ") Body subgraph (" - << node.param().cond_subg_index << ") Inputs(" << inputs << ")" << std::endl; + << node.param().body_subg_index << ") Inputs(" << inputs << ")" << std::endl; std::string outputs; const auto &output_indices = node.getOutputs(); for (auto it = std::begin(output_indices); it != std::end(output_indices); ++it) diff --git a/runtime/onert/core/src/ir/OperationDumper.h b/runtime/onert/core/src/ir/OperationDumper.h index 91642ab..fe18307 100644 --- a/runtime/onert/core/src/ir/OperationDumper.h +++ b/runtime/onert/core/src/ir/OperationDumper.h @@ -31,7 +31,7 @@ public: OperationDumper(const std::string &start_msg); public: - void visit(const operation::ArgMax &) override; + void visit(const operation::ArgMinMax &) override; void visit(const operation::BatchToSpaceND &node) override; void visit(const operation::BCQFullyConnected &node) override; void visit(const operation::BinaryArithmetic &node) override; @@ -48,6 +48,7 @@ public: void visit(const operation::ElementwiseUnary &) override; void visit(const operation::EmbeddingLookup &) override; void visit(const operation::ExpandDims &) override; + void visit(const operation::Fill &) override; void visit(const operation::FullyConnected &node) override; void visit(const operation::Gather &) override; void visit(const operation::HashtableLookup &) override; diff --git a/runtime/onert/core/src/ir/OperationValidator.cc b/runtime/onert/core/src/ir/OperationValidator.cc index da08e81..6f81c2a 100644 --- a/runtime/onert/core/src/ir/OperationValidator.cc +++ b/runtime/onert/core/src/ir/OperationValidator.cc @@ -55,6 +55,17 @@ bool OperationValidator::isSameType(const OperandIndex &idx1, const OperandIndex return operandType(idx1) == operandType(idx2); } +bool OperationValidator::isSameQuantParam(const OperandIndex &idx1, const 
OperandIndex &idx2) +{ + if (_operands.at(idx1).typeInfo().scale() != _operands.at(idx2).typeInfo().scale()) + return false; + + if (_operands.at(idx1).typeInfo().offset() != _operands.at(idx2).typeInfo().offset()) + return false; + + return true; +} + bool OperationValidator::isValidType(const OperandIndex &idx, const DataType &type) { return operandType(idx) == type; @@ -76,29 +87,54 @@ bool OperationValidator::isValidType(const OperandIndex &idx, void OperationValidator::visit(const operation::AddN &node) { + const auto output_index(node.getOutputs().at(0)); + int size = node.getInputs().size(); for (int i = 0; i < size; i++) { const auto input_index(node.getInputs().at(i)); OP_REQUIRES(isValidType(input_index, {DataType::FLOAT32, DataType::INT32})); + OP_REQUIRES(isSameType(input_index, output_index)); } } +void OperationValidator::visit(const operation::ArgMinMax &node) +{ + const auto input_index(node.getInputs().at(operation::ArgMinMax::Input::INPUT)); + const auto axis_index(node.getInputs().at(operation::ArgMinMax::Input::AXIS)); + const auto output_index(node.getOutputs().at(0)); + const auto output_type = node.param().output_type; + + OP_REQUIRES(isValidType(input_index, {DataType::FLOAT32, DataType::INT32, DataType::UINT8, + DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM})); + OP_REQUIRES(isValidType(axis_index, {DataType::INT32, DataType::INT64})); + OP_REQUIRES(isValidType(output_index, {DataType::INT32, DataType::INT64})); + OP_REQUIRES(isValidType(output_index, output_type)); +} + void OperationValidator::visit(const operation::BatchMatMul &node) { const auto lhs_index(node.getInputs().at(operation::BatchMatMul::Input::LHS)); const auto rhs_index(node.getInputs().at(operation::BatchMatMul::Input::RHS)); + const auto output_index(node.getOutputs().at(0)); // Constant lhs and rhs is not implemented yet OP_REQUIRES(!isConstant(lhs_index) && !isConstant(rhs_index)); + + // Allow hybrid quantization (lhs: float / rhs: qint8 / out: float) + OP_REQUIRES(isValidType(lhs_index, {DataType::FLOAT32, DataType::QUANT_INT8_ASYMM})); + OP_REQUIRES(isSameType(lhs_index, rhs_index) || + ((operandType(lhs_index) == DataType::FLOAT32) && + (operandType(rhs_index) == DataType::QUANT_INT8_ASYMM))); + OP_REQUIRES(isSameType(lhs_index, output_index)); } void OperationValidator::visit(const operation::BatchToSpaceND &node) { - const auto block_size_index{node.getInputs().at(operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + const auto input_index{node.getInputs().at(operation::BatchToSpaceND::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; - // Non-constant block_size is not implemented yet - OP_REQUIRES(isConstant(block_size_index)); + OP_REQUIRES(isSameType(input_index, output_index)); } void OperationValidator::visit(const operation::BinaryArithmetic &node) @@ -122,10 +158,48 @@ void OperationValidator::visit(const operation::Comparison &node) OP_REQUIRES(isValidType(output_index, DataType::BOOL8)); } +void OperationValidator::visit(const operation::Concat &node) +{ + const auto output_index{node.getOutputs().at(0)}; + + for (auto input_index : node.getInputs()) + { + OP_REQUIRES(isSameType(input_index, output_index)); + + // Int8 quantization requires same scale and zero point + if (isValidType(output_index, DataType::QUANT_INT8_ASYMM)) + { + OP_REQUIRES(isSameQuantParam(input_index, output_index)); + } + } +} + +void OperationValidator::visit(const operation::Conv2D &node) +{ + const auto input_index{node.getInputs().at(operation::Conv2D::Input::INPUT)}; + const 
auto output_index{node.getOutputs().at(0)}; + + uint32_t stride_horizontal = node.param().stride.horizontal; + uint32_t stride_vertical = node.param().stride.vertical; + uint32_t dilation_width = node.param().dilation.width_factor; + uint32_t dilation_height = node.param().dilation.height_factor; + + OP_REQUIRES((stride_horizontal > 0) && (stride_vertical > 0)); + OP_REQUIRES((dilation_width > 0) && (dilation_height > 0)); + OP_REQUIRES(isSameType(input_index, output_index)); +} + void OperationValidator::visit(const operation::DepthToSpace &node) { + const auto input_index{node.getInputs().at(operation::DepthToSpace::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + int32_t block_size = node.param().block_size; + OP_REQUIRES(isValidType(input_index, {DataType::FLOAT32, DataType::INT32, DataType::INT64, + DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM})); + OP_REQUIRES(isSameType(input_index, output_index)); + OP_REQUIRES(block_size > 0); } @@ -151,6 +225,32 @@ void OperationValidator::visit(const operation::ElementwiseActivation &node) // Check if I/O types match OP_REQUIRES(isSameType(output_index, input_index)); + + switch (node.param().op_type) + { + case operation::ElementwiseActivation::Type::ELU: + OP_REQUIRES(isValidType(input_index, DataType::FLOAT32)); + break; + case operation::ElementwiseActivation::Type::LEAKY_RELU: + OP_REQUIRES( + isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, + DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM})); + break; + case operation::ElementwiseActivation::Type::LOGISTIC: + OP_REQUIRES( + isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, + DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM})); + break; + case operation::ElementwiseActivation::Type::RELU: + OP_REQUIRES(isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, + DataType::QUANT_INT8_ASYMM})); + break; + case operation::ElementwiseActivation::Type::TANH: + OP_REQUIRES( + isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, + DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM})); + break; + } } void OperationValidator::visit(const operation::ElementwiseBinary &node) @@ -161,6 +261,13 @@ void OperationValidator::visit(const operation::ElementwiseBinary &node) OP_REQUIRES(isSameType(lhs_index, rhs_index)); OP_REQUIRES(isSameType(lhs_index, output_index)); + + const auto op_type = node.param().op_type; + if (op_type == operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND || + op_type == operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR) + { + OP_REQUIRES(isValidType(lhs_index, DataType::BOOL8)); + } } void OperationValidator::visit(const operation::ElementwiseUnary &node) @@ -195,8 +302,17 @@ void OperationValidator::visit(const operation::ElementwiseUnary &node) void OperationValidator::visit(const operation::EmbeddingLookup &node) { const auto lookups_index{node.getInputs().at(operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(operation::EmbeddingLookup::Input::VALUES)}; + const auto output_index{node.getOutputs().at(0)}; OP_REQUIRES(isValidType(lookups_index, DataType::INT32)); + + // TFLite: Allow hybrid type - value table & output + // NNAPI: Require same value table and output type + OP_REQUIRES( + isSameType(values_index, output_index) || + (isValidType(output_index, DataType::FLOAT32) && + (isValidType(values_index, {DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT8_SYMM})))); } void 
OperationValidator::visit(const operation::ExpandDims &node) @@ -206,7 +322,19 @@ void OperationValidator::visit(const operation::ExpandDims &node) const auto axis_index{node.getInputs().at(operation::ExpandDims::Input::AXIS)}; OP_REQUIRES(isSameType(output_index, input_index)); - OP_REQUIRES(isValidType(axis_index, DataType::INT32)); + OP_REQUIRES(isValidType(axis_index, {DataType::INT32, DataType::INT64})); +} + +void OperationValidator::visit(const operation::Fill &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(operation::Fill::Input::SHAPE)}; + const auto value_index{node.getInputs().at(operation::Fill::Input::VALUE)}; + + OP_REQUIRES(isSameType(output_index, value_index)); + OP_REQUIRES(isValidType(input_index, {DataType::INT32, DataType::INT64})); + OP_REQUIRES(isValidType(output_index, + {DataType::FLOAT32, DataType::INT32, DataType::INT64, DataType::BOOL8})); } void OperationValidator::visit(const operation::HashtableLookup &node) diff --git a/runtime/onert/core/src/ir/OperationValidator.h b/runtime/onert/core/src/ir/OperationValidator.h index 2ea8000..5b95b16 100644 --- a/runtime/onert/core/src/ir/OperationValidator.h +++ b/runtime/onert/core/src/ir/OperationValidator.h @@ -44,10 +44,13 @@ public: public: void visit(const operation::AddN &node) override; + void visit(const operation::ArgMinMax &node) override; void visit(const operation::BatchMatMul &node) override; void visit(const operation::BatchToSpaceND &node) override; void visit(const operation::BinaryArithmetic &node) override; void visit(const operation::Comparison &node) override; + void visit(const operation::Concat &node) override; + void visit(const operation::Conv2D &node) override; void visit(const operation::DepthToSpace &node) override; void visit(const operation::DepthwiseConv2D &node) override; void visit(const operation::ElementwiseActivation &node) override; @@ -55,6 +58,7 @@ public: void visit(const operation::ElementwiseUnary &node) override; void visit(const operation::EmbeddingLookup &node) override; void visit(const operation::ExpandDims &node) override; + void visit(const operation::Fill &node) override; void visit(const operation::HashtableLookup &node) override; void visit(const operation::Pack &node) override; void visit(const operation::Pad &node) override; @@ -76,6 +80,7 @@ private: DataType operandType(const OperandIndex &idx); bool isConstant(const OperandIndex &idx); bool isSameType(const OperandIndex &idx1, const OperandIndex &idx2); + bool isSameQuantParam(const OperandIndex &idx1, const OperandIndex &idx2); bool isValidType(const OperandIndex &idx, const DataType &type); bool isValidType(const OperandIndex &idx, std::initializer_list valid_types); diff --git a/runtime/onert/core/src/ir/operation/ArgMax.cc b/runtime/onert/core/src/ir/operation/ArgMinMax.cc similarity index 78% rename from runtime/onert/core/src/ir/operation/ArgMax.cc rename to runtime/onert/core/src/ir/operation/ArgMinMax.cc index f3bd8fd..989d905 100644 --- a/runtime/onert/core/src/ir/operation/ArgMax.cc +++ b/runtime/onert/core/src/ir/operation/ArgMinMax.cc @@ -14,10 +14,7 @@ * limitations under the License. 
*/ -#include "ir/operation/ArgMax.h" - -#include - +#include "ir/operation/ArgMinMax.h" #include "ir/OperationVisitor.h" namespace onert @@ -27,10 +24,10 @@ namespace ir namespace operation { -void ArgMax::accept(OperationVisitor &v) const { v.visit(*this); } +void ArgMinMax::accept(OperationVisitor &v) const { v.visit(*this); } -ArgMax::ArgMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, - const Param ¶m) +ArgMinMax::ArgMinMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, + const Param ¶m) : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc index 6a0be7e..20b6fa1 100644 --- a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc +++ b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc @@ -57,7 +57,7 @@ std::string ElementwiseUnary::name() const {ElementwiseUnaryType::RSQRT, std::string{"RSqrt"}}, {ElementwiseUnaryType::SIN, std::string{"Sin"}}, {ElementwiseUnaryType::SQRT, std::string{"Sqrt"}}, - {ElementwiseUnaryType::SQURE, std::string{"Squre"}}, + {ElementwiseUnaryType::SQUARE, std::string{"Square"}}, {ElementwiseUnaryType::ZEROS_LIKE, std::string{"ZerosLike"}}}; return name_map.at(_param.op_type); } diff --git a/runtime/onert/core/src/util/ConfigSource.cc b/runtime/onert/core/src/util/ConfigSource.cc index 45cce66..9da93f6 100644 --- a/runtime/onert/core/src/util/ConfigSource.cc +++ b/runtime/onert/core/src/util/ConfigSource.cc @@ -30,8 +30,10 @@ namespace util { static std::unique_ptr _source; +static std::unique_ptr _source_ext; void config_source(std::unique_ptr &&source) { _source = std::move(source); } +void config_source_ext(std::unique_ptr &&source) { _source_ext = std::move(source); } static IConfigSource *config_source() { @@ -67,6 +69,15 @@ static std::string getConfigOrDefault(const std::string &key) auto ret = config_source()->get(key); if (ret.empty()) { + // if env is not set, search from external + if (_source_ext.get()) + { + ret = _source_ext.get()->get(key); + } + } + // if not found search from defaults + if (ret.empty()) + { auto itr = defaults.find(key); if (itr != defaults.end()) { diff --git a/runtime/onert/core/src/util/EventCollector.cc b/runtime/onert/core/src/util/EventCollector.cc index de37276..fd56187 100644 --- a/runtime/onert/core/src/util/EventCollector.cc +++ b/runtime/onert/core/src/util/EventCollector.cc @@ -38,15 +38,17 @@ class DurationEventBuilder public: DurationEventBuilder(const std::string &ts) : _ts{ts} {} - DurationEvent build(const std::string &tid, const std::string &name, const std::string &ph) const + DurationEvent build(const EventCollector::Event &evt_collected, const std::string &ph) const { DurationEvent evt; - evt.name = name; - evt.tid = tid; + evt.name = evt_collected.label; + evt.tid = evt_collected.backend; evt.ph = ph; evt.ts = _ts; + evt.args = evt_collected.userData; + return evt; } @@ -93,11 +95,11 @@ void EventCollector::onEvent(const Event &event) switch (event.edge) { case Edge::BEGIN: - _rec->emit(DurationEventBuilder(ts).build(event.backend, event.label, "B")); + _rec->emit(DurationEventBuilder(ts).build(event, "B")); break; case Edge::END: - _rec->emit(DurationEventBuilder(ts).build(event.backend, event.label, "E")); + _rec->emit(DurationEventBuilder(ts).build(event, "E")); break; } diff --git a/runtime/onert/core/src/util/EventCollector.h b/runtime/onert/core/src/util/EventCollector.h index 
8154be5..7daa485 100644 --- a/runtime/onert/core/src/util/EventCollector.h +++ b/runtime/onert/core/src/util/EventCollector.h @@ -19,6 +19,10 @@ #include "util/EventRecorder.h" +#include +#include +#include + class EventCollector { public: @@ -31,8 +35,24 @@ public: struct Event { Edge edge; + uint32_t session_index; + uint32_t subg_index; std::string backend; + uint32_t op_index; + std::string op_name; + uint32_t op_seq_size; // if this event is for an operation sequence of multiple operations + + // TODO Deprecate this. The label can differ by writer, so let the writer decide the label. std::string label; + + // user-defined data: pairs of (key, value) + std::vector> userData; + + Event(Edge a_edge, const std::string &a_backend, const std::string &a_label) + : edge(a_edge), session_index(0), subg_index(0), backend(a_backend), op_index(0), + op_seq_size(0), label(a_label) + { /* empty */ + } }; public: diff --git a/runtime/onert/core/src/util/EventCollectorGlobal.cc b/runtime/onert/core/src/util/EventCollectorGlobal.cc deleted file mode 100644 index 6c03a5b..0000000 --- a/runtime/onert/core/src/util/EventCollectorGlobal.cc +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "util/EventCollectorGlobal.h" - -#include -#include -#include - -#include "util/ConfigSource.h" -#include "util/EventWriter.h" - -namespace onert -{ -namespace util -{ - -EventCollectorGlobal::EventCollectorGlobal() : _recorder{}, _collector{&_recorder} -{ - // DO NOTHING -} - -EventCollectorGlobal::~EventCollectorGlobal() -{ - if (!_recorder.empty()) - { - try - { - // TODO Need better way for saved file path than the hardcoded path - EventWriter{_recorder}.writeToFile("trace.global.json", - EventWriter::WriteFormat::CHROME_TRACING); - } - catch (const std::exception &e) - { - std::cerr << "E: Fail to record event in EventCollectorGlobal: " << e.what() << std::endl; - } - } -} - -EventCollectorGlobal &EventCollectorGlobal::get() -{ - static EventCollectorGlobal instance; - return instance; -} - -EventDurationBlock::EventDurationBlock(const std::string &tag) : _tag{tag} -{ - auto &glob = EventCollectorGlobal::get(); - glob.collector().onEvent(EventCollector::Event{EventCollector::Edge::BEGIN, "0", _tag}); -} -EventDurationBlock::~EventDurationBlock() -{ - auto &glob = EventCollectorGlobal::get(); - glob.collector().onEvent(EventCollector::Event{EventCollector::Edge::END, "0", _tag}); -} - -EventDurationManual::EventDurationManual(const std::string &tag) : _tag{tag}, _pair{true} {} - -EventDurationManual::~EventDurationManual() -{ - // Check if it has called begin-end pair - assert(_pair); -} - -void EventDurationManual::begin() -{ - _pair = false; - auto &glob = EventCollectorGlobal::get(); - glob.collector().onEvent(EventCollector::Event{EventCollector::Edge::BEGIN, "0", _tag}); -} - -void EventDurationManual::end() -{ - assert(!_pair); - _pair = true; - auto &glob = EventCollectorGlobal::get(); - glob.collector().onEvent(EventCollector::Event{EventCollector::Edge::END, "0", _tag}); -} - -} // namespace util -} // namespace onert diff --git a/runtime/onert/core/src/util/EventCollectorGlobal.h b/runtime/onert/core/src/util/EventCollectorGlobal.h deleted file mode 100644 index 1027ec8..0000000 --- a/runtime/onert/core/src/util/EventCollectorGlobal.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_UTIL_EVENT_COLLECTOR_GLOBAL_H__ -#define __ONERT_UTIL_EVENT_COLLECTOR_GLOBAL_H__ - -#include "util/EventRecorder.h" -#include "util/EventCollector.h" - -namespace onert -{ -namespace util -{ - -/** - * @brief Singleton class for event collection from anywhere in code - * - */ -class EventCollectorGlobal -{ -public: - /** - * @brief Get the singleton object of this class - * - * @return EventCollectorGlobal& Singleton object - */ - static EventCollectorGlobal &get(); - -public: - /** - * @brief Getter for event collector object - * - * @return EventCollector& Collector object - */ - EventCollector &collector() { return _collector; } - -private: - EventCollectorGlobal(); - ~EventCollectorGlobal(); - -private: - EventRecorder _recorder; - EventCollector _collector; -}; - -/** - * @brief Helper class for emitting duration event which is handled automatically with ctor/dtor - * - */ -class EventDurationBlock -{ -public: - /** - * @brief Raise a duration event with type of BEGIN - * - * @param tag A label for the duration event - */ - EventDurationBlock(const std::string &tag); - /** - * @brief Raise a duration event with type of END - * - */ - ~EventDurationBlock(); - -private: - std::string _tag; -}; - -/** - * @brief Helper class for emitting duration event which is handled manually - * - * Usage: - * { - * ... - * EventDurationManual duration("some tag"); - * duration.begin(); - * ... - * ... // Code for duration - * ... - * duration.end(); - * } - * - */ -class EventDurationManual -{ -public: - /** - * @brief Construct a new Event Duration Manual object - * - * @param tag A label for the duration object - */ - EventDurationManual(const std::string &tag); - /** - * @brief Destroy the Event Duration Manual object - * - */ - ~EventDurationManual(); - - /** - * @brief Raise a duration event with type of BEGIN - * - */ - void begin(); - /** - * @brief Raise a duration event with type of END - * - */ - void end(); - -private: - std::string _tag; - bool _pair; -}; - -} // namespace util -} // namespace onert - -/** - * Helper Macro Definitions - * - * HOW TO USE - * - * void f(args) - * { - * EVENT_DURATION_FUNCTION(); - * ... - * if(cond) - * { - * EVENT_DURATION_REGION("if branch"); - * ... - * } - * ... 
- * } - */ - -#define EVENT_DURATION_FUNCTION() \ - ::onert::util::EventDurationBlock __event_duration__##__LINE__ { __FUNCTION__ } - -#define EVENT_DURATION_REGION(tag) \ - ::onert::util::EventDurationBlock __event_duration__##__LINE__ { tag } - -#endif // __ONERT_UTIL_EVENT_COLLECTOR_GLOBAL_H__ diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h index 7af4c7d..3ed4087 100644 --- a/runtime/onert/core/src/util/EventRecorder.h +++ b/runtime/onert/core/src/util/EventRecorder.h @@ -27,8 +27,9 @@ struct Event { std::string name; std::string tid; - std::string ph; /* REQUIRED */ - std::string ts; /* REQUIRED */ + std::string ph; /* REQUIRED */ + std::string ts; /* REQUIRED */ + std::vector> args; // user-defined data: pairs of (key, value) }; struct DurationEvent : public Event diff --git a/runtime/onert/core/src/util/EventWriter.cc b/runtime/onert/core/src/util/EventWriter.cc index dacb40e..8760a16 100644 --- a/runtime/onert/core/src/util/EventWriter.cc +++ b/runtime/onert/core/src/util/EventWriter.cc @@ -89,6 +89,7 @@ void fill(Content &content, const Event &evt) content.flds.emplace_back("tid", evt.tid); content.flds.emplace_back("ph", evt.ph); content.flds.emplace_back("ts", evt.ts); + content.args = evt.args; } std::string object(const DurationEvent &evt) @@ -418,40 +419,7 @@ struct MDTableBuilder } // namespace -EventWriter::EventWriter(const EventRecorder &recorder) : _recorder(recorder) -{ - // DO NOTHING -} - -void EventWriter::writeToFiles(const std::string &base_filepath) -{ - // Note. According to an internal issue, let snpe json as just file name not '.snpe.json' - writeToFile(base_filepath, WriteFormat::SNPE_BENCHMARK); - writeToFile(base_filepath + ".chrome.json", WriteFormat::CHROME_TRACING); - writeToFile(base_filepath + ".table.md", WriteFormat::MD_TABLE); -} - -void EventWriter::writeToFile(const std::string &filepath, WriteFormat write_format) -{ - std::ofstream os{filepath, std::ofstream::out}; - switch (write_format) - { - case WriteFormat::CHROME_TRACING: - writeChromeTrace(os); - break; - case WriteFormat::SNPE_BENCHMARK: - writeSNPEBenchmark(os); - break; - case WriteFormat::MD_TABLE: - writeMDTable(os); - break; - default: - assert(!"Invalid value"); - break; - } -} - -void EventWriter::writeSNPEBenchmark(std::ostream &os) +void SNPEWriter::flush(const std::vector> &recorders) { Json::Value root; auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue}; @@ -475,11 +443,14 @@ void EventWriter::writeSNPEBenchmark(std::ostream &os) // Memory { std::unordered_map mem_stats; - for (auto &evt : _recorder.counter_events()) + for (auto &recorder : recorders) { - auto &mem_stat = mem_stats[evt.name]; - uint64_t val = std::stoull(evt.values.at("value")); - mem_stat.accumulate(val); + for (auto &evt : recorder->counter_events()) + { + auto &mem_stat = mem_stats[evt.name]; + uint64_t val = std::stoull(evt.values.at("value")); + mem_stat.accumulate(val); + } } auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; @@ -501,26 +472,29 @@ void EventWriter::writeSNPEBenchmark(std::ostream &os) // 2D keys : stats[tid][name] std::unordered_map> stats; std::unordered_map> begin_timestamps; - for (auto &evt : _recorder.duration_events()) + for (auto &recorder : recorders) { - auto &stat = stats[evt.tid][evt.name]; - auto &begin_ts = begin_timestamps[evt.tid][evt.name]; - uint64_t timestamp = std::stoull(evt.ts); - if (evt.ph == "B") + for (auto &evt : recorder->duration_events()) { - if (begin_ts != 0) - throw 
std::runtime_error{"Invalid Data"}; - begin_ts = timestamp; - } - else if (evt.ph == "E") - { - if (begin_ts == 0 || timestamp < begin_ts) - throw std::runtime_error{"Invalid Data"}; - stat.accumulate(timestamp - begin_ts); - begin_ts = 0; + auto &stat = stats[evt.tid][evt.name]; + auto &begin_ts = begin_timestamps[evt.tid][evt.name]; + uint64_t timestamp = std::stoull(evt.ts); + if (evt.ph == "B") + { + if (begin_ts != 0) + throw std::runtime_error{"Invalid Data"}; + begin_ts = timestamp; + } + else if (evt.ph == "E") + { + if (begin_ts == 0 || timestamp < begin_ts) + throw std::runtime_error{"Invalid Data"}; + stat.accumulate(timestamp - begin_ts); + begin_ts = 0; + } + else + throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; } - else - throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; } for (auto &kv : begin_timestamps) @@ -545,30 +519,71 @@ void EventWriter::writeSNPEBenchmark(std::ostream &os) } } - os << root; + _os << root; } -void EventWriter::writeChromeTrace(std::ostream &os) +void ChromeTracingWriter::flush(const std::vector> &recorders) { - os << "{\n"; - os << " " << quote("traceEvents") << ": [\n"; + _os << "{\n"; + _os << " " << quote("traceEvents") << ": [\n"; - for (auto &evt : _recorder.duration_events()) + for (auto &recorder : recorders) { - os << " " << object(evt) << ",\n"; + flushOneRecord(*recorder); } - for (auto &evt : _recorder.counter_events()) + _os << " { }\n"; + _os << " ]\n"; + _os << "}\n"; +} + +void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder) +{ + for (auto &evt : recorder.duration_events()) { - os << " " << object(evt) << ",\n"; + _os << " " << object(evt) << ",\n"; } - os << " { }\n"; - os << " ]\n"; - os << "}\n"; + for (auto &evt : recorder.counter_events()) + { + _os << " " << object(evt) << ",\n"; + } } -void EventWriter::writeMDTable(std::ostream &os) +void MDTableWriter::flush(const std::vector> &records) +{ + for (auto &recorder : records) + { + MDTableBuilder(recorder->duration_events(), recorder->counter_events()).build().write(_os); + } +} + +// initialization +std::mutex EventWriter::_mutex; + +void EventWriter::readyToFlush(std::unique_ptr &&recorder) { - MDTableBuilder(_recorder.duration_events(), _recorder.counter_events()).build().write(os); + { + std::unique_lock lock{_mutex}; + + _recorders.emplace_back(std::move(recorder)); + + if (--_ref_count > 0) + return; + } + // The caller of this method is the last instance that uses EventWriter. + // Let's write log files. + + // Note. 
According to an internal issue, write the SNPE JSON as just the file name, not '.snpe.json' + flush(WriteFormat::SNPE_BENCHMARK); + flush(WriteFormat::CHROME_TRACING); + flush(WriteFormat::MD_TABLE); +} + +void EventWriter::flush(WriteFormat write_format) +{ + auto *writer = _actual_writers[write_format].get(); + assert(writer); + + writer->flush(_recorders); } diff --git a/runtime/onert/core/src/util/EventWriter.h b/runtime/onert/core/src/util/EventWriter.h index 7e838ca..0dcd00b 100644 --- a/runtime/onert/core/src/util/EventWriter.h +++ b/runtime/onert/core/src/util/EventWriter.h @@ -20,7 +20,49 @@ #include "EventRecorder.h" #include -#include +#include +#include +#include +#include + +class EventFormatWriter +{ +public: + EventFormatWriter(const std::string &filepath) : _os{filepath, std::ofstream::out} {} + virtual ~EventFormatWriter() { /* empty */} + + virtual void flush(const std::vector> &) = 0; + +protected: + std::ofstream _os; +}; + +class SNPEWriter : public EventFormatWriter +{ +public: + SNPEWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */} + void flush(const std::vector> &) override; +}; + +class ChromeTracingWriter : public EventFormatWriter +{ +public: + ChromeTracingWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */} + void flush(const std::vector> &) override; + +private: + void flushOneRecord(const EventRecorder &); +}; + +class MDTableWriter : public EventFormatWriter +{ +public: + MDTableWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */} + void flush(const std::vector> &) override; + +private: + void flushOneRecord(const EventRecorder &); +}; class EventWriter { @@ -32,20 +74,58 @@ public: MD_TABLE, }; -public: - EventWriter(const EventRecorder &recorder); + /** + * @brief Returns a singleton object + */ + static EventWriter *get(const std::string &filename) + { + std::unique_lock lock{_mutex}; -public: - void writeToFiles(const std::string &base_filepath); - void writeToFile(const std::string &filepath, WriteFormat write_format); + static EventWriter singleton(filename); + return &singleton; + } + + /** + * @brief Call this when an observer that uses EventWriter starts + */ + void startToUse() + { + std::unique_lock lock{_mutex}; + _ref_count++; + } + + /** + * @brief Call this when an observer that uses EventWriter finishes. + * After all observers have called this method, the reference count eventually reaches 0. + * Then, EventWriter writes the profiling result files. 
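+ *
+ * A minimal usage sketch (added for illustration, not part of the upstream header; the
+ * observer flow and the "trace" base file name below are assumptions, while the EventWriter
+ * and EventRecorder calls are the ones declared in this codebase):
+ *
+ *   auto *writer = EventWriter::get("trace");          // shared writer for one trace base name
+ *   writer->startToUse();                              // each observer bumps the reference count
+ *   auto recorder = std::make_unique<EventRecorder>(); // one recorder per executor/observer
+ *   // ... events are emitted into the recorder while the executor runs ...
+ *   writer->readyToFlush(std::move(recorder));         // the last observer triggers the file writes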
+ */ + void readyToFlush(std::unique_ptr &&recorder); private: - void writeSNPEBenchmark(std::ostream &os); - void writeChromeTrace(std::ostream &os); - void writeMDTable(std::ostream &os); + EventWriter(const std::string &filepath) : _ref_count(0) + { + std::string snpe_log_name(filepath); + std::string chrome_tracing_log_name(filepath + ".chrome.json"); + std::string md_table_log_name(filepath + ".table.md"); + + _actual_writers[WriteFormat::SNPE_BENCHMARK] = std::make_unique(snpe_log_name); + _actual_writers[WriteFormat::CHROME_TRACING] = + std::make_unique(chrome_tracing_log_name); + _actual_writers[WriteFormat::MD_TABLE] = std::make_unique(md_table_log_name); + }; + + void flush(WriteFormat write_format); private: - const EventRecorder &_recorder; + static std::mutex _mutex; + + // number of observer of an executor that want to write profiling data + int32_t _ref_count; + + // one recorder object per executor + std::vector> _recorders; + + std::unordered_map> _actual_writers; }; #endif // __ONERT_UTIL_EVENT_WRITER_H__ diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc index 1f468a8..3ed3080 100644 --- a/runtime/onert/core/src/util/ShapeInference.cc +++ b/runtime/onert/core/src/util/ShapeInference.cc @@ -128,11 +128,11 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha return broadcastShapes(lhs_shape, rhs_shape); } -ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank) +ir::Shape inferArgMinMaxShape(const ir::Shape &input_shape, int axis, int rank) { if (axis < 0 || axis >= rank) { - throw std::runtime_error("ArgMax shape inference: Wrong axis value " + std::to_string(axis)); + throw std::runtime_error("ArgMinMax shape inference: Wrong axis value " + std::to_string(axis)); } ir::Shape out_shape; @@ -385,18 +385,22 @@ ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis) return out_shape; } -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf) +template ir::Shape inferFillShape(const ir::Shape &fill_shape, const T *shape_buf) { - ir::Shape out_shape(in_shape.dim(0)); + ir::Shape out_shape(fill_shape.dim(0)); for (int out_x = 0; out_x < out_shape.rank(); ++out_x) { - out_shape.dim(out_x) = in_buf[out_x]; + out_shape.dim(out_x) = static_cast(shape_buf[out_x]); } return out_shape; } +// template instantiation +template ir::Shape inferFillShape(const ir::Shape &fill_shape, const int32_t *shape_buf); +template ir::Shape inferFillShape(const ir::Shape &fill_shape, const int64_t *shape_buf); + ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &ker_shape) { assert(in_shape.rank() >= 2); diff --git a/runtime/onert/core/src/util/TracingCtx.cc b/runtime/onert/core/src/util/TracingCtx.cc new file mode 100644 index 0000000..08a1b32 --- /dev/null +++ b/runtime/onert/core/src/util/TracingCtx.cc @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/TracingCtx.h" + +namespace onert +{ +namespace util +{ + +// initializing static member var +std::mutex TracingCtx::_session_id_mutex; + +} // namespace util +} // namespace onert diff --git a/runtime/onert/frontend/.clang-format b/runtime/onert/frontend/.clang-format new file mode 120000 index 0000000..83185fe --- /dev/null +++ b/runtime/onert/frontend/.clang-format @@ -0,0 +1 @@ +../../../.clang-format.8 \ No newline at end of file diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index c0003e4..f9c97b4 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -68,7 +68,7 @@ public: * @param graph reference on subgraphs */ explicit BaseLoader(std::unique_ptr &subgs) - : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr} + : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr} { _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA); } @@ -114,23 +114,19 @@ private: // Operations template const OpIR *loadOperationTo(const Operator *op, ir::Graph &subg, Args &&... args); - void loadConv2D(const Operator *op, ir::Graph &subg); - void loadDepthwiseConv2D(const Operator *op, ir::Graph &subg); - void loadTransposeConv(const Operator *op, ir::Graph &subg); - void loadPool2D(const Operator *op, ir::Graph &subg, ir::operation::Pool2D::PoolType op_type); - void loadReshape(const Operator *op, ir::Graph &subg); - void loadSoftmax(const Operator *op, ir::Graph &subg); - void loadConcatenation(const Operator *op, ir::Graph &subg); - void loadFC(const Operator *op, ir::Graph &subg); + + void loadAddV2(const Operator *op, ir::Graph &subg); + void loadArgMinMax(const Operator *op, ir::Graph &subg, bool is_argmax); + void loadBatchMatMul(const Operator *op, ir::Graph &subg); void loadBinaryArithmetic(const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type); - void loadAddV2(const Operator *op, ir::Graph &subg); - void loadPack(const Operator *op, ir::Graph &subg); - void loadResizeBilinear(const Operator *op, ir::Graph &subg); - void loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg); - void loadReduce(const Operator *op, ir::Graph &subg, - ir::operation::Reduce::ReduceType reduce_type); - void loadReduceAll(const Operator *op, ir::Graph &subg); + void loadComparison(const Operator *op, ir::Graph &subg); + void loadConcatenation(const Operator *op, ir::Graph &subg); + void loadConv2D(const Operator *op, ir::Graph &subg); + void loadCustom(const Operator *op, ir::Graph &subg); + void loadDepthToSpace(const Operator *op, ir::Graph &subg); + void loadDepthwiseConv2D(const Operator *op, ir::Graph &subg); + void loadEinsum(const Operator *op, ir::Graph &subg); void loadElementwiseActivation(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, float alpha = 0.f, float beta = 0.f); @@ -138,25 +134,31 @@ private: ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type); void loadElementwiseUnary(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type); + void loadFC(const Operator *op, ir::Graph &subg); + void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); void loadGather(const Operator *op, ir::Graph &subg); - void 
loadCustom(const Operator *op, ir::Graph &subg); - void loadBatchMatMul(const Operator *op, ir::Graph &subg); - void loadSqueeze(const Operator *op, ir::Graph &subg); + void loadIf(const Operator *op, ir::Graph &subg); + void loadLeakyRelu(const Operator *op, ir::Graph &subg); + void loadLogSoftmax(const Operator *op, ir::Graph &subg); + void loadOneHot(const Operator *op, ir::Graph &subg); + void loadPack(const Operator *op, ir::Graph &subg); + void loadPool2D(const Operator *op, ir::Graph &subg, ir::operation::Pool2D::PoolType op_type); + void loadReduce(const Operator *op, ir::Graph &subg, + ir::operation::Reduce::ReduceType reduce_type); + void loadReduceAll(const Operator *op, ir::Graph &subg); + void loadReshape(const Operator *op, ir::Graph &subg); + void loadResizeBilinear(const Operator *op, ir::Graph &subg); + void loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg); + void loadSoftmax(const Operator *op, ir::Graph &subg); + void loadSpaceToDepth(const Operator *op, ir::Graph &subg); void loadSplit(const Operator *op, ir::Graph &subg); void loadSplitV(const Operator *op, ir::Graph &subg); + void loadSqueeze(const Operator *op, ir::Graph &subg); void loadStridedSlice(const Operator *op, ir::Graph &subg); + void loadTransposeConv(const Operator *op, ir::Graph &subg); + void loadUnidirectionalSequenceLSTM(const Operator *op, ir::Graph &subg); void loadUnpack(const Operator *op, ir::Graph &subg); - void loadComparison(const Operator *op, ir::Graph &subg); - void loadEinsum(const Operator *op, ir::Graph &subg); - void loadOneHot(const Operator *op, ir::Graph &subg); - void loadIf(const Operator *op, ir::Graph &subg); void loadWhile(const Operator *op, ir::Graph &subg); - void loadArgMax(const Operator *op, ir::Graph &subg); - void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); - void loadLogSoftmax(const Operator *op, ir::Graph &subg); - void loadSpaceToDepth(const Operator *op, ir::Graph &subg); - void loadLeakyRelu(const Operator *op, ir::Graph &subg); - void loadUnidirectionalSequenceLSTM(const Operator *op, ir::Graph &subg); void verifySubgraphIndex(int subg_index) { @@ -255,19 +257,26 @@ ir::DataType BaseLoader::BaseLoader::tensorTypeToDataType(const Te { case TensorType::TensorType_FLOAT32: return ir::DataType::FLOAT32; + case TensorType::TensorType_FLOAT16: + return ir::DataType::FLOAT16; case TensorType::TensorType_INT32: return ir::DataType::INT32; - case TensorType::TensorType_BOOL: - return ir::DataType::BOOL8; case TensorType::TensorType_UINT8: return ir::DataType::QUANT_UINT8_ASYMM; - case TensorType::TensorType_INT8: - return ir::DataType::QUANT_INT8_ASYMM; case TensorType::TensorType_INT64: return ir::DataType::INT64; + // case TensorType::TensorType_STRING: + case TensorType::TensorType_BOOL: + return ir::DataType::BOOL8; + case TensorType::TensorType_INT16: + return ir::DataType::QUANT_INT16_ASYMM; + // case TensorType::TensorType_COMPLEX64 + case TensorType::TensorType_INT8: + return ir::DataType::QUANT_INT8_ASYMM; + // case TensorType::TensorType_FLOAT64 default: throw std::runtime_error( - std::string("Unsupported tensor type: ").append(EnumNameTensorType(type))); + std::string("Unsupported tensor type: ").append(EnumNameTensorType(type))); } } @@ -385,7 +394,7 @@ ir::OperandIndex BaseLoader::loadOperand(const Tensor *tensor, ir: { size_t offset = unaligned_offset_start - aligned_offset_start; uint8_t *mmap_base = static_cast( - mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, _fd, aligned_offset_start)); + mmap(NULL, mmap_size, PROT_READ, 
MAP_PRIVATE, _fd, aligned_offset_start)); data_obj = std::make_unique(mmap_base + offset, data_size); munmap(mmap_base, mmap_size); } @@ -446,7 +455,7 @@ void BaseLoader::loadSparsity(const Tensor *tensor, const ir::Shap bool block2D_sparsity = dim_metadata_size == 4 && block_rank == 2; if (dim_metadata_size != !random_sparsity && !block2D_sparsity) throw std::runtime_error( - "sparsity is supported only for 2D tensor with random or 16x1 block sparsity."); + "sparsity is supported only for 2D tensor with random or 16x1 block sparsity."); const auto *src_metadata = src_sparsity->dim_metadata()->Get(0); if (src_metadata->format() != DimensionType::DimensionType_DENSE) @@ -514,8 +523,8 @@ void BaseLoader::loadOperationIO(const Operator *op, ir::OperandIn auto builtin_code = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); if (isOptionalInputTensor(idx) && !allowOptionalInputTensor(builtin_code)) throw std::runtime_error( - std::string("loader doesn't support optional input tensor yet for ") - .append(EnumNameBuiltinOperator(builtin_code))); + std::string("loader doesn't support optional input tensor yet for ") + .append(EnumNameBuiltinOperator(builtin_code))); }; check_optional_input(); inputs.append(tensorIdxToOperandIdx(idx)); @@ -691,9 +700,9 @@ void BaseLoader::loadFC(const Operator *op, ir::Graph &subg) const auto fc = loadOperationTo(op, subg, param); const auto &input_operand = - subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::INPUT)); + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::INPUT)); auto &weights_operand = - subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::WEIGHT)); + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::WEIGHT)); if (input_operand.typeInfo().type() == ir::DataType::FLOAT32 && ((weights_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM) || weights_operand.typeInfo().type() == ir::DataType::QUANT_INT8_ASYMM)) @@ -719,7 +728,7 @@ void BaseLoader::loadAddV2(const Operator *op, ir::Graph &subg) auto data_root = flexbuffers::GetRoot(custom_op_data, custom_op_data_size); auto attr_map = data_root.AsMap(); const auto fused_activation_func = static_cast( - attr_map["fused_activation_function"].AsInt8()); + attr_map["fused_activation_function"].AsInt8()); param.activation = convertActivation(fused_activation_func); } @@ -727,8 +736,18 @@ void BaseLoader::loadAddV2(const Operator *op, ir::Graph &subg) } template +void BaseLoader::loadDepthToSpace(const Operator *op, ir::Graph &subg) +{ + ir::operation::DepthToSpace::Param param; + const auto *options = op->builtin_options_as_DepthToSpaceOptions(); + param.block_size = options->block_size(); + + loadOperationTo(op, subg, param); +} + +template void BaseLoader::loadBinaryArithmetic( - const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type) + const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type) { ir::operation::BinaryArithmetic::Param param; param.arithmetic_type = op_type; @@ -780,8 +799,8 @@ void BaseLoader::loadPack(const Operator *op, ir::Graph &subg) template void BaseLoader::loadElementwiseActivation( - const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, - float alpha, float beta) + const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, + float alpha, float beta) { ir::operation::ElementwiseActivation::Param param; param.op_type = op_type; @@ -844,8 +863,8 @@ void 
BaseLoader::loadReduceAll(const Operator *op, ir::Graph &subg template void BaseLoader::loadElementwiseBinary( - const Operator *op, ir::Graph &subg, - ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) + const Operator *op, ir::Graph &subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) { ir::operation::ElementwiseBinary::Param param; param.op_type = op_type; @@ -870,7 +889,7 @@ void BaseLoader::loadElementwiseUnary(const Operator *op, ir::Grap } }; qasymm8ToUint8( - subg.operands().at(eu->getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT))); + subg.operands().at(eu->getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT))); qasymm8ToUint8(subg.operands().at(eu->getOutputs().at(0))); } } @@ -915,8 +934,8 @@ void BaseLoader::loadBatchMatMul(const Operator *op, ir::Graph &su break; default: throw std::runtime_error( - std::string("Wrong loaded operation: ").append(EnumNameBuiltinOperator(builtin_op)) + - " as " + EnumNameBuiltinOperator(BuiltinOperator::BuiltinOperator_BATCH_MATMUL)); + std::string("Wrong loaded operation: ").append(EnumNameBuiltinOperator(builtin_op)) + + " as " + EnumNameBuiltinOperator(BuiltinOperator::BuiltinOperator_BATCH_MATMUL)); } loadOperationTo(op, subg, param); @@ -959,15 +978,15 @@ void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) // Mapping from custom op name string to BuiltinOP enum std::map builtin_map = { - {"AddV2", BuiltinOP::AddV2}, - {"All", BuiltinOP::ReduceAll}, - {"MatrixBandPart", BuiltinOP::MatrixBandPart}, - {"BatchMatMulV2", BuiltinOP::BatchMatMul}, - {"Einsum", BuiltinOP::Einsum}, - {"FusedBatchNormV3", BuiltinOP::FusedBatchNorm}, - {"BroadcastTo", BuiltinOP::BroadcastTo}, - {"StatelessRandomUniform", BuiltinOP::StatelessRandomUniform}, - {"Erf", BuiltinOP::Erf}, + {"AddV2", BuiltinOP::AddV2}, + {"All", BuiltinOP::ReduceAll}, + {"MatrixBandPart", BuiltinOP::MatrixBandPart}, + {"BatchMatMulV2", BuiltinOP::BatchMatMul}, + {"Einsum", BuiltinOP::Einsum}, + {"FusedBatchNormV3", BuiltinOP::FusedBatchNorm}, + {"BroadcastTo", BuiltinOP::BroadcastTo}, + {"StatelessRandomUniform", BuiltinOP::StatelessRandomUniform}, + {"Erf", BuiltinOP::Erf}, }; try @@ -1005,7 +1024,7 @@ void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) break; default: throw std::runtime_error{ - "Loader: Custom OP map is defined but operation loader function is not defined"}; + "Loader: Custom OP map is defined but operation loader function is not defined"}; } return; @@ -1120,7 +1139,7 @@ void BaseLoader::loadComparison(const Operator *op, ir::Graph &sub break; default: throw std::runtime_error( - std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); + std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); } loadOperationTo(op, subg, param); @@ -1224,25 +1243,15 @@ void BaseLoader::loadWhile(const Operator *op, ir::Graph &subg) } template -void BaseLoader::loadArgMax(const Operator *op, ir::Graph &subg) +void BaseLoader::loadArgMinMax(const Operator *op, ir::Graph &subg, bool is_argmax) { - ir::operation::ArgMax::Param param; - const auto output_type = op->builtin_options_as_ArgMaxOptions()->output_type(); - switch (output_type) - { - case TensorType::TensorType_INT32: - case TensorType::TensorType_INT64: - param.output_type = tensorTypeToDataType(output_type); - break; - default: - throw std::runtime_error("ArgMax: `output_type` must be either int32 or int64."); - } - auto am = loadOperationTo(op, subg, param); + ir::operation::ArgMinMax::Param 
param; + const auto output_type = is_argmax ? op->builtin_options_as_ArgMaxOptions()->output_type() + : op->builtin_options_as_ArgMinOptions()->output_type(); + param.output_type = tensorTypeToDataType(output_type); + param.is_arg_max = is_argmax; - auto &axisOperand = subg.operands().at(am->getInputs().at(ir::operation::ArgMax::Input::AXIS)); - if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 || - axisOperand.typeInfo().type() == ir::DataType::INT64))) - throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported."); + loadOperationTo(op, subg, param); } template @@ -1287,7 +1296,7 @@ void BaseLoader::loadUnidirectionalSequenceLSTM(const Operator *op { auto builtin_code = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); throw std::runtime_error(std::string("loader doesn't support optional output tensor yet for ") - .append(EnumNameBuiltinOperator(builtin_code))); + .append(EnumNameBuiltinOperator(builtin_code))); } for (size_t i = 0; i < ir::operation::LSTM::Output::OUTPUT; ++i) { @@ -1355,6 +1364,9 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_PACK: loadPack(op, subg); return; + case BuiltinOperator::BuiltinOperator_ELU: + loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::ELU); + return; case BuiltinOperator::BuiltinOperator_RELU: loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::RELU, ir::operation::ElementwiseActivation::infinity, 0.f); @@ -1383,6 +1395,9 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_SQRT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SQRT); return; + case BuiltinOperator::BuiltinOperator_SQUARE: + loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SQUARE); + return; case BuiltinOperator::BuiltinOperator_SQUARED_DIFFERENCE: loadOperationTo(op, subg); return; @@ -1499,7 +1514,10 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::NEG); return; case BuiltinOperator::BuiltinOperator_ARG_MAX: - loadArgMax(op, subg); + loadArgMinMax(op, subg, true); + return; + case BuiltinOperator::BuiltinOperator_ARG_MIN: + loadArgMinMax(op, subg, false); return; case BuiltinOperator::BuiltinOperator_LOG: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::LOG); @@ -1513,6 +1531,10 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_LOGICAL_NOT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::LOGICAL_NOT); return; + case BuiltinOperator::BuiltinOperator_LOGICAL_AND: + loadElementwiseBinary(op, subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND); + return; case BuiltinOperator::BuiltinOperator_LOGICAL_OR: loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR); @@ -1556,9 +1578,12 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: loadUnidirectionalSequenceLSTM(op, subg); return; + case BuiltinOperator::BuiltinOperator_DEPTH_TO_SPACE: + loadDepthToSpace(op, subg); + return; default: throw std::runtime_error( - std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); + std::string("Unsupported operation: 
").append(EnumNameBuiltinOperator(builtin_op))); } } diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc index 33e1709..0d7b3ea 100644 --- a/runtime/onert/frontend/circle/src/circle_loader.cc +++ b/runtime/onert/frontend/circle/src/circle_loader.cc @@ -196,7 +196,7 @@ void CircleLoader::loadBCQFullyConnected(const Operator *op, ir::Graph &subg) param.activation = convertActivation(options->fused_activation_function()); std::unique_ptr new_op( - new ir::operation::BCQFullyConnected(inputs, outputs, param)); + new ir::operation::BCQFullyConnected(inputs, outputs, param)); subg.addOperation(std::move(new_op)); } diff --git a/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h b/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h index 0ff1f72..eb17752 100644 --- a/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h +++ b/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h @@ -2155,9 +2155,8 @@ enum ActivationFunctionType inline const ActivationFunctionType (&EnumValuesActivationFunctionType())[6] { static const ActivationFunctionType values[] = { - ActivationFunctionType_NONE, ActivationFunctionType_RELU, - ActivationFunctionType_RELU_N1_TO_1, ActivationFunctionType_RELU6, - ActivationFunctionType_TANH, ActivationFunctionType_SIGN_BIT}; + ActivationFunctionType_NONE, ActivationFunctionType_RELU, ActivationFunctionType_RELU_N1_TO_1, + ActivationFunctionType_RELU6, ActivationFunctionType_TANH, ActivationFunctionType_SIGN_BIT}; return values; } @@ -2218,9 +2217,8 @@ enum FullyConnectedOptionsWeightsFormat inline const FullyConnectedOptionsWeightsFormat (&EnumValuesFullyConnectedOptionsWeightsFormat())[3] { static const FullyConnectedOptionsWeightsFormat values[] = { - FullyConnectedOptionsWeightsFormat_DEFAULT, - FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8, - FullyConnectedOptionsWeightsFormat_SHUFFLED16x1FLOAT32}; + FullyConnectedOptionsWeightsFormat_DEFAULT, FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8, + FullyConnectedOptionsWeightsFormat_SHUFFLED16x1FLOAT32}; return values; } @@ -2478,8 +2476,8 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab const circle::CustomQuantization *details_as_CustomQuantization() const { return details_type() == circle::QuantizationDetails_CustomQuantization - ? static_cast(details()) - : nullptr; + ? 
static_cast(details()) + : nullptr; } int32_t quantized_dimension() const { return GetField(VT_QUANTIZED_DIMENSION, 0); } bool Verify(flatbuffers::Verifier &verifier) const @@ -2551,12 +2549,12 @@ struct QuantizationParametersBuilder }; inline flatbuffers::Offset CreateQuantizationParameters( - flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset> min = 0, - flatbuffers::Offset> max = 0, - flatbuffers::Offset> scale = 0, - flatbuffers::Offset> zero_point = 0, - circle::QuantizationDetails details_type = circle::QuantizationDetails_NONE, - flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) + flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset> min = 0, + flatbuffers::Offset> max = 0, + flatbuffers::Offset> scale = 0, + flatbuffers::Offset> zero_point = 0, + circle::QuantizationDetails details_type = circle::QuantizationDetails_NONE, + flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) { QuantizationParametersBuilder builder_(_fbb); builder_.add_quantized_dimension(quantized_dimension); @@ -2570,11 +2568,11 @@ inline flatbuffers::Offset CreateQuantizationParameters( } inline flatbuffers::Offset CreateQuantizationParametersDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *min = nullptr, - const std::vector *max = nullptr, const std::vector *scale = nullptr, - const std::vector *zero_point = nullptr, - circle::QuantizationDetails details_type = circle::QuantizationDetails_NONE, - flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *min = nullptr, + const std::vector *max = nullptr, const std::vector *scale = nullptr, + const std::vector *zero_point = nullptr, + circle::QuantizationDetails details_type = circle::QuantizationDetails_NONE, + flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) { auto min__ = min ? _fbb.CreateVector(*min) : 0; auto max__ = max ? _fbb.CreateVector(*max) : 0; @@ -2789,20 +2787,20 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const circle::Int32Vector *array_segments_as_Int32Vector() const { return array_segments_type() == circle::SparseIndexVector_Int32Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } const circle::Uint16Vector *array_segments_as_Uint16Vector() const { return array_segments_type() == circle::SparseIndexVector_Uint16Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } const circle::Uint8Vector *array_segments_as_Uint8Vector() const { return array_segments_type() == circle::SparseIndexVector_Uint8Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } circle::SparseIndexVector array_indices_type() const { @@ -2813,20 +2811,20 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const circle::Int32Vector *array_indices_as_Int32Vector() const { return array_indices_type() == circle::SparseIndexVector_Int32Vector - ? static_cast(array_indices()) - : nullptr; + ? static_cast(array_indices()) + : nullptr; } const circle::Uint16Vector *array_indices_as_Uint16Vector() const { return array_indices_type() == circle::SparseIndexVector_Uint16Vector - ? static_cast(array_indices()) - : nullptr; + ? static_cast(array_indices()) + : nullptr; } const circle::Uint8Vector *array_indices_as_Uint8Vector() const { return array_indices_type() == circle::SparseIndexVector_Uint8Vector - ? 
static_cast(array_indices()) - : nullptr; + ? static_cast(array_indices()) + : nullptr; } bool Verify(flatbuffers::Verifier &verifier) const { @@ -2924,12 +2922,12 @@ struct DimensionMetadataBuilder }; inline flatbuffers::Offset CreateDimensionMetadata( - flatbuffers::FlatBufferBuilder &_fbb, - circle::DimensionType format = circle::DimensionType_DENSE, int32_t dense_size = 0, - circle::SparseIndexVector array_segments_type = circle::SparseIndexVector_NONE, - flatbuffers::Offset array_segments = 0, - circle::SparseIndexVector array_indices_type = circle::SparseIndexVector_NONE, - flatbuffers::Offset array_indices = 0) + flatbuffers::FlatBufferBuilder &_fbb, circle::DimensionType format = circle::DimensionType_DENSE, + int32_t dense_size = 0, + circle::SparseIndexVector array_segments_type = circle::SparseIndexVector_NONE, + flatbuffers::Offset array_segments = 0, + circle::SparseIndexVector array_indices_type = circle::SparseIndexVector_NONE, + flatbuffers::Offset array_indices = 0) { DimensionMetadataBuilder builder_(_fbb); builder_.add_array_indices(array_indices); @@ -2961,7 +2959,7 @@ struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *dim_metadata() const { return GetPointer> *>( - VT_DIM_METADATA); + VT_DIM_METADATA); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -2987,8 +2985,8 @@ struct SparsityParametersBuilder fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); } void add_dim_metadata( - flatbuffers::Offset>> - dim_metadata) + flatbuffers::Offset>> + dim_metadata) { fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); } @@ -3006,11 +3004,11 @@ struct SparsityParametersBuilder }; inline flatbuffers::Offset CreateSparsityParameters( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> traversal_order = 0, - flatbuffers::Offset> block_map = 0, - flatbuffers::Offset>> - dim_metadata = 0) + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> traversal_order = 0, + flatbuffers::Offset> block_map = 0, + flatbuffers::Offset>> + dim_metadata = 0) { SparsityParametersBuilder builder_(_fbb); builder_.add_dim_metadata(dim_metadata); @@ -3020,16 +3018,15 @@ inline flatbuffers::Offset CreateSparsityParameters( } inline flatbuffers::Offset CreateSparsityParametersDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *traversal_order = nullptr, - const std::vector *block_map = nullptr, - const std::vector> *dim_metadata = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *traversal_order = nullptr, + const std::vector *block_map = nullptr, + const std::vector> *dim_metadata = nullptr) { auto traversal_order__ = traversal_order ? _fbb.CreateVector(*traversal_order) : 0; auto block_map__ = block_map ? _fbb.CreateVector(*block_map) : 0; auto dim_metadata__ = - dim_metadata - ? _fbb.CreateVector>(*dim_metadata) - : 0; + dim_metadata ? 
_fbb.CreateVector>(*dim_metadata) + : 0; return circle::CreateSparsityParameters(_fbb, traversal_order__, block_map__, dim_metadata__); } @@ -3155,12 +3152,11 @@ CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, } inline flatbuffers::Offset CreateTensorDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *shape = nullptr, - circle::TensorType type = circle::TensorType_FLOAT32, uint32_t buffer = 0, - const char *name = nullptr, - flatbuffers::Offset quantization = 0, bool is_variable = false, - flatbuffers::Offset sparsity = 0, - const std::vector *shape_signature = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *shape = nullptr, + circle::TensorType type = circle::TensorType_FLOAT32, uint32_t buffer = 0, + const char *name = nullptr, flatbuffers::Offset quantization = 0, + bool is_variable = false, flatbuffers::Offset sparsity = 0, + const std::vector *shape_signature = nullptr) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? _fbb.CreateString(name) : 0; @@ -3190,7 +3186,7 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } int32_t dilation_w_factor() const { return GetField(VT_DILATION_W_FACTOR, 1); } int32_t dilation_h_factor() const { return GetField(VT_DILATION_H_FACTOR, 1); } @@ -3249,10 +3245,10 @@ struct Conv2DOptionsBuilder }; inline flatbuffers::Offset CreateConv2DOptions( - flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, - int32_t stride_w = 0, int32_t stride_h = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) + flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, + int32_t stride_w = 0, int32_t stride_h = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) { Conv2DOptionsBuilder builder_(_fbb); builder_.add_dilation_h_factor(dilation_h_factor); @@ -3287,7 +3283,7 @@ struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -3344,9 +3340,9 @@ struct Pool2DOptionsBuilder }; inline flatbuffers::Offset CreatePool2DOptions( - flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, - int32_t stride_w = 0, int32_t stride_h = 0, int32_t filter_width = 0, int32_t filter_height = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, + int32_t stride_w = 0, int32_t stride_h = 0, int32_t filter_width = 0, int32_t filter_height = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { Pool2DOptionsBuilder builder_(_fbb); builder_.add_filter_height(filter_height); @@ -3381,7 +3377,7 @@ struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab circle::ActivationFunctionType fused_activation_function() const { return static_cast( - 
GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } int32_t dilation_w_factor() const { return GetField(VT_DILATION_W_FACTOR, 1); } int32_t dilation_h_factor() const { return GetField(VT_DILATION_H_FACTOR, 1); } @@ -3445,10 +3441,10 @@ struct DepthwiseConv2DOptionsBuilder }; inline flatbuffers::Offset CreateDepthwiseConv2DOptions( - flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, - int32_t stride_w = 0, int32_t stride_h = 0, int32_t depth_multiplier = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) + flatbuffers::FlatBufferBuilder &_fbb, circle::Padding padding = circle::Padding_SAME, + int32_t stride_w = 0, int32_t stride_h = 0, int32_t depth_multiplier = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) { DepthwiseConv2DOptionsBuilder builder_(_fbb); builder_.add_dilation_h_factor(dilation_h_factor); @@ -3499,12 +3495,12 @@ struct ConcatEmbeddingsOptionsBuilder fbb_.AddElement(ConcatEmbeddingsOptions::VT_NUM_CHANNELS, num_channels, 0); } void add_num_columns_per_channel( - flatbuffers::Offset> num_columns_per_channel) + flatbuffers::Offset> num_columns_per_channel) { fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL, num_columns_per_channel); } void add_embedding_dim_per_channel( - flatbuffers::Offset> embedding_dim_per_channel) + flatbuffers::Offset> embedding_dim_per_channel) { fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL, embedding_dim_per_channel); @@ -3523,9 +3519,9 @@ struct ConcatEmbeddingsOptionsBuilder }; inline flatbuffers::Offset CreateConcatEmbeddingsOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0, - flatbuffers::Offset> num_columns_per_channel = 0, - flatbuffers::Offset> embedding_dim_per_channel = 0) + flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0, + flatbuffers::Offset> num_columns_per_channel = 0, + flatbuffers::Offset> embedding_dim_per_channel = 0) { ConcatEmbeddingsOptionsBuilder builder_(_fbb); builder_.add_embedding_dim_per_channel(embedding_dim_per_channel); @@ -3540,9 +3536,9 @@ CreateConcatEmbeddingsOptionsDirect(flatbuffers::FlatBufferBuilder &_fbb, int32_ const std::vector *embedding_dim_per_channel = nullptr) { auto num_columns_per_channel__ = - num_columns_per_channel ? _fbb.CreateVector(*num_columns_per_channel) : 0; + num_columns_per_channel ? _fbb.CreateVector(*num_columns_per_channel) : 0; auto embedding_dim_per_channel__ = - embedding_dim_per_channel ? _fbb.CreateVector(*embedding_dim_per_channel) : 0; + embedding_dim_per_channel ? 
_fbb.CreateVector(*embedding_dim_per_channel) : 0; return circle::CreateConcatEmbeddingsOptions(_fbb, num_channels, num_columns_per_channel__, embedding_dim_per_channel__); } @@ -3609,7 +3605,7 @@ struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool asymmetric_quantize_inputs() const { @@ -3653,9 +3649,9 @@ struct SVDFOptionsBuilder }; inline flatbuffers::Offset CreateSVDFOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t rank = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, int32_t rank = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { SVDFOptionsBuilder builder_(_fbb); builder_.add_rank(rank); @@ -3675,7 +3671,7 @@ struct RNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool asymmetric_quantize_inputs() const { @@ -3718,9 +3714,9 @@ struct RNNOptionsBuilder }; inline flatbuffers::Offset CreateRNNOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { RNNOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3741,7 +3737,7 @@ struct SequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool asymmetric_quantize_inputs() const { @@ -3789,9 +3785,9 @@ struct SequenceRNNOptionsBuilder }; inline flatbuffers::Offset CreateSequenceRNNOptions( - flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { SequenceRNNOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3814,7 +3810,7 @@ struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuf circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool merge_outputs() const { return GetField(VT_MERGE_OUTPUTS, 0) != 0; } bool asymmetric_quantize_inputs() const @@ -3869,9 +3865,9 @@ struct BidirectionalSequenceRNNOptionsBuilder }; inline flatbuffers::Offset CreateBidirectionalSequenceRNNOptions( - flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, - circle::ActivationFunctionType fused_activation_function = 
circle::ActivationFunctionType_NONE, - bool merge_outputs = false, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + bool merge_outputs = false, bool asymmetric_quantize_inputs = false) { BidirectionalSequenceRNNOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3894,12 +3890,12 @@ struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tabl circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } circle::FullyConnectedOptionsWeightsFormat weights_format() const { return static_cast( - GetField(VT_WEIGHTS_FORMAT, 0)); + GetField(VT_WEIGHTS_FORMAT, 0)); } bool keep_num_dims() const { return GetField(VT_KEEP_NUM_DIMS, 0) != 0; } bool asymmetric_quantize_inputs() const @@ -3955,11 +3951,11 @@ struct FullyConnectedOptionsBuilder }; inline flatbuffers::Offset CreateFullyConnectedOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - circle::FullyConnectedOptionsWeightsFormat weights_format = - circle::FullyConnectedOptionsWeightsFormat_DEFAULT, - bool keep_num_dims = false, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + circle::FullyConnectedOptionsWeightsFormat weights_format = + circle::FullyConnectedOptionsWeightsFormat_DEFAULT, + bool keep_num_dims = false, bool asymmetric_quantize_inputs = false) { FullyConnectedOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -4023,7 +4019,7 @@ struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -4057,8 +4053,8 @@ struct ConcatenationOptionsBuilder }; inline flatbuffers::Offset CreateConcatenationOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { ConcatenationOptionsBuilder builder_(_fbb); builder_.add_axis(axis); @@ -4076,7 +4072,7 @@ struct AddOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -4109,8 +4105,8 @@ struct AddOptionsBuilder }; inline flatbuffers::Offset CreateAddOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { AddOptionsBuilder builder_(_fbb); builder_.add_fused_activation_function(fused_activation_function); @@ 
-4127,7 +4123,7 @@ struct MulOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -4160,8 +4156,8 @@ struct MulOptionsBuilder }; inline flatbuffers::Offset CreateMulOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { MulOptionsBuilder builder_(_fbb); builder_.add_fused_activation_function(fused_activation_function); @@ -4178,7 +4174,7 @@ struct L2NormOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -4211,8 +4207,8 @@ struct L2NormOptionsBuilder }; inline flatbuffers::Offset CreateL2NormOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { L2NormOptionsBuilder builder_(_fbb); builder_.add_fused_activation_function(fused_activation_function); @@ -4263,7 +4259,7 @@ struct LocalResponseNormalizationOptionsBuilder fbb_.AddElement(LocalResponseNormalizationOptions::VT_BETA, beta, 0.0f); } explicit LocalResponseNormalizationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -4303,7 +4299,7 @@ struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } float cell_clip() const { return GetField(VT_CELL_CLIP, 0.0f); } float proj_clip() const { return GetField(VT_PROJ_CLIP, 0.0f); } @@ -4367,11 +4363,11 @@ struct LSTMOptionsBuilder }; inline flatbuffers::Offset CreateLSTMOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - float cell_clip = 0.0f, float proj_clip = 0.0f, - circle::LSTMKernelType kernel_type = circle::LSTMKernelType_FULL, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + float cell_clip = 0.0f, float proj_clip = 0.0f, + circle::LSTMKernelType kernel_type = circle::LSTMKernelType_FULL, + bool asymmetric_quantize_inputs = false) { LSTMOptionsBuilder builder_(_fbb); builder_.add_proj_clip(proj_clip); @@ -4396,7 +4392,7 @@ struct UnidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatb circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } float cell_clip() const { return GetField(VT_CELL_CLIP, 0.0f); } float proj_clip() const { return GetField(VT_PROJ_CLIP, 0.0f); } @@ -4445,7 +4441,7 @@ struct 
UnidirectionalSequenceLSTMOptionsBuilder static_cast(asymmetric_quantize_inputs), 0); } explicit UnidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -4461,10 +4457,10 @@ struct UnidirectionalSequenceLSTMOptionsBuilder inline flatbuffers::Offset CreateUnidirectionalSequenceLSTMOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - float cell_clip = 0.0f, float proj_clip = 0.0f, bool time_major = false, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + float cell_clip = 0.0f, float proj_clip = 0.0f, bool time_major = false, + bool asymmetric_quantize_inputs = false) { UnidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); builder_.add_proj_clip(proj_clip); @@ -4490,7 +4486,7 @@ struct BidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbu circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } float cell_clip() const { return GetField(VT_CELL_CLIP, 0.0f); } float proj_clip() const { return GetField(VT_PROJ_CLIP, 0.0f); } @@ -4546,7 +4542,7 @@ struct BidirectionalSequenceLSTMOptionsBuilder static_cast(asymmetric_quantize_inputs), 0); } explicit BidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -4561,10 +4557,10 @@ struct BidirectionalSequenceLSTMOptionsBuilder }; inline flatbuffers::Offset CreateBidirectionalSequenceLSTMOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, - float cell_clip = 0.0f, float proj_clip = 0.0f, bool merge_outputs = false, - bool time_major = true, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE, + float cell_clip = 0.0f, float proj_clip = 0.0f, bool merge_outputs = false, + bool time_major = true, bool asymmetric_quantize_inputs = false) { BidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); builder_.add_proj_clip(proj_clip); @@ -5075,7 +5071,7 @@ struct SubOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -5108,8 +5104,8 @@ struct SubOptionsBuilder }; inline flatbuffers::Offset CreateSubOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { SubOptionsBuilder builder_(_fbb); builder_.add_fused_activation_function(fused_activation_function); @@ -5126,7 +5122,7 @@ struct DivOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool 
Verify(flatbuffers::Verifier &verifier) const { @@ -5159,8 +5155,8 @@ struct DivOptionsBuilder }; inline flatbuffers::Offset CreateDivOptions( - flatbuffers::FlatBufferBuilder &_fbb, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { DivOptionsBuilder builder_(_fbb); builder_.add_fused_activation_function(fused_activation_function); @@ -7976,7 +7972,7 @@ struct BCQFullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::T circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -8014,8 +8010,8 @@ struct BCQFullyConnectedOptionsBuilder }; inline flatbuffers::Offset CreateBCQFullyConnectedOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t weights_hidden_size = 0, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, int32_t weights_hidden_size = 0, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { BCQFullyConnectedOptionsBuilder builder_(_fbb); builder_.add_weights_hidden_size(weights_hidden_size); @@ -8035,7 +8031,7 @@ struct InstanceNormOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table circle::ActivationFunctionType fused_activation_function() const { return static_cast( - GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -8072,8 +8068,8 @@ struct InstanceNormOptionsBuilder }; inline flatbuffers::Offset CreateInstanceNormOptions( - flatbuffers::FlatBufferBuilder &_fbb, float epsilon = 0.0f, - circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, float epsilon = 0.0f, + circle::ActivationFunctionType fused_activation_function = circle::ActivationFunctionType_NONE) { InstanceNormOptionsBuilder builder_(_fbb); builder_.add_epsilon(epsilon); @@ -8191,632 +8187,632 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const circle::Conv2DOptions *builtin_options_as_Conv2DOptions() const { return builtin_options_type() == circle::BuiltinOptions_Conv2DOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions() const { return builtin_options_type() == circle::BuiltinOptions_DepthwiseConv2DOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions() const { return builtin_options_type() == circle::BuiltinOptions_ConcatEmbeddingsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const { return builtin_options_type() == circle::BuiltinOptions_LSHProjectionOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::Pool2DOptions *builtin_options_as_Pool2DOptions() const { return builtin_options_type() == circle::BuiltinOptions_Pool2DOptions - ? 
static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SVDFOptions *builtin_options_as_SVDFOptions() const { return builtin_options_type() == circle::BuiltinOptions_SVDFOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::RNNOptions *builtin_options_as_RNNOptions() const { return builtin_options_type() == circle::BuiltinOptions_RNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::FullyConnectedOptions *builtin_options_as_FullyConnectedOptions() const { return builtin_options_type() == circle::BuiltinOptions_FullyConnectedOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SoftmaxOptions *builtin_options_as_SoftmaxOptions() const { return builtin_options_type() == circle::BuiltinOptions_SoftmaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ConcatenationOptions *builtin_options_as_ConcatenationOptions() const { return builtin_options_type() == circle::BuiltinOptions_ConcatenationOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::AddOptions *builtin_options_as_AddOptions() const { return builtin_options_type() == circle::BuiltinOptions_AddOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::L2NormOptions *builtin_options_as_L2NormOptions() const { return builtin_options_type() == circle::BuiltinOptions_L2NormOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LocalResponseNormalizationOptions * builtin_options_as_LocalResponseNormalizationOptions() const { return builtin_options_type() == circle::BuiltinOptions_LocalResponseNormalizationOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LSTMOptions *builtin_options_as_LSTMOptions() const { return builtin_options_type() == circle::BuiltinOptions_LSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions() const { return builtin_options_type() == circle::BuiltinOptions_ResizeBilinearOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::CallOptions *builtin_options_as_CallOptions() const { return builtin_options_type() == circle::BuiltinOptions_CallOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ReshapeOptions *builtin_options_as_ReshapeOptions() const { return builtin_options_type() == circle::BuiltinOptions_ReshapeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SkipGramOptions *builtin_options_as_SkipGramOptions() const { return builtin_options_type() == circle::BuiltinOptions_SkipGramOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const { return builtin_options_type() == circle::BuiltinOptions_SpaceToDepthOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::EmbeddingLookupSparseOptions * builtin_options_as_EmbeddingLookupSparseOptions() const { return builtin_options_type() == circle::BuiltinOptions_EmbeddingLookupSparseOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::MulOptions *builtin_options_as_MulOptions() const { return builtin_options_type() == circle::BuiltinOptions_MulOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::PadOptions *builtin_options_as_PadOptions() const { return builtin_options_type() == circle::BuiltinOptions_PadOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::GatherOptions *builtin_options_as_GatherOptions() const { return builtin_options_type() == circle::BuiltinOptions_GatherOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::BatchToSpaceNDOptions *builtin_options_as_BatchToSpaceNDOptions() const { return builtin_options_type() == circle::BuiltinOptions_BatchToSpaceNDOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SpaceToBatchNDOptions *builtin_options_as_SpaceToBatchNDOptions() const { return builtin_options_type() == circle::BuiltinOptions_SpaceToBatchNDOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::TransposeOptions *builtin_options_as_TransposeOptions() const { return builtin_options_type() == circle::BuiltinOptions_TransposeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ReducerOptions *builtin_options_as_ReducerOptions() const { return builtin_options_type() == circle::BuiltinOptions_ReducerOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SubOptions *builtin_options_as_SubOptions() const { return builtin_options_type() == circle::BuiltinOptions_SubOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::DivOptions *builtin_options_as_DivOptions() const { return builtin_options_type() == circle::BuiltinOptions_DivOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SqueezeOptions *builtin_options_as_SqueezeOptions() const { return builtin_options_type() == circle::BuiltinOptions_SqueezeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SequenceRNNOptions *builtin_options_as_SequenceRNNOptions() const { return builtin_options_type() == circle::BuiltinOptions_SequenceRNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::StridedSliceOptions *builtin_options_as_StridedSliceOptions() const { return builtin_options_type() == circle::BuiltinOptions_StridedSliceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ExpOptions *builtin_options_as_ExpOptions() const { return builtin_options_type() == circle::BuiltinOptions_ExpOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::TopKV2Options *builtin_options_as_TopKV2Options() const { return builtin_options_type() == circle::BuiltinOptions_TopKV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SplitOptions *builtin_options_as_SplitOptions() const { return builtin_options_type() == circle::BuiltinOptions_SplitOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LogSoftmaxOptions *builtin_options_as_LogSoftmaxOptions() const { return builtin_options_type() == circle::BuiltinOptions_LogSoftmaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::CastOptions *builtin_options_as_CastOptions() const { return builtin_options_type() == circle::BuiltinOptions_CastOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::DequantizeOptions *builtin_options_as_DequantizeOptions() const { return builtin_options_type() == circle::BuiltinOptions_DequantizeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const { return builtin_options_type() == circle::BuiltinOptions_MaximumMinimumOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ArgMaxOptions *builtin_options_as_ArgMaxOptions() const { return builtin_options_type() == circle::BuiltinOptions_ArgMaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LessOptions *builtin_options_as_LessOptions() const { return builtin_options_type() == circle::BuiltinOptions_LessOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::NegOptions *builtin_options_as_NegOptions() const { return builtin_options_type() == circle::BuiltinOptions_NegOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::PadV2Options *builtin_options_as_PadV2Options() const { return builtin_options_type() == circle::BuiltinOptions_PadV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::GreaterOptions *builtin_options_as_GreaterOptions() const { return builtin_options_type() == circle::BuiltinOptions_GreaterOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::GreaterEqualOptions *builtin_options_as_GreaterEqualOptions() const { return builtin_options_type() == circle::BuiltinOptions_GreaterEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LessEqualOptions *builtin_options_as_LessEqualOptions() const { return builtin_options_type() == circle::BuiltinOptions_LessEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SelectOptions *builtin_options_as_SelectOptions() const { return builtin_options_type() == circle::BuiltinOptions_SelectOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::SliceOptions *builtin_options_as_SliceOptions() const { return builtin_options_type() == circle::BuiltinOptions_SliceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::TransposeConvOptions *builtin_options_as_TransposeConvOptions() const { return builtin_options_type() == circle::BuiltinOptions_TransposeConvOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SparseToDenseOptions *builtin_options_as_SparseToDenseOptions() const { return builtin_options_type() == circle::BuiltinOptions_SparseToDenseOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::TileOptions *builtin_options_as_TileOptions() const { return builtin_options_type() == circle::BuiltinOptions_TileOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ExpandDimsOptions *builtin_options_as_ExpandDimsOptions() const { return builtin_options_type() == circle::BuiltinOptions_ExpandDimsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::EqualOptions *builtin_options_as_EqualOptions() const { return builtin_options_type() == circle::BuiltinOptions_EqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::NotEqualOptions *builtin_options_as_NotEqualOptions() const { return builtin_options_type() == circle::BuiltinOptions_NotEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ShapeOptions *builtin_options_as_ShapeOptions() const { return builtin_options_type() == circle::BuiltinOptions_ShapeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::PowOptions *builtin_options_as_PowOptions() const { return builtin_options_type() == circle::BuiltinOptions_PowOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ArgMinOptions *builtin_options_as_ArgMinOptions() const { return builtin_options_type() == circle::BuiltinOptions_ArgMinOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::FakeQuantOptions *builtin_options_as_FakeQuantOptions() const { return builtin_options_type() == circle::BuiltinOptions_FakeQuantOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::PackOptions *builtin_options_as_PackOptions() const { return builtin_options_type() == circle::BuiltinOptions_PackOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LogicalOrOptions *builtin_options_as_LogicalOrOptions() const { return builtin_options_type() == circle::BuiltinOptions_LogicalOrOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::OneHotOptions *builtin_options_as_OneHotOptions() const { return builtin_options_type() == circle::BuiltinOptions_OneHotOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::LogicalAndOptions *builtin_options_as_LogicalAndOptions() const { return builtin_options_type() == circle::BuiltinOptions_LogicalAndOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::LogicalNotOptions *builtin_options_as_LogicalNotOptions() const { return builtin_options_type() == circle::BuiltinOptions_LogicalNotOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::UnpackOptions *builtin_options_as_UnpackOptions() const { return builtin_options_type() == circle::BuiltinOptions_UnpackOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::FloorDivOptions *builtin_options_as_FloorDivOptions() const { return builtin_options_type() == circle::BuiltinOptions_FloorDivOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SquareOptions *builtin_options_as_SquareOptions() const { return builtin_options_type() == circle::BuiltinOptions_SquareOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ZerosLikeOptions *builtin_options_as_ZerosLikeOptions() const { return builtin_options_type() == circle::BuiltinOptions_ZerosLikeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::FillOptions *builtin_options_as_FillOptions() const { return builtin_options_type() == circle::BuiltinOptions_FillOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::BidirectionalSequenceLSTMOptions * builtin_options_as_BidirectionalSequenceLSTMOptions() const { return builtin_options_type() == circle::BuiltinOptions_BidirectionalSequenceLSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::BidirectionalSequenceRNNOptions * builtin_options_as_BidirectionalSequenceRNNOptions() const { return builtin_options_type() == circle::BuiltinOptions_BidirectionalSequenceRNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::UnidirectionalSequenceLSTMOptions * builtin_options_as_UnidirectionalSequenceLSTMOptions() const { return builtin_options_type() == circle::BuiltinOptions_UnidirectionalSequenceLSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::FloorModOptions *builtin_options_as_FloorModOptions() const { return builtin_options_type() == circle::BuiltinOptions_FloorModOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::RangeOptions *builtin_options_as_RangeOptions() const { return builtin_options_type() == circle::BuiltinOptions_RangeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ResizeNearestNeighborOptions * builtin_options_as_ResizeNearestNeighborOptions() const { return builtin_options_type() == circle::BuiltinOptions_ResizeNearestNeighborOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::LeakyReluOptions *builtin_options_as_LeakyReluOptions() const { return builtin_options_type() == circle::BuiltinOptions_LeakyReluOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const { return builtin_options_type() == circle::BuiltinOptions_SquaredDifferenceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::MirrorPadOptions *builtin_options_as_MirrorPadOptions() const { return builtin_options_type() == circle::BuiltinOptions_MirrorPadOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::AbsOptions *builtin_options_as_AbsOptions() const { return builtin_options_type() == circle::BuiltinOptions_AbsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SplitVOptions *builtin_options_as_SplitVOptions() const { return builtin_options_type() == circle::BuiltinOptions_SplitVOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::UniqueOptions *builtin_options_as_UniqueOptions() const { return builtin_options_type() == circle::BuiltinOptions_UniqueOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ReverseV2Options *builtin_options_as_ReverseV2Options() const { return builtin_options_type() == circle::BuiltinOptions_ReverseV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::AddNOptions *builtin_options_as_AddNOptions() const { return builtin_options_type() == circle::BuiltinOptions_AddNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::GatherNdOptions *builtin_options_as_GatherNdOptions() const { return builtin_options_type() == circle::BuiltinOptions_GatherNdOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::CosOptions *builtin_options_as_CosOptions() const { return builtin_options_type() == circle::BuiltinOptions_CosOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::WhereOptions *builtin_options_as_WhereOptions() const { return builtin_options_type() == circle::BuiltinOptions_WhereOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::RankOptions *builtin_options_as_RankOptions() const { return builtin_options_type() == circle::BuiltinOptions_RankOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ReverseSequenceOptions *builtin_options_as_ReverseSequenceOptions() const { return builtin_options_type() == circle::BuiltinOptions_ReverseSequenceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::MatrixDiagOptions *builtin_options_as_MatrixDiagOptions() const { return builtin_options_type() == circle::BuiltinOptions_MatrixDiagOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::QuantizeOptions *builtin_options_as_QuantizeOptions() const { return builtin_options_type() == circle::BuiltinOptions_QuantizeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::MatrixSetDiagOptions *builtin_options_as_MatrixSetDiagOptions() const { return builtin_options_type() == circle::BuiltinOptions_MatrixSetDiagOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::HardSwishOptions *builtin_options_as_HardSwishOptions() const { return builtin_options_type() == circle::BuiltinOptions_HardSwishOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::IfOptions *builtin_options_as_IfOptions() const { return builtin_options_type() == circle::BuiltinOptions_IfOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::WhileOptions *builtin_options_as_WhileOptions() const { return builtin_options_type() == circle::BuiltinOptions_WhileOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::DepthToSpaceOptions *builtin_options_as_DepthToSpaceOptions() const { return builtin_options_type() == circle::BuiltinOptions_DepthToSpaceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::NonMaxSuppressionV4Options *builtin_options_as_NonMaxSuppressionV4Options() const { return builtin_options_type() == circle::BuiltinOptions_NonMaxSuppressionV4Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::NonMaxSuppressionV5Options *builtin_options_as_NonMaxSuppressionV5Options() const { return builtin_options_type() == circle::BuiltinOptions_NonMaxSuppressionV5Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::ScatterNdOptions *builtin_options_as_ScatterNdOptions() const { return builtin_options_type() == circle::BuiltinOptions_ScatterNdOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SelectV2Options *builtin_options_as_SelectV2Options() const { return builtin_options_type() == circle::BuiltinOptions_SelectV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::DensifyOptions *builtin_options_as_DensifyOptions() const { return builtin_options_type() == circle::BuiltinOptions_DensifyOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::SegmentSumOptions *builtin_options_as_SegmentSumOptions() const { return builtin_options_type() == circle::BuiltinOptions_SegmentSumOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::BatchMatMulOptions *builtin_options_as_BatchMatMulOptions() const { return builtin_options_type() == circle::BuiltinOptions_BatchMatMulOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::BCQGatherOptions *builtin_options_as_BCQGatherOptions() const { return builtin_options_type() == circle::BuiltinOptions_BCQGatherOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const circle::BCQFullyConnectedOptions *builtin_options_as_BCQFullyConnectedOptions() const { return builtin_options_type() == circle::BuiltinOptions_BCQFullyConnectedOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const circle::InstanceNormOptions *builtin_options_as_InstanceNormOptions() const { return builtin_options_type() == circle::BuiltinOptions_InstanceNormOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const flatbuffers::Vector *custom_options() const { @@ -9558,7 +9554,7 @@ struct OperatorBuilder static_cast(custom_options_format), 0); } void add_mutating_variable_inputs( - flatbuffers::Offset> mutating_variable_inputs) + flatbuffers::Offset> mutating_variable_inputs) { fbb_.AddOffset(Operator::VT_MUTATING_VARIABLE_INPUTS, mutating_variable_inputs); } @@ -9580,15 +9576,15 @@ struct OperatorBuilder }; inline flatbuffers::Offset CreateOperator( - flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0, - flatbuffers::Offset> inputs = 0, - flatbuffers::Offset> outputs = 0, - circle::BuiltinOptions builtin_options_type = circle::BuiltinOptions_NONE, - flatbuffers::Offset builtin_options = 0, - flatbuffers::Offset> custom_options = 0, - circle::CustomOptionsFormat custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS, - flatbuffers::Offset> mutating_variable_inputs = 0, - flatbuffers::Offset> intermediates = 0) + flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0, + flatbuffers::Offset> inputs = 0, + flatbuffers::Offset> outputs = 0, + circle::BuiltinOptions builtin_options_type = circle::BuiltinOptions_NONE, + flatbuffers::Offset builtin_options = 0, + flatbuffers::Offset> custom_options = 0, + circle::CustomOptionsFormat custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS, + flatbuffers::Offset> mutating_variable_inputs = 0, + flatbuffers::Offset> intermediates = 0) { OperatorBuilder builder_(_fbb); builder_.add_intermediates(intermediates); @@ -9604,20 +9600,20 @@ inline flatbuffers::Offset CreateOperator( } inline flatbuffers::Offset CreateOperatorDirect( - flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0, - const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, - circle::BuiltinOptions builtin_options_type = circle::BuiltinOptions_NONE, - flatbuffers::Offset builtin_options = 0, - const std::vector *custom_options = nullptr, - circle::CustomOptionsFormat custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS, - const std::vector *mutating_variable_inputs = nullptr, - const std::vector *intermediates = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0, + const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, + circle::BuiltinOptions builtin_options_type = circle::BuiltinOptions_NONE, + flatbuffers::Offset builtin_options = 0, + const std::vector *custom_options = nullptr, + circle::CustomOptionsFormat custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS, + const std::vector *mutating_variable_inputs = nullptr, + const std::vector *intermediates = nullptr) { auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; auto custom_options__ = custom_options ? _fbb.CreateVector(*custom_options) : 0; auto mutating_variable_inputs__ = - mutating_variable_inputs ? 
_fbb.CreateVector(*mutating_variable_inputs) : 0; + mutating_variable_inputs ? _fbb.CreateVector(*mutating_variable_inputs) : 0; auto intermediates__ = intermediates ? _fbb.CreateVector(*intermediates) : 0; return circle::CreateOperator(_fbb, opcode_index, inputs__, outputs__, builtin_options_type, builtin_options, custom_options__, custom_options_format, @@ -9651,7 +9647,7 @@ struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *operators() const { return GetPointer> *>( - VT_OPERATORS); + VT_OPERATORS); } const flatbuffers::String *name() const { @@ -9693,7 +9689,7 @@ struct SubGraphBuilder fbb_.AddOffset(SubGraph::VT_OUTPUTS, outputs); } void add_operators( - flatbuffers::Offset>> operators) + flatbuffers::Offset>> operators) { fbb_.AddOffset(SubGraph::VT_OPERATORS, operators); } @@ -9719,13 +9715,13 @@ struct SubGraphBuilder }; inline flatbuffers::Offset CreateSubGraph( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> tensors = 0, - flatbuffers::Offset> inputs = 0, - flatbuffers::Offset> outputs = 0, - flatbuffers::Offset>> operators = 0, - flatbuffers::Offset name = 0, - circle::DataFormat data_format = circle::DataFormat_CHANNELS_LAST) + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> tensors = 0, + flatbuffers::Offset> inputs = 0, + flatbuffers::Offset> outputs = 0, + flatbuffers::Offset>> operators = 0, + flatbuffers::Offset name = 0, + circle::DataFormat data_format = circle::DataFormat_CHANNELS_LAST) { SubGraphBuilder builder_(_fbb); builder_.add_name(name); @@ -9738,17 +9734,17 @@ inline flatbuffers::Offset CreateSubGraph( } inline flatbuffers::Offset CreateSubGraphDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *tensors = nullptr, - const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, - const std::vector> *operators = nullptr, - const char *name = nullptr, circle::DataFormat data_format = circle::DataFormat_CHANNELS_LAST) + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *tensors = nullptr, + const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, + const std::vector> *operators = nullptr, + const char *name = nullptr, circle::DataFormat data_format = circle::DataFormat_CHANNELS_LAST) { auto tensors__ = tensors ? _fbb.CreateVector>(*tensors) : 0; auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; auto operators__ = - operators ? _fbb.CreateVector>(*operators) : 0; + operators ? _fbb.CreateVector>(*operators) : 0; auto name__ = name ? 
_fbb.CreateString(name) : 0; return circle::CreateSubGraph(_fbb, tensors__, inputs__, outputs__, operators__, name__, data_format); @@ -9893,12 +9889,12 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *operator_codes() const { return GetPointer> *>( - VT_OPERATOR_CODES); + VT_OPERATOR_CODES); } const flatbuffers::Vector> *subgraphs() const { return GetPointer> *>( - VT_SUBGRAPHS); + VT_SUBGRAPHS); } const flatbuffers::String *description() const { @@ -9915,7 +9911,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *metadata() const { return GetPointer> *>( - VT_METADATA); + VT_METADATA); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -9939,13 +9935,13 @@ struct ModelBuilder flatbuffers::uoffset_t start_; void add_version(uint32_t version) { fbb_.AddElement(Model::VT_VERSION, version, 0); } void add_operator_codes( - flatbuffers::Offset>> - operator_codes) + flatbuffers::Offset>> + operator_codes) { fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes); } void add_subgraphs( - flatbuffers::Offset>> subgraphs) + flatbuffers::Offset>> subgraphs) { fbb_.AddOffset(Model::VT_SUBGRAPHS, subgraphs); } @@ -9963,7 +9959,7 @@ struct ModelBuilder fbb_.AddOffset(Model::VT_METADATA_BUFFER, metadata_buffer); } void add_metadata( - flatbuffers::Offset>> metadata) + flatbuffers::Offset>> metadata) { fbb_.AddOffset(Model::VT_METADATA, metadata); } @@ -9981,14 +9977,14 @@ struct ModelBuilder }; inline flatbuffers::Offset CreateModel( - flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, - flatbuffers::Offset>> - operator_codes = 0, - flatbuffers::Offset>> subgraphs = 0, - flatbuffers::Offset description = 0, - flatbuffers::Offset>> buffers = 0, - flatbuffers::Offset> metadata_buffer = 0, - flatbuffers::Offset>> metadata = 0) + flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, + flatbuffers::Offset>> + operator_codes = 0, + flatbuffers::Offset>> subgraphs = 0, + flatbuffers::Offset description = 0, + flatbuffers::Offset>> buffers = 0, + flatbuffers::Offset> metadata_buffer = 0, + flatbuffers::Offset>> metadata = 0) { ModelBuilder builder_(_fbb); builder_.add_metadata(metadata); @@ -10002,24 +9998,24 @@ inline flatbuffers::Offset CreateModel( } inline flatbuffers::Offset CreateModelDirect( - flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, - const std::vector> *operator_codes = nullptr, - const std::vector> *subgraphs = nullptr, - const char *description = nullptr, - const std::vector> *buffers = nullptr, - const std::vector *metadata_buffer = nullptr, - const std::vector> *metadata = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, + const std::vector> *operator_codes = nullptr, + const std::vector> *subgraphs = nullptr, + const char *description = nullptr, + const std::vector> *buffers = nullptr, + const std::vector *metadata_buffer = nullptr, + const std::vector> *metadata = nullptr) { auto operator_codes__ = - operator_codes ? _fbb.CreateVector>(*operator_codes) - : 0; + operator_codes ? _fbb.CreateVector>(*operator_codes) + : 0; auto subgraphs__ = - subgraphs ? _fbb.CreateVector>(*subgraphs) : 0; + subgraphs ? _fbb.CreateVector>(*subgraphs) : 0; auto description__ = description ? _fbb.CreateString(description) : 0; auto buffers__ = buffers ? _fbb.CreateVector>(*buffers) : 0; auto metadata_buffer__ = metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0; auto metadata__ = - metadata ? _fbb.CreateVector>(*metadata) : 0; + metadata ? 
_fbb.CreateVector>(*metadata) : 0; return circle::CreateModel(_fbb, version, operator_codes__, subgraphs__, description__, buffers__, metadata_buffer__, metadata__); } diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc index 81cd38f..63036a3 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc @@ -20,7 +20,9 @@ // TODO Support multiple subgraphs ANeuralNetworksCompilation::ANeuralNetworksCompilation(const ANeuralNetworksModel *model) noexcept - : _subgraphs{model->getSubGraphs()}, _compiler{new onert::compiler::Compiler{_subgraphs}} + : _subgraphs{model->getSubGraphs()}, _tracing_ctx{std::make_unique( + _subgraphs.get())}, + _compiler{new onert::compiler::Compiler{_subgraphs, _tracing_ctx.get()}} { if (model->allowedToFp16()) { diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h index 5f0650b..bd61f9d 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h @@ -23,6 +23,7 @@ #include "ir/Graph.h" #include "ir/Subgraphs.h" #include "exec/IExecutor.h" +#include "util/TracingCtx.h" struct ANeuralNetworksCompilation { @@ -40,6 +41,14 @@ public: private: std::shared_ptr _subgraphs; + // TODO Refine the ownership of TracingCtx + // In case of nnfw API, nnfw_session has ownership of TracingCtx. + // In case of nnapi, there is no concept of session and primary model might have the ownership + // of TracingCtx. + // Since we don't support multiple models yet with nnapi in ONE, let's implement this later + // and let's make it work with one model for now. + std::unique_ptr _tracing_ctx; + std::shared_ptr _compiler; std::shared_ptr _executors; }; diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksEvent.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksEvent.cc index 2bea729..b0ea519 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksEvent.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksEvent.cc @@ -20,7 +20,7 @@ #include "util/logging.h" ANeuralNetworksEvent::ANeuralNetworksEvent(const std::shared_ptr &execution) - : _execution{execution} + : _execution{execution} { // DO NOTHING } diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc index 6114b74..21c7cdd 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc @@ -140,8 +140,8 @@ bool ANeuralNetworksExecution::setInput(uint32_t index, const ANeuralNetworksOpe const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo(); const auto shape = (type != nullptr) - ? NNAPIConvert::getShape(type) - : _execution->primary_subgraph().operands().at(operand_index).shape(); + ? NNAPIConvert::getShape(type) + : _execution->primary_subgraph().operands().at(operand_index).shape(); // NOTE The nnapi does not provide setting io_layout and not support changing layout. 
In other // words, we can assume that io_layout from nnapi always is the same as layout of the used @@ -173,8 +173,8 @@ bool ANeuralNetworksExecution::setOptionalInput(uint32_t index, const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo(); const auto shape = (type != nullptr) - ? NNAPIConvert::getShape(type) - : _execution->primary_subgraph().operands().at(operand_index).shape(); + ? NNAPIConvert::getShape(type) + : _execution->primary_subgraph().operands().at(operand_index).shape(); // ANeuralNetworksExecution::setInput() uses only shape information ANeuralNetworksOperandType optional_input_type; @@ -208,8 +208,8 @@ bool ANeuralNetworksExecution::setOutput(uint32_t index, const ANeuralNetworksOp const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo(); const auto shape = (type != nullptr) - ? NNAPIConvert::getShape(type) - : _execution->primary_subgraph().operands().at(operand_index).shape(); + ? NNAPIConvert::getShape(type) + : _execution->primary_subgraph().operands().at(operand_index).shape(); // NOTE The nnapi does not provide setting io_layout and not support changing layout. In other // words, we can assume that io_layout from nnapi always is the same as layout of the used diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h index 1f4b868..70c5d2a 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h @@ -27,7 +27,7 @@ struct ANeuralNetworksExecution { public: ANeuralNetworksExecution(const std::shared_ptr &executors) - : _execution{std::make_shared(executors)} + : _execution{std::make_shared(executors)} { // DO NOTHING } diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc index 97b820a..3e2bea1 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc @@ -27,7 +27,7 @@ // ANeuralNetworksModel // ANeuralNetworksModel::ANeuralNetworksModel() noexcept - : _optional_operands{}, _operand_usages{}, _allowFloat32toFloat16{false} + : _optional_operands{}, _operand_usages{}, _allowFloat32toFloat16{false} { _graph = std::make_shared(); } @@ -72,12 +72,12 @@ bool ANeuralNetworksModel::setOperandValue(uint32_t index, const void *buffer, s if (copy) { _graph->operands().at(ind).data( - std::make_unique(reinterpret_cast(buffer), length)); + std::make_unique(reinterpret_cast(buffer), length)); } else { _graph->operands().at(ind).data( - std::make_unique(reinterpret_cast(buffer), length)); + std::make_unique(reinterpret_cast(buffer), length)); } } catch (const std::exception &e) @@ -111,9 +111,9 @@ bool ANeuralNetworksModel::addOperation(ANeuralNetworksOperationType type, uint3 if (type == ANEURALNETWORKS_FULLY_CONNECTED) { const auto &input_operand = - _graph->operands().at(node->getInputs().at(onert::ir::operation::FullyConnected::INPUT)); + _graph->operands().at(node->getInputs().at(onert::ir::operation::FullyConnected::INPUT)); auto &weights_operand = - _graph->operands().at(node->getInputs().at(onert::ir::operation::FullyConnected::WEIGHT)); + _graph->operands().at(node->getInputs().at(onert::ir::operation::FullyConnected::WEIGHT)); if (input_operand.typeInfo().type() == onert::ir::DataType::FLOAT32 && weights_operand.typeInfo().type() == 
onert::ir::DataType::QUANT_UINT8_ASYMM) { diff --git a/runtime/onert/frontend/nnapi/wrapper/NNAPIConvert.cc b/runtime/onert/frontend/nnapi/wrapper/NNAPIConvert.cc index 63d4e3c..94b8f02 100644 --- a/runtime/onert/frontend/nnapi/wrapper/NNAPIConvert.cc +++ b/runtime/onert/frontend/nnapi/wrapper/NNAPIConvert.cc @@ -39,6 +39,13 @@ DataType NNAPIConvert::getDataType(OperandCode type) case ANEURALNETWORKS_BOOL: case ANEURALNETWORKS_TENSOR_BOOL8: return DataType::BOOL8; + case ANEURALNETWORKS_TENSOR_FLOAT16: + case ANEURALNETWORKS_FLOAT16: + return DataType::FLOAT16; + case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL: + return DataType::QUANT_INT8_SYMM_PER_CHANNEL; + case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED: + return DataType::QUANT_INT8_ASYMM; default: throw std::runtime_error("Unsupported type"); } diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc index a84ce1b..9ecb7d1 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc @@ -107,7 +107,7 @@ getElementwiseActivationGenerator(const onert::ir::operation::ElementwiseActivat } OperationFactory::Generator getElementwiseBinaryGenerator( - const onert::ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) + const onert::ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) { return [op_type](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2); @@ -182,7 +182,7 @@ getBinaryArithmeticGenerator(const onert::ir::operation::BinaryArithmetic::Arith param.arithmetic_type = op_type; const auto activation_index = OperandIndex{init_param.inputs[2]}; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); return new operation::BinaryArithmetic{inputs, outputs, param}; }; @@ -221,12 +221,12 @@ getPool2DGenerator(const onert::ir::operation::Pool2D::PoolType pool_type) const auto activation_index = OperandIndex{init_param.inputs[6]}; param.padding.type = - NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); + NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); param.stride = makeStride(operands, hstride_index, vstride_index); param.kw = getUint32Scalar(operands, kw_index); param.kh = operands.at(kh_index).asScalar(); param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } else // support explicit padding { @@ -259,7 +259,7 @@ getPool2DGenerator(const onert::ir::operation::Pool2D::PoolType pool_type) param.kw = getUint32Scalar(operands, kw_index); param.kh = getUint32Scalar(operands, kh_index); param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } return new operation::Pool2D{inputs, outputs, param}; }; @@ -382,11 +382,11 @@ OperationFactory::OperationFactory() const auto activation_index = OperandIndex{init_param.inputs[7]}; param.padding.type = - NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); + NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); param.stride = makeStride(operands, hstride_index, vstride_index); param.multiplier = getUint32Scalar(operands, multiplier_index); param.activation = - 
NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } else { @@ -417,7 +417,7 @@ OperationFactory::OperationFactory() param.stride = makeStride(operands, hstride_index, vstride_index); param.multiplier = getUint32Scalar(operands, multiplier_index); param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } // TODO set dilation @@ -490,7 +490,7 @@ OperationFactory::OperationFactory() operation::FullyConnected::Param param; const auto activation_index = OperandIndex{init_param.inputs[3]}; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); param.weights_format = FullyConnectedWeightsFormat::Default; return new operation::FullyConnected{inputs, outputs, param}; @@ -517,7 +517,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_CAST] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::CAST); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::CAST); // ANEURALNETWORKS_CAST_EX is deprecated // TODO Remove ANEURALNETWORKS_CAST_EX @@ -557,14 +557,14 @@ OperationFactory::OperationFactory() const auto activation_index = OperandIndex{init_param.inputs[6]}; param.padding.type = - NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); + NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); param.stride = makeStride(operands, hstride_index, vstride_index); param.dilation.width_factor = 1; param.dilation.height_factor = 1; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } else if (init_param.input_count == 10) // support explicit padding { @@ -595,7 +595,7 @@ OperationFactory::OperationFactory() param.dilation.height_factor = 1; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } else if (init_param.input_count == 13) // support dilation { @@ -633,7 +633,7 @@ OperationFactory::OperationFactory() param.dilation.height_factor = height_factor; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); } else { @@ -644,19 +644,19 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_ADD] = - getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::ADD); + getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::ADD); _map[ANEURALNETWORKS_ADDV2_EX] = _map[ANEURALNETWORKS_ADD]; _map[ANEURALNETWORKS_REDUCE_SUM] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::SUM); + getReduceGenerator(onert::ir::operation::Reduce::ReduceType::SUM); // ANEURALNETWORKS_REDUCE_SUM_EX is deprecated // TODO Remove ANEURALNETWORKS_REDUCE_SUM_EX _map[ANEURALNETWORKS_REDUCE_SUM_EX] = _map[ANEURALNETWORKS_REDUCE_SUM]; _map[ANEURALNETWORKS_SUB] = - getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::SUB); + getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::SUB); _map[ANEURALNETWORKS_SLICE] = [](const 
OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -708,7 +708,7 @@ OperationFactory::OperationFactory() param.begin_mask = operands.at(OperandIndex{init_param.inputs[4]}).asScalar(); param.end_mask = operands.at(OperandIndex{init_param.inputs[5]}).asScalar(); param.shrink_axis_mask = - operands.at(OperandIndex{init_param.inputs[6]}).asScalar(); + operands.at(OperandIndex{init_param.inputs[6]}).asScalar(); return new operation::StridedSlice{inputs, outputs, param}; }; @@ -716,7 +716,7 @@ OperationFactory::OperationFactory() _map[ANEURALNETWORKS_TRANSPOSE] = createSimpleBinaryOp; _map[ANEURALNETWORKS_MUL] = - getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::MUL); + getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::MUL); _map[ANEURALNETWORKS_SQUEEZE] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -758,15 +758,15 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_TANH] = getElementwiseActivationGenerator( - onert::ir::operation::ElementwiseActivation::Type::TANH, 1.f, 1.f); + onert::ir::operation::ElementwiseActivation::Type::TANH, 1.f, 1.f); _map[ANEURALNETWORKS_LOG] = getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::LOG); - _map[ANEURALNETWORKS_LOGISTIC] = getElementwiseActivationGenerator( - onert::ir::operation::ElementwiseActivation::Type::LOGISTIC); + _map[ANEURALNETWORKS_LOGISTIC] = + getElementwiseActivationGenerator(onert::ir::operation::ElementwiseActivation::Type::LOGISTIC); _map[ANEURALNETWORKS_DIV] = - getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::DIV); + getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::DIV); _map[ANEURALNETWORKS_EXP] = getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::EXP); @@ -780,16 +780,16 @@ OperationFactory::OperationFactory() _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp; _map[ANEURALNETWORKS_GREATER] = - getComparisonGenerator(operation::Comparison::ComparisonType::Greater); + getComparisonGenerator(operation::Comparison::ComparisonType::Greater); _map[ANEURALNETWORKS_GREATER_EQUAL] = - getComparisonGenerator(operation::Comparison::ComparisonType::GreaterEqual); + getComparisonGenerator(operation::Comparison::ComparisonType::GreaterEqual); _map[ANEURALNETWORKS_LESS] = getComparisonGenerator(operation::Comparison::ComparisonType::Less); _map[ANEURALNETWORKS_LESS_EQUAL] = - getComparisonGenerator(operation::Comparison::ComparisonType::LessEqual); + getComparisonGenerator(operation::Comparison::ComparisonType::LessEqual); _map[ANEURALNETWORKS_NOT_EQUAL] = - getComparisonGenerator(operation::Comparison::ComparisonType::NotEqual); + getComparisonGenerator(operation::Comparison::ComparisonType::NotEqual); _map[ANEURALNETWORKS_EQUAL] = - getComparisonGenerator(operation::Comparison::ComparisonType::Equal); + getComparisonGenerator(operation::Comparison::ComparisonType::Equal); // ANEURALNETWORKS_GREATER_EQUAL_EX is deprecated // TODO Remove ANEURALNETWORKS_GREATER_EQUAL_EX @@ -838,13 +838,13 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_REDUCE_ALL] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::ALL); + getReduceGenerator(onert::ir::operation::Reduce::ReduceType::ALL); _map[ANEURALNETWORKS_REDUCE_ANY] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::ANY); + 
getReduceGenerator(onert::ir::operation::Reduce::ReduceType::ANY); _map[ANEURALNETWORKS_REDUCE_MAX] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::MAX); + getReduceGenerator(onert::ir::operation::Reduce::ReduceType::MAX); // ANEURALNETWORKS_REDUCE_MAX_EX is deprecated // TODO Remove ANEURALNETWORKS_REDUCE_MAX_EX @@ -873,8 +873,8 @@ OperationFactory::OperationFactory() return new operation::Comparison{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_LOGICAL_AND] = getElementwiseBinaryGenerator( - operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND); + _map[ANEURALNETWORKS_LOGICAL_AND] = + getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND); // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX @@ -902,7 +902,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_RSQRT] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::RSQRT); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::RSQRT); _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -939,8 +939,8 @@ OperationFactory::OperationFactory() _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT]; _map[ANEURALNETWORKS_RELU] = - getElementwiseActivationGenerator(onert::ir::operation::ElementwiseActivation::Type::RELU, - onert::ir::operation::ElementwiseActivation::infinity, 0); + getElementwiseActivationGenerator(onert::ir::operation::ElementwiseActivation::Type::RELU, + onert::ir::operation::ElementwiseActivation::infinity, 0); _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -986,10 +986,10 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_RELU1] = getElementwiseActivationGenerator( - onert::ir::operation::ElementwiseActivation::Type::RELU, 1.f, -1.f); + onert::ir::operation::ElementwiseActivation::Type::RELU, 1.f, -1.f); _map[ANEURALNETWORKS_RELU6] = getElementwiseActivationGenerator( - onert::ir::operation::ElementwiseActivation::Type::RELU, 6.f, 0.f); + onert::ir::operation::ElementwiseActivation::Type::RELU, 6.f, 0.f); _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); @@ -1031,13 +1031,13 @@ OperationFactory::OperationFactory() operation::RNN::Param param; const auto activation_index = OperandIndex{init_param.inputs[5]}; param.activation = - NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); + NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar()); return new operation::RNN{inputs, outputs, param}; }; _map[ANEURALNETWORKS_FLOOR] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::FLOOR); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::FLOOR); _map[ANEURALNETWORKS_SPACE_TO_BATCH_ND] = [](const OperationFactory::Param &init_param, Operands &) { @@ -1169,21 +1169,21 @@ OperationFactory::OperationFactory() const auto vstride_index = OperandIndex{init_param.inputs[5]}; param.padding.type = - NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); + NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar()); param.stride = makeStride(operands, hstride_index, vstride_index); return new operation::TransposeConv{inputs, outputs, param}; }; _map[ANEURALNETWORKS_SQRT] = - 
getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::SQRT); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::SQRT); // ANEURALNETWORKS_SQRT_EX is deprecated // TODO Remove ANEURALNETWORKS_SQRT_EX _map[ANEURALNETWORKS_SQRT_EX] = _map[ANEURALNETWORKS_SQRT]; - _map[ANEURALNETWORKS_LOGICAL_OR] = getElementwiseBinaryGenerator( - operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR); + _map[ANEURALNETWORKS_LOGICAL_OR] = + getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR); // ANEURALNETWORKS_LOGICAL_OR_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_OR_EX @@ -1211,7 +1211,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_LOGICAL_NOT] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::LOGICAL_NOT); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::LOGICAL_NOT); // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX @@ -1370,9 +1370,9 @@ OperationFactory::OperationFactory() // 2 -> Cell State Out Tensor Index const OperandIndex scratch_buffer_index; OperandIndex output_state_index = - init_param.output_count >= 2 ? OperandIndex{init_param.outputs[1]} : OperandIndex(); + init_param.output_count >= 2 ? OperandIndex{init_param.outputs[1]} : OperandIndex(); OperandIndex cell_state_index = - init_param.output_count >= 3 ? OperandIndex{init_param.outputs[2]} : OperandIndex(); + init_param.output_count >= 3 ? OperandIndex{init_param.outputs[2]} : OperandIndex(); const OperandIndex output_index = OperandIndex{init_param.outputs[0]}; OperandIndexSequence outputs{scratch_buffer_index, output_state_index, cell_state_index, output_index}; @@ -1519,19 +1519,39 @@ OperationFactory::OperationFactory() // 1 -> Axis Tensor Index OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - operation::ArgMax::Param param; + operation::ArgMinMax::Param param; // NNAPI ARGMAX output type is always int32 param.output_type = DataType::INT32; + param.is_arg_max = true; - return new operation::ArgMax{inputs, outputs, param}; + return new operation::ArgMinMax{inputs, outputs, param}; }; // ANEURALNETWORKS_ARGMAX_EX is deprecated // TODO Remove ANEURALNETWORKS_ARGMAX_EX _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX]; + _map[ANEURALNETWORKS_ARGMIN] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 2 && init_param.output_count == 1); + + OperandIndexSequence outputs{init_param.outputs[0]}; + + // Each input should be interpreted as follows: + // + // 0 -> Input Tensor Index + // 1 -> Axis Tensor Index + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; + + operation::ArgMinMax::Param param; + // NNAPI ARGMIN output type is always int32 + param.output_type = DataType::INT32; + param.is_arg_max = false; + + return new operation::ArgMinMax{inputs, outputs, param}; + }; + _map[ANEURALNETWORKS_DEQUANTIZE] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::DEQUANTIZE); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::DEQUANTIZE); _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1608,7 +1628,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_REDUCE_MIN] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::MIN); + getReduceGenerator(onert::ir::operation::Reduce::ReduceType::MIN); 
// ANEURALNETWORKS_REDUCE_MIN_EX is deprecated // TODO Remove ANEURALNETWORKS_REDUCE_MIN_EX @@ -1689,10 +1709,10 @@ OperationFactory::OperationFactory() _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD]; _map[ANEURALNETWORKS_MINIMUM] = - getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::MIN); + getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::MIN); _map[ANEURALNETWORKS_MAXIMUM] = - getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::MAX); + getElementwiseBinaryGenerator(operation::ElementwiseBinary::ElementwiseBinaryType::MAX); _map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -1719,7 +1739,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_COS_EX] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::COS); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::COS); _map[ANEURALNETWORKS_SIN] = getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::SIN); @@ -1733,10 +1753,10 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_REDUCE_PROD] = - getReduceGenerator(onert::ir::operation::Reduce::ReduceType::PROD); + getReduceGenerator(onert::ir::operation::Reduce::ReduceType::PROD); _map[ANEURALNETWORKS_ROUND_EX] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::ROUND); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::ROUND); _map[ANEURALNETWORKS_RANGE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1764,7 +1784,7 @@ OperationFactory::OperationFactory() _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp; _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::ZEROS_LIKE); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::ZEROS_LIKE); // Each input should be interpreted as follows: // 0 -> Input Tensor Index // 1 -> Multiple Tensor Index @@ -1904,7 +1924,7 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_QUANTIZE] = - getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::QUANTIZE); + getElementwiseUnaryGenerator(operation::ElementwiseUnary::Type::QUANTIZE); } Operation *OperationFactory::create(ANeuralNetworksOperationType type, diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.h b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.h index 367cf74..74e1874 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.h +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.h @@ -40,7 +40,7 @@ public: public: using Generator = - std::function; + std::function; public: static OperationFactory &get(); diff --git a/runtime/onert/frontend/tflite/src/tflite_schema_generated.h b/runtime/onert/frontend/tflite/src/tflite_schema_generated.h index c6e9147..8e1b84e 100644 --- a/runtime/onert/frontend/tflite/src/tflite_schema_generated.h +++ b/runtime/onert/frontend/tflite/src/tflite_schema_generated.h @@ -1710,9 +1710,8 @@ enum ActivationFunctionType inline const ActivationFunctionType (&EnumValuesActivationFunctionType())[6] { static const ActivationFunctionType values[] = { - ActivationFunctionType_NONE, ActivationFunctionType_RELU, - ActivationFunctionType_RELU_N1_TO_1, ActivationFunctionType_RELU6, - ActivationFunctionType_TANH, ActivationFunctionType_SIGN_BIT}; + ActivationFunctionType_NONE, ActivationFunctionType_RELU, 
ActivationFunctionType_RELU_N1_TO_1, + ActivationFunctionType_RELU6, ActivationFunctionType_TANH, ActivationFunctionType_SIGN_BIT}; return values; } @@ -1768,8 +1767,8 @@ enum FullyConnectedOptionsWeightsFormat inline const FullyConnectedOptionsWeightsFormat (&EnumValuesFullyConnectedOptionsWeightsFormat())[2] { static const FullyConnectedOptionsWeightsFormat values[] = { - FullyConnectedOptionsWeightsFormat_DEFAULT, - FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8}; + FullyConnectedOptionsWeightsFormat_DEFAULT, + FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8}; return values; } @@ -1981,8 +1980,8 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab const CustomQuantization *details_as_CustomQuantization() const { return details_type() == QuantizationDetails_CustomQuantization - ? static_cast(details()) - : nullptr; + ? static_cast(details()) + : nullptr; } int32_t quantized_dimension() const { return GetField(VT_QUANTIZED_DIMENSION, 0); } bool Verify(flatbuffers::Verifier &verifier) const @@ -2072,17 +2071,17 @@ CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, } inline flatbuffers::Offset CreateQuantizationParametersDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *min = nullptr, - const std::vector *max = nullptr, const std::vector *scale = nullptr, - const std::vector *zero_point = nullptr, - QuantizationDetails details_type = QuantizationDetails_NONE, - flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *min = nullptr, + const std::vector *max = nullptr, const std::vector *scale = nullptr, + const std::vector *zero_point = nullptr, + QuantizationDetails details_type = QuantizationDetails_NONE, + flatbuffers::Offset details = 0, int32_t quantized_dimension = 0) { return onert_tflite::CreateQuantizationParameters( - _fbb, min ? _fbb.CreateVector(*min) : 0, max ? _fbb.CreateVector(*max) : 0, - scale ? _fbb.CreateVector(*scale) : 0, - zero_point ? _fbb.CreateVector(*zero_point) : 0, details_type, details, - quantized_dimension); + _fbb, min ? _fbb.CreateVector(*min) : 0, max ? _fbb.CreateVector(*max) : 0, + scale ? _fbb.CreateVector(*scale) : 0, + zero_point ? _fbb.CreateVector(*zero_point) : 0, details_type, details, + quantized_dimension); } struct Int32Vector FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -2272,20 +2271,20 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const Int32Vector *array_segments_as_Int32Vector() const { return array_segments_type() == SparseIndexVector_Int32Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } const Uint16Vector *array_segments_as_Uint16Vector() const { return array_segments_type() == SparseIndexVector_Uint16Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } const Uint8Vector *array_segments_as_Uint8Vector() const { return array_segments_type() == SparseIndexVector_Uint8Vector - ? static_cast(array_segments()) - : nullptr; + ? static_cast(array_segments()) + : nullptr; } SparseIndexVector array_indices_type() const { @@ -2296,20 +2295,20 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const Int32Vector *array_indices_as_Int32Vector() const { return array_indices_type() == SparseIndexVector_Int32Vector - ? static_cast(array_indices()) - : nullptr; + ? 
static_cast(array_indices()) + : nullptr; } const Uint16Vector *array_indices_as_Uint16Vector() const { return array_indices_type() == SparseIndexVector_Uint16Vector - ? static_cast(array_indices()) - : nullptr; + ? static_cast(array_indices()) + : nullptr; } const Uint8Vector *array_indices_as_Uint8Vector() const { return array_indices_type() == SparseIndexVector_Uint8Vector - ? static_cast(array_indices()) - : nullptr; + ? static_cast(array_indices()) + : nullptr; } bool Verify(flatbuffers::Verifier &verifier) const { @@ -2435,7 +2434,7 @@ struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *dim_metadata() const { return GetPointer> *>( - VT_DIM_METADATA); + VT_DIM_METADATA); } bool Verify(flatbuffers::Verifier &verifier) const { @@ -2460,7 +2459,7 @@ struct SparsityParametersBuilder fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); } void add_dim_metadata( - flatbuffers::Offset>> dim_metadata) + flatbuffers::Offset>> dim_metadata) { fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); } @@ -2478,11 +2477,10 @@ struct SparsityParametersBuilder }; inline flatbuffers::Offset CreateSparsityParameters( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> traversal_order = 0, - flatbuffers::Offset> block_map = 0, - flatbuffers::Offset>> dim_metadata = - 0) + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> traversal_order = 0, + flatbuffers::Offset> block_map = 0, + flatbuffers::Offset>> dim_metadata = 0) { SparsityParametersBuilder builder_(_fbb); builder_.add_dim_metadata(dim_metadata); @@ -2492,14 +2490,14 @@ inline flatbuffers::Offset CreateSparsityParameters( } inline flatbuffers::Offset CreateSparsityParametersDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *traversal_order = nullptr, - const std::vector *block_map = nullptr, - const std::vector> *dim_metadata = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *traversal_order = nullptr, + const std::vector *block_map = nullptr, + const std::vector> *dim_metadata = nullptr) { return onert_tflite::CreateSparsityParameters( - _fbb, traversal_order ? _fbb.CreateVector(*traversal_order) : 0, - block_map ? _fbb.CreateVector(*block_map) : 0, - dim_metadata ? _fbb.CreateVector>(*dim_metadata) : 0); + _fbb, traversal_order ? _fbb.CreateVector(*traversal_order) : 0, + block_map ? _fbb.CreateVector(*block_map) : 0, + dim_metadata ? _fbb.CreateVector>(*dim_metadata) : 0); } struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -2619,16 +2617,16 @@ CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, } inline flatbuffers::Offset CreateTensorDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *shape = nullptr, - TensorType type = TensorType_FLOAT32, uint32_t buffer = 0, const char *name = nullptr, - flatbuffers::Offset quantization = 0, bool is_variable = false, - flatbuffers::Offset sparsity = 0, - const std::vector *shape_signature = nullptr) + flatbuffers::FlatBufferBuilder &_fbb, const std::vector *shape = nullptr, + TensorType type = TensorType_FLOAT32, uint32_t buffer = 0, const char *name = nullptr, + flatbuffers::Offset quantization = 0, bool is_variable = false, + flatbuffers::Offset sparsity = 0, + const std::vector *shape_signature = nullptr) { return onert_tflite::CreateTensor( - _fbb, shape ? _fbb.CreateVector(*shape) : 0, type, buffer, - name ? _fbb.CreateString(name) : 0, quantization, is_variable, sparsity, - shape_signature ? 
_fbb.CreateVector(*shape_signature) : 0); + _fbb, shape ? _fbb.CreateVector(*shape) : 0, type, buffer, + name ? _fbb.CreateString(name) : 0, quantization, is_variable, sparsity, + shape_signature ? _fbb.CreateVector(*shape_signature) : 0); } struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -2890,10 +2888,10 @@ struct DepthwiseConv2DOptionsBuilder }; inline flatbuffers::Offset CreateDepthwiseConv2DOptions( - flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME, int32_t stride_w = 0, - int32_t stride_h = 0, int32_t depth_multiplier = 0, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) + flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME, int32_t stride_w = 0, + int32_t stride_h = 0, int32_t depth_multiplier = 0, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + int32_t dilation_w_factor = 1, int32_t dilation_h_factor = 1) { DepthwiseConv2DOptionsBuilder builder_(_fbb); builder_.add_dilation_h_factor(dilation_h_factor); @@ -2942,12 +2940,12 @@ struct ConcatEmbeddingsOptionsBuilder fbb_.AddElement(ConcatEmbeddingsOptions::VT_NUM_CHANNELS, num_channels, 0); } void add_num_columns_per_channel( - flatbuffers::Offset> num_columns_per_channel) + flatbuffers::Offset> num_columns_per_channel) { fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL, num_columns_per_channel); } void add_embedding_dim_per_channel( - flatbuffers::Offset> embedding_dim_per_channel) + flatbuffers::Offset> embedding_dim_per_channel) { fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL, embedding_dim_per_channel); @@ -2966,9 +2964,9 @@ struct ConcatEmbeddingsOptionsBuilder }; inline flatbuffers::Offset CreateConcatEmbeddingsOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0, - flatbuffers::Offset> num_columns_per_channel = 0, - flatbuffers::Offset> embedding_dim_per_channel = 0) + flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0, + flatbuffers::Offset> num_columns_per_channel = 0, + flatbuffers::Offset> embedding_dim_per_channel = 0) { ConcatEmbeddingsOptionsBuilder builder_(_fbb); builder_.add_embedding_dim_per_channel(embedding_dim_per_channel); @@ -2983,9 +2981,9 @@ CreateConcatEmbeddingsOptionsDirect(flatbuffers::FlatBufferBuilder &_fbb, int32_ const std::vector *embedding_dim_per_channel = nullptr) { return onert_tflite::CreateConcatEmbeddingsOptions( - _fbb, num_channels, - num_columns_per_channel ? _fbb.CreateVector(*num_columns_per_channel) : 0, - embedding_dim_per_channel ? _fbb.CreateVector(*embedding_dim_per_channel) : 0); + _fbb, num_channels, + num_columns_per_channel ? _fbb.CreateVector(*num_columns_per_channel) : 0, + embedding_dim_per_channel ? 
_fbb.CreateVector(*embedding_dim_per_channel) : 0); } struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -3219,9 +3217,9 @@ struct SequenceRNNOptionsBuilder }; inline flatbuffers::Offset CreateSequenceRNNOptions( - flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { SequenceRNNOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3296,9 +3294,9 @@ struct BidirectionalSequenceRNNOptionsBuilder }; inline flatbuffers::Offset CreateBidirectionalSequenceRNNOptions( - flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - bool merge_outputs = false, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + bool merge_outputs = false, bool asymmetric_quantize_inputs = false) { BidirectionalSequenceRNNOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3378,10 +3376,10 @@ struct FullyConnectedOptionsBuilder }; inline flatbuffers::Offset CreateFullyConnectedOptions( - flatbuffers::FlatBufferBuilder &_fbb, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - FullyConnectedOptionsWeightsFormat weights_format = FullyConnectedOptionsWeightsFormat_DEFAULT, - bool keep_num_dims = false, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + FullyConnectedOptionsWeightsFormat weights_format = FullyConnectedOptionsWeightsFormat_DEFAULT, + bool keep_num_dims = false, bool asymmetric_quantize_inputs = false) { FullyConnectedOptionsBuilder builder_(_fbb); builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); @@ -3474,8 +3472,8 @@ struct ConcatenationOptionsBuilder }; inline flatbuffers::Offset CreateConcatenationOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) + flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) { ConcatenationOptionsBuilder builder_(_fbb); builder_.add_axis(axis); @@ -3669,7 +3667,7 @@ struct LocalResponseNormalizationOptionsBuilder fbb_.AddElement(LocalResponseNormalizationOptions::VT_BETA, beta, 0.0f); } explicit LocalResponseNormalizationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -3845,7 +3843,7 @@ struct UnidirectionalSequenceLSTMOptionsBuilder static_cast(asymmetric_quantize_inputs), 0); } explicit UnidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -3861,10 +3859,10 @@ struct UnidirectionalSequenceLSTMOptionsBuilder inline flatbuffers::Offset CreateUnidirectionalSequenceLSTMOptions( - flatbuffers::FlatBufferBuilder &_fbb, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - float 
cell_clip = 0.0f, float proj_clip = 0.0f, bool time_major = false, - bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + float cell_clip = 0.0f, float proj_clip = 0.0f, bool time_major = false, + bool asymmetric_quantize_inputs = false) { UnidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); builder_.add_proj_clip(proj_clip); @@ -3943,7 +3941,7 @@ struct BidirectionalSequenceLSTMOptionsBuilder static_cast(asymmetric_quantize_inputs), 0); } explicit BidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) + : fbb_(_fbb) { start_ = fbb_.StartTable(); } @@ -3958,10 +3956,10 @@ struct BidirectionalSequenceLSTMOptionsBuilder }; inline flatbuffers::Offset CreateBidirectionalSequenceLSTMOptions( - flatbuffers::FlatBufferBuilder &_fbb, - ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, - float cell_clip = 0.0f, float proj_clip = 0.0f, bool merge_outputs = false, - bool time_major = true, bool asymmetric_quantize_inputs = false) + flatbuffers::FlatBufferBuilder &_fbb, + ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE, + float cell_clip = 0.0f, float proj_clip = 0.0f, bool merge_outputs = false, + bool time_major = true, bool asymmetric_quantize_inputs = false) { BidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); builder_.add_proj_clip(proj_clip); @@ -4844,7 +4842,7 @@ CreateSqueezeOptionsDirect(flatbuffers::FlatBufferBuilder &_fbb, const std::vector *squeeze_dims = nullptr) { return onert_tflite::CreateSqueezeOptions( - _fbb, squeeze_dims ? _fbb.CreateVector(*squeeze_dims) : 0); + _fbb, squeeze_dims ? _fbb.CreateVector(*squeeze_dims) : 0); } struct SplitOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -7206,7 +7204,7 @@ CreateOperatorCodeDirect(flatbuffers::FlatBufferBuilder &_fbb, const char *custom_code = nullptr, int32_t version = 1) { return onert_tflite::CreateOperatorCode( - _fbb, builtin_code, custom_code ? _fbb.CreateString(custom_code) : 0, version); + _fbb, builtin_code, custom_code ? _fbb.CreateString(custom_code) : 0, version); } struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -7241,611 +7239,611 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const Conv2DOptions *builtin_options_as_Conv2DOptions() const { return builtin_options_type() == BuiltinOptions_Conv2DOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions() const { return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions() const { return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const { return builtin_options_type() == BuiltinOptions_LSHProjectionOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const Pool2DOptions *builtin_options_as_Pool2DOptions() const { return builtin_options_type() == BuiltinOptions_Pool2DOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const SVDFOptions *builtin_options_as_SVDFOptions() const { return builtin_options_type() == BuiltinOptions_SVDFOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const RNNOptions *builtin_options_as_RNNOptions() const { return builtin_options_type() == BuiltinOptions_RNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions() const { return builtin_options_type() == BuiltinOptions_FullyConnectedOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SoftmaxOptions *builtin_options_as_SoftmaxOptions() const { return builtin_options_type() == BuiltinOptions_SoftmaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ConcatenationOptions *builtin_options_as_ConcatenationOptions() const { return builtin_options_type() == BuiltinOptions_ConcatenationOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const AddOptions *builtin_options_as_AddOptions() const { return builtin_options_type() == BuiltinOptions_AddOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const L2NormOptions *builtin_options_as_L2NormOptions() const { return builtin_options_type() == BuiltinOptions_L2NormOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LocalResponseNormalizationOptions * builtin_options_as_LocalResponseNormalizationOptions() const { return builtin_options_type() == BuiltinOptions_LocalResponseNormalizationOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LSTMOptions *builtin_options_as_LSTMOptions() const { return builtin_options_type() == BuiltinOptions_LSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions() const { return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const CallOptions *builtin_options_as_CallOptions() const { return builtin_options_type() == BuiltinOptions_CallOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ReshapeOptions *builtin_options_as_ReshapeOptions() const { return builtin_options_type() == BuiltinOptions_ReshapeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SkipGramOptions *builtin_options_as_SkipGramOptions() const { return builtin_options_type() == BuiltinOptions_SkipGramOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const { return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const EmbeddingLookupSparseOptions *builtin_options_as_EmbeddingLookupSparseOptions() const { return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const MulOptions *builtin_options_as_MulOptions() const { return builtin_options_type() == BuiltinOptions_MulOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const PadOptions *builtin_options_as_PadOptions() const { return builtin_options_type() == BuiltinOptions_PadOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const GatherOptions *builtin_options_as_GatherOptions() const { return builtin_options_type() == BuiltinOptions_GatherOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const BatchToSpaceNDOptions *builtin_options_as_BatchToSpaceNDOptions() const { return builtin_options_type() == BuiltinOptions_BatchToSpaceNDOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SpaceToBatchNDOptions *builtin_options_as_SpaceToBatchNDOptions() const { return builtin_options_type() == BuiltinOptions_SpaceToBatchNDOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const TransposeOptions *builtin_options_as_TransposeOptions() const { return builtin_options_type() == BuiltinOptions_TransposeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ReducerOptions *builtin_options_as_ReducerOptions() const { return builtin_options_type() == BuiltinOptions_ReducerOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SubOptions *builtin_options_as_SubOptions() const { return builtin_options_type() == BuiltinOptions_SubOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const DivOptions *builtin_options_as_DivOptions() const { return builtin_options_type() == BuiltinOptions_DivOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SqueezeOptions *builtin_options_as_SqueezeOptions() const { return builtin_options_type() == BuiltinOptions_SqueezeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SequenceRNNOptions *builtin_options_as_SequenceRNNOptions() const { return builtin_options_type() == BuiltinOptions_SequenceRNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const StridedSliceOptions *builtin_options_as_StridedSliceOptions() const { return builtin_options_type() == BuiltinOptions_StridedSliceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ExpOptions *builtin_options_as_ExpOptions() const { return builtin_options_type() == BuiltinOptions_ExpOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const TopKV2Options *builtin_options_as_TopKV2Options() const { return builtin_options_type() == BuiltinOptions_TopKV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SplitOptions *builtin_options_as_SplitOptions() const { return builtin_options_type() == BuiltinOptions_SplitOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const LogSoftmaxOptions *builtin_options_as_LogSoftmaxOptions() const { return builtin_options_type() == BuiltinOptions_LogSoftmaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const CastOptions *builtin_options_as_CastOptions() const { return builtin_options_type() == BuiltinOptions_CastOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const DequantizeOptions *builtin_options_as_DequantizeOptions() const { return builtin_options_type() == BuiltinOptions_DequantizeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const { return builtin_options_type() == BuiltinOptions_MaximumMinimumOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const { return builtin_options_type() == BuiltinOptions_ArgMaxOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LessOptions *builtin_options_as_LessOptions() const { return builtin_options_type() == BuiltinOptions_LessOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const NegOptions *builtin_options_as_NegOptions() const { return builtin_options_type() == BuiltinOptions_NegOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const PadV2Options *builtin_options_as_PadV2Options() const { return builtin_options_type() == BuiltinOptions_PadV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const GreaterOptions *builtin_options_as_GreaterOptions() const { return builtin_options_type() == BuiltinOptions_GreaterOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const GreaterEqualOptions *builtin_options_as_GreaterEqualOptions() const { return builtin_options_type() == BuiltinOptions_GreaterEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LessEqualOptions *builtin_options_as_LessEqualOptions() const { return builtin_options_type() == BuiltinOptions_LessEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SelectOptions *builtin_options_as_SelectOptions() const { return builtin_options_type() == BuiltinOptions_SelectOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SliceOptions *builtin_options_as_SliceOptions() const { return builtin_options_type() == BuiltinOptions_SliceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const TransposeConvOptions *builtin_options_as_TransposeConvOptions() const { return builtin_options_type() == BuiltinOptions_TransposeConvOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SparseToDenseOptions *builtin_options_as_SparseToDenseOptions() const { return builtin_options_type() == BuiltinOptions_SparseToDenseOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const TileOptions *builtin_options_as_TileOptions() const { return builtin_options_type() == BuiltinOptions_TileOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ExpandDimsOptions *builtin_options_as_ExpandDimsOptions() const { return builtin_options_type() == BuiltinOptions_ExpandDimsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const EqualOptions *builtin_options_as_EqualOptions() const { return builtin_options_type() == BuiltinOptions_EqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const NotEqualOptions *builtin_options_as_NotEqualOptions() const { return builtin_options_type() == BuiltinOptions_NotEqualOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ShapeOptions *builtin_options_as_ShapeOptions() const { return builtin_options_type() == BuiltinOptions_ShapeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const PowOptions *builtin_options_as_PowOptions() const { return builtin_options_type() == BuiltinOptions_PowOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ArgMinOptions *builtin_options_as_ArgMinOptions() const { return builtin_options_type() == BuiltinOptions_ArgMinOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const FakeQuantOptions *builtin_options_as_FakeQuantOptions() const { return builtin_options_type() == BuiltinOptions_FakeQuantOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const PackOptions *builtin_options_as_PackOptions() const { return builtin_options_type() == BuiltinOptions_PackOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LogicalOrOptions *builtin_options_as_LogicalOrOptions() const { return builtin_options_type() == BuiltinOptions_LogicalOrOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const OneHotOptions *builtin_options_as_OneHotOptions() const { return builtin_options_type() == BuiltinOptions_OneHotOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LogicalAndOptions *builtin_options_as_LogicalAndOptions() const { return builtin_options_type() == BuiltinOptions_LogicalAndOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LogicalNotOptions *builtin_options_as_LogicalNotOptions() const { return builtin_options_type() == BuiltinOptions_LogicalNotOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const UnpackOptions *builtin_options_as_UnpackOptions() const { return builtin_options_type() == BuiltinOptions_UnpackOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const FloorDivOptions *builtin_options_as_FloorDivOptions() const { return builtin_options_type() == BuiltinOptions_FloorDivOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const SquareOptions *builtin_options_as_SquareOptions() const { return builtin_options_type() == BuiltinOptions_SquareOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ZerosLikeOptions *builtin_options_as_ZerosLikeOptions() const { return builtin_options_type() == BuiltinOptions_ZerosLikeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const FillOptions *builtin_options_as_FillOptions() const { return builtin_options_type() == BuiltinOptions_FillOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const BidirectionalSequenceLSTMOptions * builtin_options_as_BidirectionalSequenceLSTMOptions() const { return builtin_options_type() == BuiltinOptions_BidirectionalSequenceLSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const BidirectionalSequenceRNNOptions *builtin_options_as_BidirectionalSequenceRNNOptions() const { return builtin_options_type() == BuiltinOptions_BidirectionalSequenceRNNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const UnidirectionalSequenceLSTMOptions * builtin_options_as_UnidirectionalSequenceLSTMOptions() const { return builtin_options_type() == BuiltinOptions_UnidirectionalSequenceLSTMOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const FloorModOptions *builtin_options_as_FloorModOptions() const { return builtin_options_type() == BuiltinOptions_FloorModOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const RangeOptions *builtin_options_as_RangeOptions() const { return builtin_options_type() == BuiltinOptions_RangeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ResizeNearestNeighborOptions *builtin_options_as_ResizeNearestNeighborOptions() const { return builtin_options_type() == BuiltinOptions_ResizeNearestNeighborOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const LeakyReluOptions *builtin_options_as_LeakyReluOptions() const { return builtin_options_type() == BuiltinOptions_LeakyReluOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const { return builtin_options_type() == BuiltinOptions_SquaredDifferenceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const MirrorPadOptions *builtin_options_as_MirrorPadOptions() const { return builtin_options_type() == BuiltinOptions_MirrorPadOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const AbsOptions *builtin_options_as_AbsOptions() const { return builtin_options_type() == BuiltinOptions_AbsOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SplitVOptions *builtin_options_as_SplitVOptions() const { return builtin_options_type() == BuiltinOptions_SplitVOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const UniqueOptions *builtin_options_as_UniqueOptions() const { return builtin_options_type() == BuiltinOptions_UniqueOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ReverseV2Options *builtin_options_as_ReverseV2Options() const { return builtin_options_type() == BuiltinOptions_ReverseV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const AddNOptions *builtin_options_as_AddNOptions() const { return builtin_options_type() == BuiltinOptions_AddNOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const GatherNdOptions *builtin_options_as_GatherNdOptions() const { return builtin_options_type() == BuiltinOptions_GatherNdOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const CosOptions *builtin_options_as_CosOptions() const { return builtin_options_type() == BuiltinOptions_CosOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const WhereOptions *builtin_options_as_WhereOptions() const { return builtin_options_type() == BuiltinOptions_WhereOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const RankOptions *builtin_options_as_RankOptions() const { return builtin_options_type() == BuiltinOptions_RankOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ReverseSequenceOptions *builtin_options_as_ReverseSequenceOptions() const { return builtin_options_type() == BuiltinOptions_ReverseSequenceOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const MatrixDiagOptions *builtin_options_as_MatrixDiagOptions() const { return builtin_options_type() == BuiltinOptions_MatrixDiagOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const QuantizeOptions *builtin_options_as_QuantizeOptions() const { return builtin_options_type() == BuiltinOptions_QuantizeOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const MatrixSetDiagOptions *builtin_options_as_MatrixSetDiagOptions() const { return builtin_options_type() == BuiltinOptions_MatrixSetDiagOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const HardSwishOptions *builtin_options_as_HardSwishOptions() const { return builtin_options_type() == BuiltinOptions_HardSwishOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const IfOptions *builtin_options_as_IfOptions() const { return builtin_options_type() == BuiltinOptions_IfOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const WhileOptions *builtin_options_as_WhileOptions() const { return builtin_options_type() == BuiltinOptions_WhileOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const DepthToSpaceOptions *builtin_options_as_DepthToSpaceOptions() const { return builtin_options_type() == BuiltinOptions_DepthToSpaceOptions - ? static_cast(builtin_options()) - : nullptr; + ? 
static_cast(builtin_options()) + : nullptr; } const NonMaxSuppressionV4Options *builtin_options_as_NonMaxSuppressionV4Options() const { return builtin_options_type() == BuiltinOptions_NonMaxSuppressionV4Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const NonMaxSuppressionV5Options *builtin_options_as_NonMaxSuppressionV5Options() const { return builtin_options_type() == BuiltinOptions_NonMaxSuppressionV5Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const ScatterNdOptions *builtin_options_as_ScatterNdOptions() const { return builtin_options_type() == BuiltinOptions_ScatterNdOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SelectV2Options *builtin_options_as_SelectV2Options() const { return builtin_options_type() == BuiltinOptions_SelectV2Options - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const DensifyOptions *builtin_options_as_DensifyOptions() const { return builtin_options_type() == BuiltinOptions_DensifyOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const SegmentSumOptions *builtin_options_as_SegmentSumOptions() const { return builtin_options_type() == BuiltinOptions_SegmentSumOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const BatchMatMulOptions *builtin_options_as_BatchMatMulOptions() const { return builtin_options_type() == BuiltinOptions_BatchMatMulOptions - ? static_cast(builtin_options()) - : nullptr; + ? static_cast(builtin_options()) + : nullptr; } const flatbuffers::Vector *custom_options() const { @@ -8457,7 +8455,7 @@ struct OperatorBuilder static_cast(custom_options_format), 0); } void add_mutating_variable_inputs( - flatbuffers::Offset> mutating_variable_inputs) + flatbuffers::Offset> mutating_variable_inputs) { fbb_.AddOffset(Operator::VT_MUTATING_VARIABLE_INPUTS, mutating_variable_inputs); } @@ -8514,11 +8512,11 @@ CreateOperatorDirect(flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index const std::vector *intermediates = nullptr) { return onert_tflite::CreateOperator( - _fbb, opcode_index, inputs ? _fbb.CreateVector(*inputs) : 0, - outputs ? _fbb.CreateVector(*outputs) : 0, builtin_options_type, builtin_options, - custom_options ? _fbb.CreateVector(*custom_options) : 0, custom_options_format, - mutating_variable_inputs ? _fbb.CreateVector(*mutating_variable_inputs) : 0, - intermediates ? _fbb.CreateVector(*intermediates) : 0); + _fbb, opcode_index, inputs ? _fbb.CreateVector(*inputs) : 0, + outputs ? _fbb.CreateVector(*outputs) : 0, builtin_options_type, builtin_options, + custom_options ? _fbb.CreateVector(*custom_options) : 0, custom_options_format, + mutating_variable_inputs ? _fbb.CreateVector(*mutating_variable_inputs) : 0, + intermediates ? 
_fbb.CreateVector(*intermediates) : 0); } struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -8602,12 +8600,12 @@ struct SubGraphBuilder }; inline flatbuffers::Offset CreateSubGraph( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> tensors = 0, - flatbuffers::Offset> inputs = 0, - flatbuffers::Offset> outputs = 0, - flatbuffers::Offset>> operators = 0, - flatbuffers::Offset name = 0) + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> tensors = 0, + flatbuffers::Offset> inputs = 0, + flatbuffers::Offset> outputs = 0, + flatbuffers::Offset>> operators = 0, + flatbuffers::Offset name = 0) { SubGraphBuilder builder_(_fbb); builder_.add_name(name); @@ -8618,20 +8616,18 @@ inline flatbuffers::Offset CreateSubGraph( return builder_.Finish(); } -inline flatbuffers::Offset -CreateSubGraphDirect(flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *tensors = nullptr, - const std::vector *inputs = nullptr, - const std::vector *outputs = nullptr, - const std::vector> *operators = nullptr, - const char *name = nullptr) +inline flatbuffers::Offset CreateSubGraphDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *tensors = nullptr, + const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, + const std::vector> *operators = nullptr, const char *name = nullptr) { return onert_tflite::CreateSubGraph( - _fbb, tensors ? _fbb.CreateVector>(*tensors) : 0, - inputs ? _fbb.CreateVector(*inputs) : 0, - outputs ? _fbb.CreateVector(*outputs) : 0, - operators ? _fbb.CreateVector>(*operators) : 0, - name ? _fbb.CreateString(name) : 0); + _fbb, tensors ? _fbb.CreateVector>(*tensors) : 0, + inputs ? _fbb.CreateVector(*inputs) : 0, + outputs ? _fbb.CreateVector(*outputs) : 0, + operators ? _fbb.CreateVector>(*operators) : 0, + name ? _fbb.CreateString(name) : 0); } struct Buffer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table @@ -8762,7 +8758,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table const flatbuffers::Vector> *operator_codes() const { return GetPointer> *>( - VT_OPERATOR_CODES); + VT_OPERATOR_CODES); } const flatbuffers::Vector> *subgraphs() const { @@ -8805,7 +8801,7 @@ struct ModelBuilder flatbuffers::uoffset_t start_; void add_version(uint32_t version) { fbb_.AddElement(Model::VT_VERSION, version, 0); } void add_operator_codes( - flatbuffers::Offset>> operator_codes) + flatbuffers::Offset>> operator_codes) { fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes); } @@ -8845,13 +8841,13 @@ struct ModelBuilder }; inline flatbuffers::Offset CreateModel( - flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, - flatbuffers::Offset>> operator_codes = 0, - flatbuffers::Offset>> subgraphs = 0, - flatbuffers::Offset description = 0, - flatbuffers::Offset>> buffers = 0, - flatbuffers::Offset> metadata_buffer = 0, - flatbuffers::Offset>> metadata = 0) + flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, + flatbuffers::Offset>> operator_codes = 0, + flatbuffers::Offset>> subgraphs = 0, + flatbuffers::Offset description = 0, + flatbuffers::Offset>> buffers = 0, + flatbuffers::Offset> metadata_buffer = 0, + flatbuffers::Offset>> metadata = 0) { ModelBuilder builder_(_fbb); builder_.add_metadata(metadata); @@ -8874,13 +8870,13 @@ CreateModelDirect(flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0, const std::vector> *metadata = nullptr) { return onert_tflite::CreateModel( - _fbb, version, - operator_codes ? _fbb.CreateVector>(*operator_codes) : 0, - subgraphs ? 
_fbb.CreateVector>(*subgraphs) : 0, - description ? _fbb.CreateString(description) : 0, - buffers ? _fbb.CreateVector>(*buffers) : 0, - metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0, - metadata ? _fbb.CreateVector>(*metadata) : 0); + _fbb, version, + operator_codes ? _fbb.CreateVector>(*operator_codes) : 0, + subgraphs ? _fbb.CreateVector>(*subgraphs) : 0, + description ? _fbb.CreateString(description) : 0, + buffers ? _fbb.CreateVector>(*buffers) : 0, + metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0, + metadata ? _fbb.CreateVector>(*metadata) : 0); } inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, diff --git a/runtime/onert/sample/.clang-format b/runtime/onert/sample/.clang-format new file mode 120000 index 0000000..83185fe --- /dev/null +++ b/runtime/onert/sample/.clang-format @@ -0,0 +1 @@ +../../../.clang-format.8 \ No newline at end of file diff --git a/runtime/onert/test/.clang-format b/runtime/onert/test/.clang-format new file mode 120000 index 0000000..83185fe --- /dev/null +++ b/runtime/onert/test/.clang-format @@ -0,0 +1 @@ +../../../.clang-format.8 \ No newline at end of file diff --git a/runtime/onert/test/core/compiler/Scheduler.cc b/runtime/onert/test/core/compiler/HEScheduler.cc similarity index 95% rename from runtime/onert/test/core/compiler/Scheduler.cc rename to runtime/onert/test/core/compiler/HEScheduler.cc index 50f3964..c77ebb8 100644 --- a/runtime/onert/test/core/compiler/Scheduler.cc +++ b/runtime/onert/test/core/compiler/HEScheduler.cc @@ -55,8 +55,7 @@ struct MockBackendCPU : public Backend std::unique_ptr newContext(const Graph &, const std::shared_ptr &, bool) const override { - return std::unique_ptr( - new BackendContext{this, nullptr, nullptr, nullptr, nullptr}); + return std::unique_ptr(new BackendContext{this, nullptr}); } }; @@ -79,8 +78,7 @@ struct MockBackendGPU : public Backend std::unique_ptr newContext(const Graph &, const std::shared_ptr &, bool) const override { - return std::unique_ptr( - new BackendContext{this, nullptr, nullptr, nullptr, nullptr}); + return std::unique_ptr(new BackendContext{this, nullptr}); } }; @@ -103,8 +101,7 @@ struct MockBackendNPU : public Backend std::unique_ptr newContext(const Graph &, const std::shared_ptr &, bool) const override { - return std::unique_ptr( - new BackendContext{this, nullptr, nullptr, nullptr, nullptr}); + return std::unique_ptr(new BackendContext{this, nullptr}); } }; @@ -165,7 +162,7 @@ void setOperationsExecutionTime(const std::vector &backends, for (auto &backend : backends) setOperationExecTime(et, backend, op_names[i], false, op_sizes[i], exec_time); } - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } // Set permute time from one backend to another. 
This method is needed since ExecutionTime has only @@ -195,7 +192,7 @@ void setPermutationsExecutionTime(const std::vector &backends, setPermutationTime(et, backend, other_backend, false, operand_size, exec_time); } } - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } // @@ -304,7 +301,7 @@ std::shared_ptr createBranchedGraph() // // SetUp/TearDown methods runs before/after each test and performs actions common for each test -class SchedulerTest : public ::testing::Test +class HESchedulerTest : public ::testing::Test { protected: void SetUp() override @@ -359,8 +356,8 @@ protected: std::string _original_profiling_mode; }; -class SchedulerTestWithExecutorParam : public SchedulerTest, - public testing::WithParamInterface +class HESchedulerTestWithExecutorParam : public HESchedulerTest, + public testing::WithParamInterface { }; @@ -369,7 +366,7 @@ class SchedulerTestWithExecutorParam : public SchedulerTest, // // Test scheduler behavior for straight graph with known execution time of all nodes and permutes. -TEST_P(SchedulerTestWithExecutorParam, straight_graph_known_exec_time) +TEST_P(HESchedulerTestWithExecutorParam, straight_graph_known_exec_time) { setExecutor(GetParam()); @@ -392,7 +389,7 @@ TEST_P(SchedulerTestWithExecutorParam, straight_graph_known_exec_time) setOperationExecTime(et, _cpu_backend, "Add", false, OPERATION_SIZE, 1); setOperationExecTime(et, _gpu_backend, "Sub", false, OPERATION_SIZE, 1); setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, 1); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); // Test scheduler auto backend_contexts = buildBackendContexts(*graph); @@ -422,7 +419,7 @@ TEST_P(SchedulerTestWithExecutorParam, straight_graph_known_exec_time) } // Test scheduler behavior for branched graph with known execution time of all nodes and permutes -TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) +TEST_P(HESchedulerTestWithExecutorParam, branched_graph_known_exec_time) { const int64_t NPU_ET = 5000; setExecutor(GetParam()); @@ -432,7 +429,7 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) auto graph(createBranchedGraph()); subgs.push(ir::SubgraphIndex{0}, graph); OperationIndex add_op_idx(0), mul1_op_idx(1), mul2_op_idx(2), fc1_op_idx(3), fc2_op_idx(4), - sub_op_idx(5); + sub_op_idx(5); // Set default execution and transfer time setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1000); @@ -451,7 +448,7 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) setOperationExecTime(et, _npu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET); setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET + 1000); setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET + 1000); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); // Test scheduler auto backend_contexts = buildBackendContexts(*graph); @@ -463,7 +460,7 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) if (GetParam() == PARALLEL) { branch1_expected_backend = - br->getBackend(mul1_op_idx)->config()->id() == "npu" ? "npu" : "gpu"; + br->getBackend(mul1_op_idx)->config()->id() == "npu" ? "npu" : "gpu"; branch2_expected_backend = branch1_expected_backend == "npu" ? 
"gpu" : "npu"; } @@ -486,7 +483,7 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) * branching or scheduler assigns another backend to a node*/ setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 3 + 1); setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 3 + 1); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); // Test scheduler auto backend_contexts = buildBackendContexts(*graph); @@ -504,11 +501,11 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time) // SchedulerTestWithExecutorParam tests are parameterized with executor name and runs three times - // one time for each executor -INSTANTIATE_TEST_CASE_P(AllExecutors, SchedulerTestWithExecutorParam, +INSTANTIATE_TEST_CASE_P(AllExecutors, HESchedulerTestWithExecutorParam, testing::Values(LINEAR, DATAFLOW, PARALLEL)); // Test scheduler behavior for branched graph and enabled profiling mode -TEST_F(SchedulerTest, branched_graph_profiling_mode) +TEST_F(HESchedulerTest, branched_graph_profiling_mode) { const int ET = 1e5; @@ -521,7 +518,7 @@ TEST_F(SchedulerTest, branched_graph_profiling_mode) auto graph(createBranchedGraph()); subgs.push(ir::SubgraphIndex{0}, graph); OperationIndex add_op_idx(0), mul1_op_idx(1), mul2_op_idx(2), fc1_op_idx(3), fc2_op_idx(4), - sub_op_idx(5); + sub_op_idx(5); // Test 1 // Expected behaviour: scheduler assigns backends to nodes with unknown execution time @@ -537,7 +534,7 @@ TEST_F(SchedulerTest, branched_graph_profiling_mode) setOperationExecTime(et, _gpu_backend, "Add", false, OPERATION_SIZE, ET); setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, ET + 1); setOperationExecTime(et, _gpu_backend, "Sub", false, OPERATION_SIZE, ET); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); // Test scheduler auto backend_contexts = buildBackendContexts(*graph); @@ -560,7 +557,7 @@ TEST_F(SchedulerTest, branched_graph_profiling_mode) setOperationExecTime(et, _cpu_backend, "Sub", false, OPERATION_SIZE, ET); setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, ET + 1); setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, ET); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); // Test scheduler auto backend_contexts = buildBackendContexts(*graph); diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc index 806b47e..0e742e1 100644 --- a/runtime/onert/test/core/exec/ExecInstance.cc +++ b/runtime/onert/test/core/exec/ExecInstance.cc @@ -21,6 +21,7 @@ #include "compiler/Compiler.h" #include "exec/Execution.h" #include "ir/operation/BinaryArithmetic.h" +#include "util/TracingCtx.h" namespace { @@ -51,8 +52,8 @@ public: auto operand_rhs2 = graph->addOperand(shape, type); auto operand_result2 = graph->addOperand(shape, type); graph->operands() - .at(operand_rhs2) - .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); + .at(operand_rhs2) + .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); // 2nd add operations (result2 <= result1 + rhs2) operation::BinaryArithmetic::Param param1; param1.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; @@ -60,14 +61,14 @@ public: auto input_set1 = OperandIndexSequence{operand_lhs, operand_rhs1}; auto output_set1 = OperandIndexSequence{operand_result1}; graph->addOperation( - std::make_unique(input_set1, output_set1, param1)); + std::make_unique(input_set1, output_set1, param1)); 
operation::BinaryArithmetic::Param param2; param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; param2.activation = Activation::NONE; auto input_set2 = OperandIndexSequence{operand_result1, operand_rhs2}; auto output_set2 = OperandIndexSequence{operand_result2}; graph->addOperation( - std::make_unique(input_set2, output_set2, param2)); + std::make_unique(input_set2, output_set2, param2)); // Identify model inputs and outputs graph->addInput(operand_lhs); graph->addInput(operand_rhs1); @@ -77,13 +78,15 @@ public: // Compile auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - onert::compiler::Compiler compiler{subgs}; + tracing_ctx = std::make_unique(subgs.get()); + onert::compiler::Compiler compiler{subgs, tracing_ctx.get()}; executors = compiler.compile(); } public: std::shared_ptr graph; std::shared_ptr executors; + std::unique_ptr tracing_ctx; }; TEST(ExecInstance, simple) @@ -137,7 +140,8 @@ TEST(ExecInstance, twoCompile) // Make new executor: compile again auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - onert::compiler::Compiler compiler{subgs}; + auto tracing_ctx = std::make_unique(subgs.get()); + onert::compiler::Compiler compiler{subgs, tracing_ctx.get()}; std::shared_ptr executors2 = compiler.compile(); onert::exec::Execution execution2{executors2}; @@ -205,7 +209,7 @@ class Inference public: Inference(const float (&input1)[4], const float (&input2)[4], float (&output)[4], std::shared_ptr &executors) - : _input1{input1}, _input2{input2}, _output{output}, _executors{executors} + : _input1{input1}, _input2{input2}, _output{output}, _executors{executors} { // DO NOTHING } diff --git a/runtime/onert/test/core/exec/ExecTime.test.cc b/runtime/onert/test/core/exec/ExecTime.test.cc index 8c2e34d..6b0c35a 100644 --- a/runtime/onert/test/core/exec/ExecTime.test.cc +++ b/runtime/onert/test/core/exec/ExecTime.test.cc @@ -62,7 +62,7 @@ TEST(ExecTime, roundtrip_ok) et.updateOperationExecTime(b, "op1", true, 100, 100); et.updateOperationExecTime(b, "op1", true, 200, 200); et.updateOperationExecTime(b, "op1", false, 100, 888); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } { ExecTime et(bs); @@ -73,7 +73,7 @@ TEST(ExecTime, roundtrip_ok) ASSERT_EQ(time, 150); time = et.getOperationExecTime(b, "op1", false, 100); ASSERT_EQ(time, 888); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } // clean up EXPECT_EQ(remove("exec_time.json"), 0); @@ -88,7 +88,7 @@ TEST(ExecTime, structure) ExecTime et(bs); et.updateOperationExecTime(b, "op1", true, 100, 100); et.updateOperationExecTime(b, "op1", true, 200, 200); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } { ExecTime et(bs); @@ -97,7 +97,7 @@ TEST(ExecTime, structure) // Check interpolation time = et.getOperationExecTime(b, "op1", true, 200); ASSERT_EQ(time, 200); - et.uploadOperationsExecTime(); + et.storeOperationsExecTime(); } // clean up EXPECT_EQ(remove("exec_time.json"), 0); diff --git a/runtime/onert/test/core/interp/ExecManager.cc b/runtime/onert/test/core/interp/ExecManager.cc index 0c7b1b7..327c38f 100644 --- a/runtime/onert/test/core/interp/ExecManager.cc +++ b/runtime/onert/test/core/interp/ExecManager.cc @@ -63,7 +63,7 @@ protected: auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; auto output_set = OperandIndexSequence{operand_result}; _graph->addOperation( - std::make_unique(input_set, output_set, param)); + std::make_unique(input_set, output_set, param)); // Identify model inputs and 
outputs @@ -79,7 +79,7 @@ protected: _executors = std::make_shared(); _executors->insert( - std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); + std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); } void CreateTwoStepModel() @@ -109,8 +109,8 @@ protected: auto operand_rhs2 = _graph->addOperand(shape, type); auto operand_result2 = _graph->addOperand(shape, type); _graph->operands() - .at(operand_rhs2) - .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); + .at(operand_rhs2) + .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); // 2nd add operations (result2 <= result1 + rhs2) @@ -120,7 +120,7 @@ protected: auto input_set1 = OperandIndexSequence{operand_lhs, operand_rhs1}; auto output_set1 = OperandIndexSequence{operand_result1}; _graph->addOperation( - std::make_unique(input_set1, output_set1, param1)); + std::make_unique(input_set1, output_set1, param1)); operation::BinaryArithmetic::Param param2; param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; @@ -128,7 +128,7 @@ protected: auto input_set2 = OperandIndexSequence{operand_result1, operand_rhs2}; auto output_set2 = OperandIndexSequence{operand_result2}; _graph->addOperation( - std::make_unique(input_set2, output_set2, param2)); + std::make_unique(input_set2, output_set2, param2)); // Identify model inputs and outputs @@ -144,7 +144,7 @@ protected: _executors = std::make_shared(); _executors->insert( - std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); + std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); } void CreateUnspecifiedDimensionsModel() @@ -168,9 +168,8 @@ protected: auto operand_activation = _graph->addOperand(shape_scalar, type_scalar); _graph->operands() - .at(operand_activation) - .data( - std::make_unique(reinterpret_cast(&_activation_value), 4)); + .at(operand_activation) + .data(std::make_unique(reinterpret_cast(&_activation_value), 4)); auto operand_result = _graph->addOperand(shape, type); @@ -182,7 +181,7 @@ protected: auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; auto output_set = OperandIndexSequence{operand_result}; _graph->addOperation( - std::make_unique(input_set, output_set, param)); + std::make_unique(input_set, output_set, param)); // Identify model inputs and outputs @@ -198,7 +197,7 @@ protected: _executors = std::make_shared(); _executors->insert( - std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); + std::make_pair(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph))); } void createExecution() { _execution = std::make_unique(_executors); } diff --git a/runtime/onert/test/graph/MockNode.h b/runtime/onert/test/graph/MockNode.h index 60b4719..0e7ed97 100644 --- a/runtime/onert/test/graph/MockNode.h +++ b/runtime/onert/test/graph/MockNode.h @@ -30,7 +30,7 @@ class SimpleMock : public onert::ir::Operation public: SimpleMock(const onert::ir::OperandIndexSequence &inputs, const onert::ir::OperandIndexSequence &outputs) - : Operation{onert::ir::OperandConstraint::createAny()} + : Operation{onert::ir::OperandConstraint::createAny()} { setInputs(inputs); setOutputs(outputs); diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/graph/operand/UseDef.cc index 206e402..5ef1002 100644 --- a/runtime/onert/test/graph/operand/UseDef.cc +++ b/runtime/onert/test/graph/operand/UseDef.cc @@ -49,16 +49,16 @@ TEST(ir_Operand, neg_usedef) // MockNode1 auto operand_index1 = graph.addOperand(shape, type); auto mocknode_index1 = - 
graph.addOperation(std::make_unique(IndexSet{input_operand}, IndexSet{operand_index1})); + graph.addOperation(std::make_unique(IndexSet{input_operand}, IndexSet{operand_index1})); // MockNode2 auto operand_index2 = graph.addOperand(shape, type); auto mocknode_index2 = - graph.addOperation(std::make_unique(IndexSet{input_operand}, IndexSet{operand_index2})); + graph.addOperation(std::make_unique(IndexSet{input_operand}, IndexSet{operand_index2})); // MockNode3(two input) auto multiinput_index = graph.addOperation( - std::make_unique(IndexSet{operand_index1, operand_index2}, IndexSet{output_operand})); + std::make_unique(IndexSet{operand_index1, operand_index2}, IndexSet{output_operand})); graph.finishBuilding(); diff --git a/runtime/onert/test/util/ShapeInference.cc b/runtime/onert/test/util/ShapeInference.cc index f1cbfd6..2ecaa28 100644 --- a/runtime/onert/test/util/ShapeInference.cc +++ b/runtime/onert/test/util/ShapeInference.cc @@ -48,7 +48,7 @@ TEST(ShapeInference, Pool2DNodeSame) Padding padding{PaddingType::SAME}; operation::Pool2D::Param avg_pool_param{ - operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; auto infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, avg_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -58,7 +58,7 @@ TEST(ShapeInference, Pool2DNodeSame) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20); operation::Pool2D::Param max_pool_param{ - operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, max_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -75,7 +75,7 @@ TEST(ShapeInference, Pool2DNodeValid) Padding padding{PaddingType::VALID}; operation::Pool2D::Param avg_pool_param{ - operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; auto infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, avg_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -85,7 +85,7 @@ TEST(ShapeInference, Pool2DNodeValid) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20); operation::Pool2D::Param max_pool_param{ - operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, max_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -103,7 +103,7 @@ TEST(ShapeInference, Pool2DNodeExplicit) Padding padding{4, 3, 2, 1}; operation::Pool2D::Param avg_pool_param{ - operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; auto infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, avg_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -113,7 +113,7 @@ TEST(ShapeInference, Pool2DNodeExplicit) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20); operation::Pool2D::Param max_pool_param{ - operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::MAX, 3, 6, stride, padding, Activation::NONE}; infered_out_shape = onert::shape_inference::inferPoolShape(in_shape, max_pool_param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -130,7 +130,7 @@ 
TEST(ShapeInference, neg_Pool2DNode_InvalidStride) Padding padding{PaddingType::SAME}; operation::Pool2D::Param avg_pool_param{ - operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; ASSERT_THROW(onert::shape_inference::inferPoolShape(in_shape, avg_pool_param), std::runtime_error); } @@ -161,7 +161,7 @@ TEST(ShapeInference, Conv2D) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 30); param = - operation::Conv2D::Param{Stride{3, 7}, Padding{4, 3, 2, 1}, Activation::NONE, Dilation{1, 1}}; + operation::Conv2D::Param{Stride{3, 7}, Padding{4, 3, 2, 1}, Activation::NONE, Dilation{1, 1}}; infered_out_shape = onert::shape_inference::inferConv2DShape(in_shape, ker_shape, param); ASSERT_EQ(infered_out_shape.rank(), 4); @@ -190,7 +190,7 @@ TEST(ShapeInference, DepthwiseConv2D) operation::DepthwiseConv2D::Param param{Stride{3, 7}, Padding{PaddingType::VALID}, 3, Activation::NONE, Dilation{1, 1}}; auto infered_out_shape = - onert::shape_inference::inferDepthwiseConv2DShape(in_shape, ker_shape, param); + onert::shape_inference::inferDepthwiseConv2DShape(in_shape, ker_shape, param); ASSERT_EQ(infered_out_shape.rank(), 4); ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).N, 10); @@ -364,7 +364,7 @@ TEST(ShapeInference, Transpose) ASSERT_EQ(in_shape.rank(), perm.size()); ASSERT_EQ(expected.rank(), perm.size()); auto inferred_out_shape = - onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()); + onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()); // post-conditions ASSERT_EQ(inferred_out_shape.rank(), perm.size()); for (int32_t dim = 0; dim < expected.rank(); dim++) @@ -479,8 +479,8 @@ TEST(ShapeInference, BCQFullyConnected) { auto check = [&](Shape &in_shape, Shape &cluster_shape, std::vector cluster, Shape &expected) { - auto actual = onert::shape_inference::inferBCQFullyConnectedShape(in_shape, cluster_shape, - cluster.data()); + auto actual = + onert::shape_inference::inferBCQFullyConnectedShape(in_shape, cluster_shape, cluster.data()); ASSERT_EQ(actual.rank(), expected.rank()); for (int32_t dim = 0; dim < expected.rank(); dim++) diff --git a/tests/.clang-format b/tests/.clang-format new file mode 120000 index 0000000..0ff66f3 --- /dev/null +++ b/tests/.clang-format @@ -0,0 +1 @@ +../.clang-format.8 \ No newline at end of file diff --git a/tests/custom_op/FillFrom/FillFrom_runner.cc b/tests/custom_op/FillFrom/FillFrom_runner.cc index 7313086..6b09d5d 100644 --- a/tests/custom_op/FillFrom/FillFrom_runner.cc +++ b/tests/custom_op/FillFrom/FillFrom_runner.cc @@ -87,7 +87,7 @@ std::vector genData(uint64_t size) template static auto findMaxDifference(InIter1 first1, InIter1 last1, InIter2 first2) - -> decltype(*first1 - *first2) + -> decltype(*first1 - *first2) { auto max_difference = std::abs(*first1 - *first2); for (; first1 != last1; ++first1, ++first2) @@ -227,7 +227,7 @@ int main(const int argc, char **argv) const float tolerance = 0.01f; auto max_difference = - findMaxDifference(outputs[0].begin(), outputs[0].end(), std::begin(ref_data)); + findMaxDifference(outputs[0].begin(), outputs[0].end(), std::begin(ref_data)); int exit_code = 0; if (max_difference > tolerance) diff --git a/tests/nnapi/CMakeLists.txt b/tests/nnapi/CMakeLists.txt index b1215d8..67ac90f 100644 --- a/tests/nnapi/CMakeLists.txt +++ b/tests/nnapi/CMakeLists.txt @@ -7,6 +7,16 @@ if (NOT BUILD_ONERT) return() endif(NOT BUILD_ONERT) +# GCC Compiler under 6.2 is not support 
this test build +if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.2) + return() +endif() + +if (ANDROID_BOOST_ROOT) + set(BOOST_ROOT ${ANDROID_BOOST_ROOT}) +endif (ANDROID_BOOST_ROOT) + +nnfw_find_package(Boost REQUIRED) nnfw_find_package(GTest) @@ -46,6 +56,7 @@ endif(GENERATE_RUNTIME_NNAPI_TESTS) set(RUNTIME_NNAPI_TEST_SRC_INC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(${RUNTIME_NNAPI_TEST} PRIVATE ${RUNTIME_NNAPI_TEST_SRC_INC}) +target_include_directories(${RUNTIME_NNAPI_TEST} PRIVATE ${Boost_INCLUDE_DIRS}) # Define NNTEST_ONLY_PUBLIC_API to avoid android dependency target_compile_definitions(${RUNTIME_NNAPI_TEST} PRIVATE NNTEST_ONLY_PUBLIC_API) diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_cl new file mode 100644 index 0000000..4e4d688 --- /dev/null +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_cl @@ -0,0 +1,305 @@ +GeneratedTests.abs_ +GeneratedTests.abs_dynamic_nnfw +GeneratedTests.add_dynamic_nnfw +GeneratedTests.argmax_3_axis_as_input_nnfw +GeneratedTests.argmax_3_axis_as_input_nnfw_quant8 +GeneratedTests.argmax_dynamic_nnfw +GeneratedTests.batch_matmul_ex_dynamic_nnfw +GeneratedTests.batch_matmul_ex_float_adj_x +GeneratedTests.batch_matmul_ex_float_adj_y +GeneratedTests.batch_matmul_ex_float_batch2 +GeneratedTests.batch_matmul_ex_float_broadcast +GeneratedTests.batch_matmul_ex_float_broadcast2_adj_xy +GeneratedTests.batch_matmul_ex_float_broadcast_adj_x +GeneratedTests.batch_matmul_ex_float_simple +GeneratedTests.broadcast_to_ex_1D_nnfw +GeneratedTests.broadcast_to_ex_2D_nnfw +GeneratedTests.broadcast_to_ex_dynamic_2D_nnfw +GeneratedTests.broadcast_to_ex_dynamic_3D_nnfw +GeneratedTests.cast_dynamic_float32_to_int32_nnfw +GeneratedTests.cast_float16_to_float16 +GeneratedTests.cast_float16_to_float32 +GeneratedTests.cast_float16_to_float32_relaxed +GeneratedTests.cast_float16_to_int32 +GeneratedTests.cast_float16_to_quant8 +GeneratedTests.cast_float16_to_quant8_overflow +GeneratedTests.cast_float32_to_float16 +GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_int32_nnfw +GeneratedTests.cast_int32_to_float16 +GeneratedTests.cast_quant8_to_float16 +GeneratedTests.concat_dynamic_nnfw +GeneratedTests.conv_dynamic_nnfw +GeneratedTests.conv_float_channels_weights_as_inputs +GeneratedTests.conv_float_channels_weights_as_inputs_relaxed +GeneratedTests.conv_float_large_weights_as_inputs +GeneratedTests.conv_float_large_weights_as_inputs_relaxed +GeneratedTests.conv_float_weights_as_inputs +GeneratedTests.conv_float_weights_as_inputs_relaxed +GeneratedTests.conv_quant8_channels_weights_as_inputs +GeneratedTests.conv_quant8_large_weights_as_inputs +GeneratedTests.conv_quant8_overflow_weights_as_inputs +GeneratedTests.conv_quant8_weights_as_inputs +GeneratedTests.conv2d_dilation_nnfw +GeneratedTests.conv2d_dilation_nnfw_quant8 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_quant8 +GeneratedTests.conv2d_dilation_nnfw_2 +GeneratedTests.conv2d_dilation_nnfw_quant8_2 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_2 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_quant8_2 +GeneratedTests.cos_ex_1D_float_nnfw +GeneratedTests.cos_ex_4D_float_nnfw +GeneratedTests.cos_ex_dynamic_nnfw +GeneratedTests.dequantize_v1_2_3d_quant8_symm +GeneratedTests.dequantize_v1_2_4d_quant8_symm +GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim 
+GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim +GeneratedTests.dequantize_v1_2 +GeneratedTests.dequantize_v1_2_zero_sized +GeneratedTests.dequantize_v1_2_zero_sized_float16 +GeneratedTests.div_dynamic_nnfw +GeneratedTests.einsum_ex_float_matmul_2x2_2 +GeneratedTests.einsum_ex_float_matmul_3x2_3 +GeneratedTests.einsum_ex_float_matmul_3x3_4 +GeneratedTests.einsum_ex_float_matmul_4x4_4 +GeneratedTests.einsum_ex_float_matmul_4x4_4_2 +GeneratedTests.equal_dynamic_float_nnfw +GeneratedTests.exp_ +GeneratedTests.exp_dynamic_nnfw +GeneratedTests.expand_dims_dynamic_nnfw_1 +GeneratedTests.expand_dims_dynamic_nnfw_2 +GeneratedTests.fill_ex_1D_float +GeneratedTests.fill_ex_4D_float +GeneratedTests.fill_ex_dynamic_nnfw +GeneratedTests.fully_connected_dynamic_nnfw +GeneratedTests.fully_connected_float_2_weights_as_inputs +GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw +GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 +GeneratedTests.gather_dynamic_nnfw +GeneratedTests.gather_float16 +GeneratedTests.gather_float16_2 +GeneratedTests.gather_float16_3 +GeneratedTests.gather_float16_4 +GeneratedTests.gather_float16_5 +GeneratedTests.gather_float16_6 +GeneratedTests.gather_float16_7 +GeneratedTests.gather_float16_8 +GeneratedTests.greater_dynamic_float_nnfw +GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw +GeneratedTests.less_dynamic_float_nnfw +GeneratedTests.less_equal_dynamic_float_nnfw +GeneratedTests.log_4D_float_nnfw +GeneratedTests.log_dynamic_nnfw +GeneratedTests.log_softmax_nnfw +GeneratedTests.log_softmax_nnfw_2 +GeneratedTests.log_softmax_nnfw_3 +GeneratedTests.log_softmax_nnfw_4 +GeneratedTests.log_softmax_nnfw_5 +GeneratedTests.log_softmax_nnfw_quant8 +GeneratedTests.logical_not +GeneratedTests.logical_not_1D_nnfw +GeneratedTests.logical_not_4D_nnfw +GeneratedTests.logical_not_dynamic_nnfw +GeneratedTests.logical_or_broadcast +GeneratedTests.logical_or_dynamic_nnfw +GeneratedTests.logistic_dynamic_nnfw +GeneratedTests.lsh_projection +GeneratedTests.lsh_projection_2 +GeneratedTests.lsh_projection_weights_as_inputs +GeneratedTests.lstm +GeneratedTests.lstm2 +GeneratedTests.lstm2_state +GeneratedTests.lstm2_state2 +GeneratedTests.lstm3 +GeneratedTests.lstm3_state +GeneratedTests.lstm3_state2 +GeneratedTests.lstm3_state3 +GeneratedTests.lstm_state +GeneratedTests.lstm_state2 +GeneratedTests.matrix_band_part_ex_4D_float +GeneratedTests.matrix_band_part_ex_dynamic_nnfw +GeneratedTests.maximum_dynamic_nnfw +GeneratedTests.minimum_dynamic_nnfw +GeneratedTests.minimum_int32 +GeneratedTests.mul_dynamic_nnfw +GeneratedTests.neg +GeneratedTests.neg_dynamic_nnfw +GeneratedTests.not_equal_dynamic_float_nnfw +GeneratedTests.one_hot_ex_dynamic_nnfw +GeneratedTests.pack_ex_dynamic_nnfw +GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 +GeneratedTests.pow_2D_float_nnfw +GeneratedTests.pow_broadcast_float_nnfw +GeneratedTests.pow_broadcast_float_nnfw_2 +GeneratedTests.pow_broadcast_float_nnfw_3 +GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized +GeneratedTests.range_ex_float_1 
+GeneratedTests.range_ex_float_1_all_constant_inputs +GeneratedTests.range_ex_float_1_dynamic_nnfw +GeneratedTests.range_ex_float_2 +GeneratedTests.range_ex_float_2_dynamic_nnfw +GeneratedTests.reduce_all +GeneratedTests.reduce_all_2 +GeneratedTests.reduce_all_2D_nnfw +GeneratedTests.reduce_all_3 +GeneratedTests.reduce_all_4D_nnfw +GeneratedTests.reduce_all_dynamic_nnfw +GeneratedTests.reduce_any +GeneratedTests.reduce_any_2 +GeneratedTests.reduce_any_2D_nnfw +GeneratedTests.reduce_any_3 +GeneratedTests.reduce_any_4D_nnfw +GeneratedTests.reduce_mean_dynamic_1_nnfw +GeneratedTests.reduce_mean_dynamic_2_nnfw +GeneratedTests.reduce_min_dynamic_nnfw +GeneratedTests.reduce_prod +GeneratedTests.reduce_prod_2 +GeneratedTests.reduce_prod_2D_float_nnfw +GeneratedTests.reduce_prod_3 +GeneratedTests.reduce_prod_4 +GeneratedTests.reduce_prod_4D_float_nnfw +GeneratedTests.reduce_prod_4D_float_reducing_C_nnfw +GeneratedTests.reduce_prod_4D_float_reducing_HW_nnfw +GeneratedTests.reduce_prod_dynamic_1_nnfw +GeneratedTests.reduce_prod_dynamic_2_nnfw +GeneratedTests.reduce_sum_dynamic_1_nnfw +GeneratedTests.reduce_sum_dynamic_2_nnfw +GeneratedTests.reshape_dynamic_nnfw +GeneratedTests.resize_nearest_neighbor_shape_nchw_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_8 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2 +GeneratedTests.reverse_ex_1d +GeneratedTests.reverse_ex_3d +GeneratedTests.reverse_ex_dynamic_1D +GeneratedTests.reverse_ex_dynamic_3D +GeneratedTests.rnn +GeneratedTests.rnn_state +GeneratedTests.round_ex_1D_float +GeneratedTests.round_ex_4D_float +GeneratedTests.round_ex_dynamic_nnfw +GeneratedTests.rsqrt +GeneratedTests.rsqrt_dynamic_nnfw +GeneratedTests.select_v1_2_five_dim +GeneratedTests.select_v1_2_five_dim_quant8 +GeneratedTests.select_v1_2_one_dim +GeneratedTests.select_v1_2_one_dim_quant8 +GeneratedTests.select_v1_2_two_dim +GeneratedTests.select_v1_2_two_dim_quant8 +GeneratedTests.select_v2_ex_broadcast_1d_single_value +GeneratedTests.select_v2_ex_broadcast_2d_one +GeneratedTests.select_v2_ex_broadcast_2d_two +GeneratedTests.select_v2_ex_broadcast_2d_two_dynamic_nnfw +GeneratedTests.select_v2_ex_broadcast_less_4d +GeneratedTests.select_v2_ex_float +GeneratedTests.shape_ex_dynamic_nnfw +GeneratedTests.sin_1D_float_nnfw +GeneratedTests.sin_4D_float_nnfw 
+GeneratedTests.sin_dynamic_nnfw +GeneratedTests.slice +GeneratedTests.slice_2 +GeneratedTests.slice_3 +GeneratedTests.slice_4 +GeneratedTests.slice_5 +GeneratedTests.slice_6 +GeneratedTests.slice_7 +GeneratedTests.slice_8 +GeneratedTests.slice_dynamic_nnfw +GeneratedTests.slice_zero_sized +GeneratedTests.slice_zero_sized_quant8 +GeneratedTests.softmax_dynamic_nnfw +GeneratedTests.space_to_batch_dynamic_float_nnfw +GeneratedTests.split_dynamic_float_nnfw +GeneratedTests.split_float_5_axis_as_input_nnfw +GeneratedTests.split_v_ex_1D_float_1_nnfw +GeneratedTests.split_v_ex_1D_float_2_nnfw +GeneratedTests.split_v_ex_1D_int32_nnfw +GeneratedTests.split_v_ex_4D_float_1_nnfw +GeneratedTests.split_v_ex_4D_float_2_nnfw +GeneratedTests.split_v_ex_4D_float_3_nnfw +GeneratedTests.split_v_ex_4D_float_4_nnfw +GeneratedTests.split_v_ex_4D_int32_1_nnfw +GeneratedTests.split_v_ex_4D_int32_2_nnfw +GeneratedTests.split_v_ex_4D_int32_3_nnfw +GeneratedTests.split_v_ex_4D_int32_4_nnfw +GeneratedTests.sqrt_ +GeneratedTests.squared_difference_ex_dynamic_nnfw +GeneratedTests.squeeze_dynamic_float_nnfw +GeneratedTests.stateless_random_uniform_ex_nnfw +GeneratedTests.strided_slice_dynamic_nnfw +GeneratedTests.sub_dynamic_nnfw +GeneratedTests.sub_v1_2_zero_sized +GeneratedTests.sub_v1_2_zero_sized_quant8 +GeneratedTests.svdf +GeneratedTests.svdf2 +GeneratedTests.svdf_bias_present +GeneratedTests.svdf_state +GeneratedTests.tanh_v1_2 +GeneratedTests.tanh_v1_2_zero_sized +GeneratedTests.tanh_v1_2_zero_sized_quant8 +GeneratedTests.tanh_v1_dynamic_nnfw +GeneratedTests.tile_1 +GeneratedTests.tile_1_dynamic_float32_nnfw +GeneratedTests.tile_1_float16 +GeneratedTests.tile_1_quant8 +GeneratedTests.tile_2 +GeneratedTests.tile_2_dynamic_float32_nnfw +GeneratedTests.tile_2_float16 +GeneratedTests.tile_2_int32 +GeneratedTests.tile_2_quant8 +GeneratedTests.tile_3 +GeneratedTests.tile_3_dynamic_float32_nnfw +GeneratedTests.tile_3_float16 +GeneratedTests.tile_3_int32 +GeneratedTests.tile_3_quant8 +GeneratedTests.transpose_dynamic_nnfw +GeneratedTests.transpose_float_1_perms_as_input_nnfw +GeneratedTests.transpose_v1_2_zero_sized +GeneratedTests.transpose_v1_2_zero_sized_quant8 +GeneratedTests.unidirectional_sequence_lstm_1step +GeneratedTests.unidirectional_sequence_lstm_batch_major_norm_peephole_projection +GeneratedTests.unidirectional_sequence_lstm_batch_major_peephole_projection_bias +GeneratedTests.unidirectional_sequence_lstm_dynamic_nnfw +GeneratedTests.unidirectional_sequence_lstm_layer_norm_cifg_peephole +GeneratedTests.unidirectional_sequence_lstm_norm_peephole_projection +GeneratedTests.unpack_ex_dynamic_nnfw +GeneratedTests.zeros_like_ex_2D_float +GeneratedTests.zeros_like_ex_4D_int32 +GeneratedTests.zeros_like_ex_dynamic_float32 diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon new file mode 100644 index 0000000..d443eba --- /dev/null +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon @@ -0,0 +1,376 @@ +GeneratedTests.abs_ +GeneratedTests.abs_dynamic_nnfw +GeneratedTests.add_dynamic_nnfw +GeneratedTests.argmax_3_axis_as_input_nnfw +GeneratedTests.argmax_3_axis_as_input_nnfw_quant8 +GeneratedTests.argmax_dynamic_nnfw +GeneratedTests.batch_matmul_ex_dynamic_nnfw +GeneratedTests.batch_matmul_ex_float_adj_x +GeneratedTests.batch_matmul_ex_float_adj_y +GeneratedTests.batch_matmul_ex_float_batch2 +GeneratedTests.batch_matmul_ex_float_broadcast +GeneratedTests.batch_matmul_ex_float_broadcast2_adj_xy 
+GeneratedTests.batch_matmul_ex_float_broadcast_adj_x +GeneratedTests.batch_matmul_ex_float_simple +GeneratedTests.broadcast_to_ex_1D_nnfw +GeneratedTests.broadcast_to_ex_2D_nnfw +GeneratedTests.broadcast_to_ex_dynamic_2D_nnfw +GeneratedTests.broadcast_to_ex_dynamic_3D_nnfw +GeneratedTests.cast_dynamic_float32_to_int32_nnfw +GeneratedTests.cast_float16_to_float16 +GeneratedTests.cast_float16_to_float32 +GeneratedTests.cast_float16_to_float32_relaxed +GeneratedTests.cast_float16_to_int32 +GeneratedTests.cast_float16_to_quant8 +GeneratedTests.cast_float16_to_quant8_overflow +GeneratedTests.cast_float32_to_float16 +GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_int32_to_float16 +GeneratedTests.cast_quant8_to_float16 +GeneratedTests.concat_dynamic_nnfw +GeneratedTests.conv_dynamic_nnfw +GeneratedTests.conv_float_channels_weights_as_inputs +GeneratedTests.conv_float_channels_weights_as_inputs_relaxed +GeneratedTests.conv_float_large_weights_as_inputs +GeneratedTests.conv_float_large_weights_as_inputs_relaxed +GeneratedTests.conv_float_weights_as_inputs +GeneratedTests.conv_float_weights_as_inputs_relaxed +GeneratedTests.conv_quant8_channels_weights_as_inputs +GeneratedTests.conv_quant8_large_weights_as_inputs +GeneratedTests.conv_quant8_overflow_weights_as_inputs +GeneratedTests.conv_quant8_weights_as_inputs +GeneratedTests.conv2d_dilation_nnfw +GeneratedTests.conv2d_dilation_nnfw_quant8 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_quant8 +GeneratedTests.conv2d_dilation_nnfw_2 +GeneratedTests.conv2d_dilation_nnfw_quant8_2 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_2 +GeneratedTests.conv2d_dilation_nnfw_weight_as_input_quant8_2 +GeneratedTests.cos_ex_1D_float_nnfw +GeneratedTests.cos_ex_4D_float_nnfw +GeneratedTests.cos_ex_dynamic_nnfw +GeneratedTests.dequantize_v1_2_3d_quant8_symm +GeneratedTests.dequantize_v1_2_4d_quant8_symm +GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim +GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim +GeneratedTests.dequantize_v1_2 +GeneratedTests.dequantize_v1_2_zero_sized +GeneratedTests.dequantize_v1_2_zero_sized_float16 +GeneratedTests.div_dynamic_nnfw +GeneratedTests.einsum_ex_float_matmul_2x2_2 +GeneratedTests.einsum_ex_float_matmul_3x2_3 +GeneratedTests.einsum_ex_float_matmul_3x3_4 +GeneratedTests.einsum_ex_float_matmul_4x4_4 +GeneratedTests.einsum_ex_float_matmul_4x4_4_2 +GeneratedTests.equal_boolean +GeneratedTests.equal_dynamic_float_nnfw +GeneratedTests.exp_ +GeneratedTests.exp_2D_float_nnfw +GeneratedTests.exp_dynamic_nnfw +GeneratedTests.expand_dims_dynamic_nnfw_1 +GeneratedTests.expand_dims_dynamic_nnfw_2 +GeneratedTests.fill_ex_1D_float +GeneratedTests.fill_ex_4D_float +GeneratedTests.fill_ex_dynamic_nnfw +GeneratedTests.fully_connected_dynamic_nnfw +GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw +GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 +GeneratedTests.gather_dynamic_nnfw +GeneratedTests.gather_float16 +GeneratedTests.gather_float16_2 +GeneratedTests.gather_float16_3 +GeneratedTests.gather_float16_4 +GeneratedTests.gather_float16_5 +GeneratedTests.gather_float16_6 +GeneratedTests.gather_float16_7 +GeneratedTests.gather_float16_8 +GeneratedTests.greater_dynamic_float_nnfw +GeneratedTests.greater_equal_boolean +GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw +GeneratedTests.less_boolean +GeneratedTests.less_dynamic_float_nnfw 
+GeneratedTests.less_equal_dynamic_float_nnfw +GeneratedTests.log_4D_float_nnfw +GeneratedTests.log_dynamic_nnfw +GeneratedTests.log_softmax_nnfw +GeneratedTests.log_softmax_nnfw_2 +GeneratedTests.log_softmax_nnfw_3 +GeneratedTests.log_softmax_nnfw_4 +GeneratedTests.log_softmax_nnfw_5 +GeneratedTests.log_softmax_nnfw_quant8 +GeneratedTests.logical_not +GeneratedTests.logical_not_1D_nnfw +GeneratedTests.logical_not_4D_nnfw +GeneratedTests.logical_not_dynamic_nnfw +GeneratedTests.logical_or_dynamic_nnfw +GeneratedTests.logistic_dynamic_nnfw +GeneratedTests.lsh_projection +GeneratedTests.lsh_projection_2 +GeneratedTests.lsh_projection_weights_as_inputs +GeneratedTests.lstm +GeneratedTests.lstm2 +GeneratedTests.lstm2_state +GeneratedTests.lstm2_state2 +GeneratedTests.lstm3 +GeneratedTests.lstm3_state +GeneratedTests.lstm3_state2 +GeneratedTests.lstm3_state3 +GeneratedTests.lstm_state +GeneratedTests.lstm_state2 +GeneratedTests.matrix_band_part_ex_4D_float +GeneratedTests.matrix_band_part_ex_dynamic_nnfw +GeneratedTests.maximum_dynamic_nnfw +GeneratedTests.minimum_dynamic_nnfw +GeneratedTests.mul_dynamic_nnfw +GeneratedTests.neg +GeneratedTests.neg_dynamic_nnfw +GeneratedTests.not_equal_boolean +GeneratedTests.not_equal_dynamic_float_nnfw +GeneratedTests.one_hot_ex_dynamic_nnfw +GeneratedTests.pack_ex_dynamic_nnfw +GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 +GeneratedTests.pow_2D_float_nnfw +GeneratedTests.pow_broadcast_float_nnfw +GeneratedTests.pow_broadcast_float_nnfw_2 +GeneratedTests.pow_broadcast_float_nnfw_3 +GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized +GeneratedTests.range_ex_float_1 +GeneratedTests.range_ex_float_1_all_constant_inputs +GeneratedTests.range_ex_float_1_dynamic_nnfw +GeneratedTests.range_ex_float_2 +GeneratedTests.range_ex_float_2_dynamic_nnfw +GeneratedTests.reduce_all +GeneratedTests.reduce_all_2 +GeneratedTests.reduce_all_2D_nnfw +GeneratedTests.reduce_all_3 +GeneratedTests.reduce_all_4D_nnfw +GeneratedTests.reduce_all_dynamic_nnfw +GeneratedTests.reduce_any +GeneratedTests.reduce_any_2 +GeneratedTests.reduce_any_2D_nnfw +GeneratedTests.reduce_any_3 +GeneratedTests.reduce_any_4D_nnfw +GeneratedTests.reduce_max_2D_int32_nnfw +GeneratedTests.reduce_max_quant8 +GeneratedTests.reduce_mean_dynamic_1_nnfw +GeneratedTests.reduce_mean_dynamic_2_nnfw +GeneratedTests.reduce_min_dynamic_nnfw +GeneratedTests.reduce_prod +GeneratedTests.reduce_prod_2 +GeneratedTests.reduce_prod_2D_float_nnfw +GeneratedTests.reduce_prod_3 +GeneratedTests.reduce_prod_4 +GeneratedTests.reduce_prod_4D_float_nnfw +GeneratedTests.reduce_prod_4D_float_reducing_C_nnfw +GeneratedTests.reduce_prod_4D_float_reducing_HW_nnfw +GeneratedTests.reduce_prod_dynamic_1_nnfw +GeneratedTests.reduce_prod_dynamic_2_nnfw +GeneratedTests.reduce_sum_dynamic_1_nnfw +GeneratedTests.reduce_sum_dynamic_2_nnfw +GeneratedTests.reshape_dynamic_nnfw +GeneratedTests.resize_nearest_neighbor_shape_nhwc +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_shape_nchw 
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_scale_nchw +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_2 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_2 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_3 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3 +GeneratedTests.resize_nearest_neighbor_shape_nchw_3 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_3 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3 +GeneratedTests.resize_nearest_neighbor_scale_nchw_3 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_4 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_4 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_5 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5 +GeneratedTests.resize_nearest_neighbor_shape_nchw_5 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_5 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5 +GeneratedTests.resize_nearest_neighbor_scale_nchw_5 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_6 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6 +GeneratedTests.resize_nearest_neighbor_shape_nchw_6 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_6 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6 +GeneratedTests.resize_nearest_neighbor_scale_nchw_6 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_7 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_7 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_8 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8 +GeneratedTests.resize_nearest_neighbor_shape_nchw_8 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc 
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2 +GeneratedTests.reverse_ex_1d +GeneratedTests.reverse_ex_3d +GeneratedTests.reverse_ex_dynamic_1D +GeneratedTests.reverse_ex_dynamic_3D +GeneratedTests.rnn +GeneratedTests.rnn_state +GeneratedTests.round_ex_1D_float +GeneratedTests.round_ex_4D_float +GeneratedTests.round_ex_dynamic_nnfw +GeneratedTests.rsqrt +GeneratedTests.rsqrt_dynamic_nnfw +GeneratedTests.select_v1_2_five_dim +GeneratedTests.select_v1_2_five_dim_quant8 +GeneratedTests.select_v1_2_one_dim +GeneratedTests.select_v1_2_one_dim_quant8 +GeneratedTests.select_v1_2_two_dim +GeneratedTests.select_v1_2_two_dim_quant8 +GeneratedTests.select_v2_ex_broadcast_1d_single_value +GeneratedTests.select_v2_ex_broadcast_2d_one +GeneratedTests.select_v2_ex_broadcast_2d_two +GeneratedTests.select_v2_ex_broadcast_2d_two_dynamic_nnfw +GeneratedTests.select_v2_ex_broadcast_less_4d +GeneratedTests.select_v2_ex_float +GeneratedTests.shape_ex_dynamic_nnfw +GeneratedTests.sin_1D_float_nnfw +GeneratedTests.sin_4D_float_nnfw +GeneratedTests.sin_dynamic_nnfw +GeneratedTests.slice +GeneratedTests.slice_2 +GeneratedTests.slice_3 +GeneratedTests.slice_4 +GeneratedTests.slice_5 +GeneratedTests.slice_6 +GeneratedTests.slice_7 +GeneratedTests.slice_8 +GeneratedTests.slice_dynamic_nnfw +GeneratedTests.slice_zero_sized +GeneratedTests.slice_zero_sized_quant8 +GeneratedTests.softmax_dynamic_nnfw +GeneratedTests.space_to_batch_float_1_nnfw +GeneratedTests.space_to_batch_float_2 +GeneratedTests.space_to_batch_float_3 +GeneratedTests.space_to_batch_dynamic_float_nnfw +GeneratedTests.space_to_batch_quant8_1_nnfw +GeneratedTests.space_to_batch_quant8_2 +GeneratedTests.space_to_batch_quant8_2_nnfw +GeneratedTests.space_to_batch_quant8_3 +GeneratedTests.split_dynamic_float_nnfw +GeneratedTests.split_float_5_axis_as_input_nnfw +GeneratedTests.split_v_ex_1D_float_1_nnfw +GeneratedTests.split_v_ex_1D_float_2_nnfw +GeneratedTests.split_v_ex_1D_int32_nnfw +GeneratedTests.split_v_ex_4D_float_1_nnfw +GeneratedTests.split_v_ex_4D_float_2_nnfw +GeneratedTests.split_v_ex_4D_float_3_nnfw +GeneratedTests.split_v_ex_4D_float_4_nnfw +GeneratedTests.split_v_ex_4D_int32_1_nnfw +GeneratedTests.split_v_ex_4D_int32_2_nnfw +GeneratedTests.split_v_ex_4D_int32_3_nnfw +GeneratedTests.split_v_ex_4D_int32_4_nnfw +GeneratedTests.sqrt_ +GeneratedTests.squared_difference_ex_dynamic_nnfw +GeneratedTests.squeeze_dynamic_float_nnfw +GeneratedTests.stateless_random_uniform_ex_nnfw +GeneratedTests.strided_slice_dynamic_nnfw +GeneratedTests.sub_dynamic_nnfw +GeneratedTests.sub_v1_2_zero_sized +GeneratedTests.sub_v1_2_zero_sized_quant8 +GeneratedTests.svdf +GeneratedTests.svdf2 +GeneratedTests.svdf_bias_present +GeneratedTests.svdf_state +GeneratedTests.tanh_v1_2 +GeneratedTests.tanh_v1_2_zero_sized +GeneratedTests.tanh_v1_2_zero_sized_quant8 +GeneratedTests.tanh_v1_dynamic_nnfw +GeneratedTests.tile_1 +GeneratedTests.tile_1_dynamic_float32_nnfw +GeneratedTests.tile_1_float16 +GeneratedTests.tile_1_quant8 +GeneratedTests.tile_2 +GeneratedTests.tile_2_dynamic_float32_nnfw +GeneratedTests.tile_2_float16 +GeneratedTests.tile_2_int32 +GeneratedTests.tile_2_quant8 +GeneratedTests.tile_3 
+GeneratedTests.tile_3_dynamic_float32_nnfw +GeneratedTests.tile_3_float16 +GeneratedTests.tile_3_int32 +GeneratedTests.tile_3_quant8 +GeneratedTests.topk_v2 +GeneratedTests.topk_v2_1D_float_nnfw +GeneratedTests.topk_v2_1D_int32_nnfw +GeneratedTests.topk_v2_1D_quant8_nnfw +GeneratedTests.topk_v2_2 +GeneratedTests.topk_v2_2D_float_nnfw +GeneratedTests.topk_v2_2D_int32_nnfw +GeneratedTests.topk_v2_2D_quant8_nnfw +GeneratedTests.topk_v2_3 +GeneratedTests.topk_v2_4 +GeneratedTests.topk_v2_5 +GeneratedTests.topk_v2_6 +GeneratedTests.transpose_dynamic_nnfw +GeneratedTests.transpose_float_1_perms_as_input_nnfw +GeneratedTests.transpose_v1_2_zero_sized +GeneratedTests.transpose_v1_2_zero_sized_quant8 +GeneratedTests.unidirectional_sequence_lstm_1step +GeneratedTests.unidirectional_sequence_lstm_batch_major_norm_peephole_projection +GeneratedTests.unidirectional_sequence_lstm_batch_major_peephole_projection_bias +GeneratedTests.unidirectional_sequence_lstm_dynamic_nnfw +GeneratedTests.unidirectional_sequence_lstm_layer_norm_cifg_peephole +GeneratedTests.unidirectional_sequence_lstm_norm_peephole_projection +GeneratedTests.unpack_ex_dynamic_nnfw +GeneratedTests.zeros_like_ex_2D_float +GeneratedTests.zeros_like_ex_4D_int32 +GeneratedTests.zeros_like_ex_dynamic_float32 diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-android.cpu new file mode 100644 index 0000000..a64ffca --- /dev/null +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.cpu @@ -0,0 +1,231 @@ +GeneratedTests.abs_ +GeneratedTests.cast_float16_to_float16 +GeneratedTests.cast_float16_to_float32 +GeneratedTests.cast_float16_to_float32_relaxed +GeneratedTests.cast_float16_to_int32 +GeneratedTests.cast_float16_to_quant8 +GeneratedTests.cast_float16_to_quant8_overflow +GeneratedTests.cast_float32_to_float16 +GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_quant8_overflow +GeneratedTests.cast_float32_to_quant8_overflow_relaxed +GeneratedTests.cast_int32_to_float16 +GeneratedTests.cast_int32_to_quant8_overflow +GeneratedTests.cast_quant8_to_float16 +GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim +GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim +GeneratedTests.dequantize_v1_2 +GeneratedTests.dequantize_v1_2_zero_sized +GeneratedTests.dequantize_v1_2_zero_sized_float16 +GeneratedTests.embedding_lookup +GeneratedTests.embedding_lookup_2d_nnfw +GeneratedTests.embedding_lookup_4d_nnfw +GeneratedTests.equal_broadcast_float_nnfw +GeneratedTests.exp_ +GeneratedTests.floor_ +GeneratedTests.gather_float16 +GeneratedTests.gather_float16_2 +GeneratedTests.gather_float16_3 +GeneratedTests.gather_float16_4 +GeneratedTests.gather_float16_5 +GeneratedTests.gather_float16_6 +GeneratedTests.gather_float16_7 +GeneratedTests.gather_float16_8 +GeneratedTests.hashtable_lookup_float +GeneratedTests.hashtable_lookup_float_4D_nnfw +GeneratedTests.hashtable_lookup_quant8 +GeneratedTests.l2_pool_float +GeneratedTests.l2_pool_float_2 +GeneratedTests.l2_pool_float_large +GeneratedTests.local_response_norm_float_1 +GeneratedTests.local_response_norm_float_2 +GeneratedTests.local_response_norm_float_3 +GeneratedTests.local_response_norm_float_4 +GeneratedTests.logical_not +GeneratedTests.lsh_projection +GeneratedTests.lsh_projection_2 +GeneratedTests.lsh_projection_weights_as_inputs +GeneratedTests.lstm2 +GeneratedTests.lstm2_state +GeneratedTests.lstm2_state2 +GeneratedTests.maximum_broadcast_quant8 +GeneratedTests.maximum_overflow +GeneratedTests.maximum_simple_quant8 
+GeneratedTests.minimum_broadcast_quant8 +GeneratedTests.minimum_overflow +GeneratedTests.minimum_simple_quant8 +GeneratedTests.neg +GeneratedTests.neg_3D_int_nnfw +GeneratedTests.neg_4D_int_nnfw +GeneratedTests.prelu +GeneratedTests.prelu_broadcast_float_1_nnfw +GeneratedTests.prelu_broadcast_quant8_1_nnfw +GeneratedTests.prelu_float_1_nnfw +GeneratedTests.prelu_quant8 +GeneratedTests.prelu_quant8_1_nnfw +GeneratedTests.prelu_quant8_2 +GeneratedTests.prelu_quant8_3 +GeneratedTests.prelu_quant8_4 +GeneratedTests.prelu_weight_as_input +GeneratedTests.prelu_weight_as_input_quant8 +GeneratedTests.prelu_weight_as_input_quant8_2 +GeneratedTests.prelu_weight_as_input_quant8_3 +GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized +GeneratedTests.reduce_max_quant8 +GeneratedTests.reduce_max_quant8_1_nnfw +GeneratedTests.reduce_max_quant8_2 +GeneratedTests.reduce_max_quant8_2_nnfw +GeneratedTests.reduce_max_quant8_3 +GeneratedTests.reduce_max_quant8_4 +GeneratedTests.reduce_min_quant8 +GeneratedTests.reduce_min_quant8_2 +GeneratedTests.reduce_min_quant8_3 +GeneratedTests.reduce_min_quant8_4 +GeneratedTests.relu1_float_1 +GeneratedTests.relu1_float_2 +GeneratedTests.relu1_quant8_1 +GeneratedTests.relu1_quant8_2 +GeneratedTests.relu6_quant8_1 +GeneratedTests.relu6_quant8_2 +GeneratedTests.relu_quant8_1 +GeneratedTests.relu_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nhwc +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_shape_nchw +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_scale_nchw +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_2 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_2 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_2 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_2 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_3 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3 +GeneratedTests.resize_nearest_neighbor_shape_nchw_3 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_3 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3 +GeneratedTests.resize_nearest_neighbor_scale_nchw_3 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_4 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_4 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_4 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_4 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_5 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5 +GeneratedTests.resize_nearest_neighbor_shape_nchw_5 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5 
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5 +GeneratedTests.resize_nearest_neighbor_scale_nchw_5 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_6 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6 +GeneratedTests.resize_nearest_neighbor_shape_nchw_6 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_6 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6 +GeneratedTests.resize_nearest_neighbor_scale_nchw_6 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_7 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_7 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_7 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_7 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_8 +GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8 +GeneratedTests.resize_nearest_neighbor_shape_nchw_8 +GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_8 +GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_8 +GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2 +GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2 +GeneratedTests.rnn +GeneratedTests.rnn_state +GeneratedTests.rsqrt +GeneratedTests.select_v1_2_five_dim +GeneratedTests.select_v1_2_five_dim_quant8 +GeneratedTests.select_v1_2_one_dim_quant8 +GeneratedTests.select_v1_2_two_dim_quant8 +GeneratedTests.slice_5 +GeneratedTests.slice_6 +GeneratedTests.slice_8 +GeneratedTests.slice_zero_sized +GeneratedTests.slice_zero_sized_quant8 +GeneratedTests.sqrt_ +GeneratedTests.sqrt_1D_float_nnfw +GeneratedTests.sqrt_2D_float_nnfw +GeneratedTests.sqrt_3D_float_nnfw +GeneratedTests.sqrt_4D_float_nnfw +GeneratedTests.strided_slice_qaunt8_10 +GeneratedTests.strided_slice_qaunt8_11 +GeneratedTests.strided_slice_quant8_1 +GeneratedTests.strided_slice_quant8_2 +GeneratedTests.strided_slice_quant8_3 +GeneratedTests.strided_slice_quant8_4 +GeneratedTests.strided_slice_quant8_5 +GeneratedTests.strided_slice_quant8_6 +GeneratedTests.strided_slice_quant8_7 +GeneratedTests.strided_slice_quant8_8 +GeneratedTests.strided_slice_quant8_9 +GeneratedTests.sub_v1_2_zero_sized +GeneratedTests.sub_v1_2_zero_sized_quant8 +GeneratedTests.svdf +GeneratedTests.svdf2 +GeneratedTests.svdf_bias_present +GeneratedTests.svdf_state +GeneratedTests.tanh_v1_2 +GeneratedTests.tanh_v1_2_zero_sized +GeneratedTests.tanh_v1_2_zero_sized_quant8 +GeneratedTests.tile_1_float16 +GeneratedTests.tile_1_quant8 +GeneratedTests.tile_2_float16 +GeneratedTests.tile_2_int32 +GeneratedTests.tile_2_quant8 +GeneratedTests.tile_3_float16 +GeneratedTests.tile_3_int32 
+GeneratedTests.tile_3_quant8 +GeneratedTests.topk_v2 +GeneratedTests.topk_v2_1D_float_nnfw +GeneratedTests.topk_v2_1D_int32_nnfw +GeneratedTests.topk_v2_1D_quant8_nnfw +GeneratedTests.topk_v2_2 +GeneratedTests.topk_v2_2D_float_nnfw +GeneratedTests.topk_v2_2D_int32_nnfw +GeneratedTests.topk_v2_2D_quant8_nnfw +GeneratedTests.topk_v2_3 +GeneratedTests.topk_v2_4 +GeneratedTests.topk_v2_5 +GeneratedTests.topk_v2_6 +GeneratedTests.transpose_conv_ex_float_1 +GeneratedTests.transpose_conv_ex_float_2 +GeneratedTests.transpose_conv_ex_float_3 +GeneratedTests.transpose_conv_ex_float_4 +GeneratedTests.transpose_v1_2_zero_sized +GeneratedTests.transpose_v1_2_zero_sized_quant8 diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu index 8d5428a..a64ffca 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu @@ -12,11 +12,6 @@ GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 -GeneratedTests.depth_to_space_float_1 -GeneratedTests.depth_to_space_float_2 -GeneratedTests.depth_to_space_float_3 -GeneratedTests.depth_to_space_quant8_1 -GeneratedTests.depth_to_space_quant8_2 GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim GeneratedTests.dequantize_v1_2 @@ -46,14 +41,6 @@ GeneratedTests.local_response_norm_float_1 GeneratedTests.local_response_norm_float_2 GeneratedTests.local_response_norm_float_3 GeneratedTests.local_response_norm_float_4 -GeneratedTests.logical_and_1D_nnfw -GeneratedTests.logical_and_2D_nnfw -GeneratedTests.logical_and_3D_nnfw -GeneratedTests.logical_and_4D_nnfw -GeneratedTests.logical_and_broadcast -GeneratedTests.logical_and_broadcast_4D_2D_nnfw -GeneratedTests.logical_and_broadcast_nnfw -GeneratedTests.logical_and_simple GeneratedTests.logical_not GeneratedTests.lsh_projection GeneratedTests.lsh_projection_2 diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu index 8d5428a..a64ffca 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu @@ -12,11 +12,6 @@ GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 -GeneratedTests.depth_to_space_float_1 -GeneratedTests.depth_to_space_float_2 -GeneratedTests.depth_to_space_float_3 -GeneratedTests.depth_to_space_quant8_1 -GeneratedTests.depth_to_space_quant8_2 GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim GeneratedTests.dequantize_v1_2 @@ -46,14 +41,6 @@ GeneratedTests.local_response_norm_float_1 GeneratedTests.local_response_norm_float_2 GeneratedTests.local_response_norm_float_3 GeneratedTests.local_response_norm_float_4 -GeneratedTests.logical_and_1D_nnfw -GeneratedTests.logical_and_2D_nnfw -GeneratedTests.logical_and_3D_nnfw -GeneratedTests.logical_and_4D_nnfw -GeneratedTests.logical_and_broadcast -GeneratedTests.logical_and_broadcast_4D_2D_nnfw -GeneratedTests.logical_and_broadcast_nnfw -GeneratedTests.logical_and_simple GeneratedTests.logical_not GeneratedTests.lsh_projection GeneratedTests.lsh_projection_2 diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp index ba14120..e0ed8d7 100644 --- 
a/tests/nnapi/nnapi_gtest.skip.noarch.interp +++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp @@ -23,6 +23,12 @@ GeneratedTests.argmax_neg_axis_float_nnfw GeneratedTests.argmax_neg_axis_int32_nnfw GeneratedTests.argmax_quant8_neg_axis_nnfw GeneratedTests.argmax_quant8_nnfw +GeneratedTests.argmin_1 +GeneratedTests.argmin_1_quant8 +GeneratedTests.argmin_2 +GeneratedTests.argmin_2_quant8 +GeneratedTests.argmin_3 +GeneratedTests.argmin_3_quant8 GeneratedTests.avg_pool_quant8_1 GeneratedTests.avg_pool_quant8_2 GeneratedTests.avg_pool_quant8_3 diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu index cb0d07c..cad0729 100644 --- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu @@ -12,11 +12,6 @@ GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 -GeneratedTests.depth_to_space_float_1 -GeneratedTests.depth_to_space_float_2 -GeneratedTests.depth_to_space_float_3 -GeneratedTests.depth_to_space_quant8_1 -GeneratedTests.depth_to_space_quant8_2 GeneratedTests.dequantize_v1_2_3d_per_channel_first_dim GeneratedTests.dequantize_v1_2_3d_per_channel_second_dim GeneratedTests.dequantize_v1_2 @@ -46,14 +41,6 @@ GeneratedTests.local_response_norm_float_1 GeneratedTests.local_response_norm_float_2 GeneratedTests.local_response_norm_float_3 GeneratedTests.local_response_norm_float_4 -GeneratedTests.logical_and_1D_nnfw -GeneratedTests.logical_and_2D_nnfw -GeneratedTests.logical_and_3D_nnfw -GeneratedTests.logical_and_4D_nnfw -GeneratedTests.logical_and_broadcast -GeneratedTests.logical_and_broadcast_4D_2D_nnfw -GeneratedTests.logical_and_broadcast_nnfw -GeneratedTests.logical_and_simple GeneratedTests.logical_not GeneratedTests.lsh_projection GeneratedTests.lsh_projection_2 diff --git a/tests/nnapi/specs/skip/V1_2/argmin_1.mod.py b/tests/nnapi/specs/V1_2/argmin_1.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/argmin_1.mod.py rename to tests/nnapi/specs/V1_2/argmin_1.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/argmin_2.mod.py b/tests/nnapi/specs/V1_2/argmin_2.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/argmin_2.mod.py rename to tests/nnapi/specs/V1_2/argmin_2.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/argmin_3.mod.py b/tests/nnapi/specs/V1_2/argmin_3.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/argmin_3.mod.py rename to tests/nnapi/specs/V1_2/argmin_3.mod.py diff --git a/tests/nnapi/src/TestGenerated.cpp b/tests/nnapi/src/TestGenerated.cpp index 2347353..093e5a9 100644 --- a/tests/nnapi/src/TestGenerated.cpp +++ b/tests/nnapi/src/TestGenerated.cpp @@ -256,8 +256,11 @@ void GeneratedTests::SetUp() { mOldComputeMode = Execution::setComputeMode(GetParam()); #endif // Fix for onert: Fix file path for linux +#ifndef __ANDROID__ char cacheDirTemp[] = "/tmp/TestCompilationCachingXXXXXX"; - //char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX"; +#else + char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX"; +#endif char* cacheDir = mkdtemp(cacheDirTemp); ASSERT_NE(cacheDir, nullptr); mCacheDir = cacheDir; diff --git a/tests/nnapi/src/TestValidation.cpp b/tests/nnapi/src/TestValidation.cpp index 45432c0..3e749b8 100644 --- a/tests/nnapi/src/TestValidation.cpp +++ b/tests/nnapi/src/TestValidation.cpp @@ -29,13 +29,19 @@ // This file tests all the validations done by the Neural Networks 
API. namespace { +#ifndef PATH_MAX #define PATH_MAX 256 +#endif static int shmem_num = 0; static int shmem_create_region(size_t size) { char temp[PATH_MAX]; +#ifndef __ANDROID__ snprintf(temp, sizeof(temp), "/tmp/nn-shmem-%d-%d-XXXXXXXXX", getpid(), shmem_num++); +#else + snprintf(temp, sizeof(temp), "/data/local/tmp/nn-shmem-%d-%d-XXXXXXXXX", getpid(), shmem_num++); +#endif // Set umask and recover after generate temporary file to avoid security issue mode_t umaskPrev = umask(S_IRUSR|S_IWUSR); diff --git a/tests/nnfw_api/CMakeLists.txt b/tests/nnfw_api/CMakeLists.txt index aa3a942..40142dd 100644 --- a/tests/nnfw_api/CMakeLists.txt +++ b/tests/nnfw_api/CMakeLists.txt @@ -19,6 +19,11 @@ if(ARMCompute_FOUND) target_compile_definitions(${RUNTIME_NNFW_API_TEST} PRIVATE TEST_ACL_BACKEND) endif(ARMCompute_FOUND) +nnfw_find_package(Xnnpack QUIET) +if(Xnnpack_FOUND) + target_compile_definitions(${RUNTIME_NNFW_API_TEST} PRIVATE TEST_XNNPACK_BACKEND) +endif(Xnnpack_FOUND) + set(RUNTIME_NNFW_API_TEST_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(${RUNTIME_NNFW_API_TEST} PRIVATE ${RUNTIME_NNFW_API_TEST_INCLUDE}) diff --git a/tests/nnfw_api/src/CircleGen.cc b/tests/nnfw_api/src/CircleGen.cc index 1dd3f9b..87b38f2 100644 --- a/tests/nnfw_api/src/CircleGen.cc +++ b/tests/nnfw_api/src/CircleGen.cc @@ -78,7 +78,7 @@ CircleBuffer CircleGen::finish() for (auto &ctx : _subgraph_contexts) subgraphs.push_back(buildSubGraph(ctx)); auto model = - circle::CreateModelDirect(_fbb, 3, &_opcodes, &subgraphs, "CircleGen generated", &_buffers); + circle::CreateModelDirect(_fbb, 3, &_opcodes, &subgraphs, "CircleGen generated", &_buffers); _fbb.Finish(model); return CircleBuffer{std::move(_fbb)}; } @@ -107,13 +107,20 @@ uint32_t CircleGen::addOperatorArgMax(const OperatorParams ¶ms, circle::Tens circle::BuiltinOptions_ArgMaxOptions, options); } +uint32_t CircleGen::addOperatorArgMin(const OperatorParams ¶ms, circle::TensorType output_type) +{ + auto options = circle::CreateArgMaxOptions(_fbb, output_type).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_ARG_MIN, + circle::BuiltinOptions_ArgMinOptions, options); +} + uint32_t CircleGen::addOperatorAveragePool2D(const OperatorParams ¶ms, circle::Padding padding, int stride_w, int stride_h, int filter_w, int filter_h, circle::ActivationFunctionType actfn) { auto options = - circle::CreatePool2DOptions(_fbb, padding, stride_w, stride_h, filter_w, filter_h, actfn) - .Union(); + circle::CreatePool2DOptions(_fbb, padding, stride_w, stride_h, filter_w, filter_h, actfn) + .Union(); return addOperatorWithOptions(params, circle::BuiltinOperator_AVERAGE_POOL_2D, circle::BuiltinOptions_Pool2DOptions, options); } @@ -134,6 +141,18 @@ uint32_t CircleGen::addOperatorConcatenation(const OperatorParams ¶ms, int a circle::BuiltinOptions_ConcatenationOptions, options); } +uint32_t CircleGen::addOperatorConv2D(const OperatorParams ¶ms, circle::Padding padding, + int stride_w, int stride_h, + circle::ActivationFunctionType actfn, int dilation_w, + int dilation_h) +{ + auto options = + circle::CreateConv2DOptions(_fbb, padding, stride_w, stride_h, actfn, dilation_w, dilation_h) + .Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_CONV_2D, + circle::BuiltinOptions_Conv2DOptions, options); +} + uint32_t CircleGen::addOperatorCos(const OperatorParams ¶ms) { auto options = circle::CreateCosOptions(_fbb).Union(); @@ -141,6 +160,13 @@ uint32_t CircleGen::addOperatorCos(const OperatorParams ¶ms) 
circle::BuiltinOptions_CosOptions, options); } +uint32_t CircleGen::addOperatorDepthToSpace(const OperatorParams ¶ms, int32_t block_size) +{ + auto options = circle::CreateDepthToSpaceOptions(_fbb, block_size).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_DEPTH_TO_SPACE, + circle::BuiltinOptions_DepthToSpaceOptions, options); +} + uint32_t CircleGen::addOperatorDepthwiseConv2D(const OperatorParams ¶ms, circle::Padding padding, int stride_w, int stride_h, int depth_multiplier, @@ -148,13 +174,19 @@ uint32_t CircleGen::addOperatorDepthwiseConv2D(const OperatorParams ¶ms, int dilation_h) { auto options = - circle::CreateDepthwiseConv2DOptions(_fbb, padding, stride_w, stride_h, depth_multiplier, - actfn, dilation_w, dilation_h) - .Union(); + circle::CreateDepthwiseConv2DOptions(_fbb, padding, stride_w, stride_h, depth_multiplier, actfn, + dilation_w, dilation_h) + .Union(); return addOperatorWithOptions(params, circle::BuiltinOperator_DEPTHWISE_CONV_2D, circle::BuiltinOptions_DepthwiseConv2DOptions, options); } +uint32_t CircleGen::addOperatorElu(const OperatorParams ¶ms) +{ + return addOperatorWithOptions(params, circle::BuiltinOperator_ELU, circle::BuiltinOptions_NONE, + 0); +} + uint32_t CircleGen::addOperatorEqual(const OperatorParams ¶ms) { auto options = circle::CreateEqualOptions(_fbb).Union(); @@ -162,13 +194,20 @@ uint32_t CircleGen::addOperatorEqual(const OperatorParams ¶ms) circle::BuiltinOptions_EqualOptions, options); } +uint32_t CircleGen::addOperatorExpandDims(const OperatorParams ¶ms) +{ + auto options = circle::CreateEqualOptions(_fbb).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_EXPAND_DIMS, + circle::BuiltinOptions_ExpandDimsOptions, options); +} + uint32_t CircleGen::addOperatorFullyConnected(const OperatorParams ¶ms, circle::FullyConnectedOptionsWeightsFormat weights_format) { auto options = - circle::CreateFullyConnectedOptions(_fbb, circle::ActivationFunctionType_NONE, weights_format) - .Union(); + circle::CreateFullyConnectedOptions(_fbb, circle::ActivationFunctionType_NONE, weights_format) + .Union(); return addOperatorWithOptions(params, circle::BuiltinOperator_FULLY_CONNECTED, circle::BuiltinOptions_FullyConnectedOptions, options); } @@ -214,6 +253,13 @@ uint32_t CircleGen::addOperatorLogSoftmax(const OperatorParams ¶ms) circle::BuiltinOptions_LogSoftmaxOptions, options); } +uint32_t CircleGen::addOperatorMean(const OperatorParams ¶ms, bool keep_dims) +{ + auto options = circle::CreateReducerOptions(_fbb, keep_dims).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_MEAN, + circle::BuiltinOptions_ReducerOptions, options); +} + uint32_t CircleGen::addOperatorNeg(const OperatorParams ¶ms) { auto options = circle::CreatePadOptions(_fbb).Union(); @@ -277,7 +323,7 @@ uint32_t CircleGen::addOperatorResizeBilinear(const OperatorParams ¶ms, bool bool half_pixel_centers) { auto options = - circle::CreateResizeBilinearOptions(_fbb, align_corners, half_pixel_centers).Union(); + circle::CreateResizeBilinearOptions(_fbb, align_corners, half_pixel_centers).Union(); return addOperatorWithOptions(params, circle::BuiltinOperator_RESIZE_BILINEAR, circle::BuiltinOptions_ResizeBilinearOptions, options); } @@ -329,7 +375,7 @@ uint32_t CircleGen::addOperatorStridedSlice(const OperatorParams ¶ms, int32_ { auto options = circle::CreateStridedSliceOptions(_fbb, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask) - .Union(); + .Union(); return addOperatorWithOptions(params, 
circle::BuiltinOperator_STRIDED_SLICE, circle::BuiltinOptions_StridedSliceOptions, options); } @@ -371,6 +417,19 @@ uint32_t CircleGen::addOperatorTranspose(const OperatorParams &params) circle::BuiltinOptions_TransposeOptions, options); } +uint32_t CircleGen::addOperatorSqrt(const OperatorParams &params) +{ + return addOperatorWithOptions(params, circle::BuiltinOperator_SQRT, circle::BuiltinOptions_NONE, + 0); +} + +uint32_t CircleGen::addOperatorSquare(const OperatorParams &params) +{ + auto options = circle::CreateSquareOptions(_fbb).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_SQUARE, + circle::BuiltinOptions_SquareOptions, options); +} + // NOTE Please add addOperator functions ABOVE this line // // % How to add a new addOperatorXXX function @@ -379,6 +438,9 @@ uint32_t CircleGen::addOperatorTranspose(const OperatorParams &params) // 2. Change enum BuiltinOperator // 3. Change enum BuiltinOptions // 4. Change CreateXXXOptions accordingly +// +// If the operator doesn't have an options table, remove the CreateXXXOptions call, +// and call addOperatorWithOptions with options_type = circle::BuiltinOptions_NONE and options = 0 // ===== Add Operator methods end ===== @@ -440,7 +502,7 @@ CircleGen::buildSparsityParameters(const SparsityParams &sp) flatbuffers::Offset> traversal_order; flatbuffers::Offset> block_map; flatbuffers::Offset>> - dim_metadata; + dim_metadata; traversal_order = _fbb.CreateVector(sp.traversal_order); block_map = _fbb.CreateVector(sp.block_map); @@ -451,8 +513,8 @@ CircleGen::buildSparsityParameters(const SparsityParams &sp) auto fb_array_segments = circle::CreateUint16VectorDirect(_fbb, &it._array_segments.u16); auto fb_array_indices = circle::CreateUint16VectorDirect(_fbb, &it._array_indices.u16); auto dim_metadata = circle::CreateDimensionMetadata( - _fbb, it._format, it._dense_size, it._array_segments_type, fb_array_segments.Union(), - it._array_indices_type, fb_array_indices.Union()); + _fbb, it._format, it._dense_size, it._array_segments_type, fb_array_segments.Union(), + it._array_indices_type, fb_array_indices.Union()); dim_metadata_vec.emplace_back(dim_metadata); } dim_metadata = _fbb.CreateVector(dim_metadata_vec); diff --git a/tests/nnfw_api/src/CircleGen.h b/tests/nnfw_api/src/CircleGen.h index 7da2459..6662183 100644 --- a/tests/nnfw_api/src/CircleGen.h +++ b/tests/nnfw_api/src/CircleGen.h @@ -67,15 +67,15 @@ public: DimMetaData() = delete; DimMetaData(SparseDimensionType format, std::vector array_segments, std::vector array_indices) - : _format{format}, - _array_segments_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector), - _array_indices_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector) + : _format{format}, + _array_segments_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector), + _array_indices_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector) { _array_segments.u16 = array_segments; _array_indices.u16 = array_indices; } DimMetaData(SparseDimensionType format, int32_t dense_size) - : _format{format}, _dense_size{dense_size} + : _format{format}, _dense_size{dense_size} { } SparseDimensionType _format{circle::DimensionType_DENSE}; @@ -139,6 +139,8 @@ public: uint32_t addOperatorAddN(const OperatorParams &params); uint32_t addOperatorArgMax(const OperatorParams &params, circle::TensorType output_type = circle::TensorType::TensorType_INT32); + uint32_t addOperatorArgMin(const OperatorParams &params, + circle::TensorType output_type = circle::TensorType::TensorType_INT32); uint32_t addOperatorAveragePool2D(const OperatorParams &params,
circle::Padding padding, int stride_w, int stride_h, int filter_w, int filter_h, circle::ActivationFunctionType actfn); @@ -146,17 +148,23 @@ public: circle::TensorType output_type); uint32_t addOperatorConcatenation(const OperatorParams ¶ms, int axis, circle::ActivationFunctionType actfn); + uint32_t addOperatorConv2D(const OperatorParams ¶ms, circle::Padding padding, int stride_w, + int stride_h, circle::ActivationFunctionType actfn, int dilation_w = 1, + int dilation_h = 1); uint32_t addOperatorCos(const OperatorParams ¶ms); + uint32_t addOperatorDepthToSpace(const OperatorParams ¶ms, int32_t block_size); uint32_t addOperatorDepthwiseConv2D(const OperatorParams ¶ms, circle::Padding padding, int stride_w, int stride_h, int depth_multiplier, circle::ActivationFunctionType actfn, int dilation_w = 1, int dilation_h = 1); + uint32_t addOperatorElu(const OperatorParams ¶ms); uint32_t addOperatorEqual(const OperatorParams ¶ms); + uint32_t addOperatorExpandDims(const OperatorParams ¶ms); uint32_t addOperatorFill(const OperatorParams ¶ms); uint32_t addOperatorFloor(const OperatorParams ¶ms); uint32_t addOperatorFullyConnected(const OperatorParams ¶ms, circle::FullyConnectedOptionsWeightsFormat weights_format = - circle::FullyConnectedOptionsWeightsFormat_DEFAULT); + circle::FullyConnectedOptionsWeightsFormat_DEFAULT); uint32_t addOperatorIf(const OperatorParams ¶ms, uint32_t then_subg, uint32_t else_subg); uint32_t addOperatorInstanceNorm(const OperatorParams ¶ms, float epsilon, circle::ActivationFunctionType actfn); @@ -164,6 +172,7 @@ public: uint32_t addOperatorLeakyRelu(const OperatorParams ¶ms, float alpha); uint32_t addOperatorLess(const OperatorParams ¶ms); uint32_t addOperatorLogSoftmax(const OperatorParams ¶ms); + uint32_t addOperatorMean(const OperatorParams ¶ms, bool keep_dims); uint32_t addOperatorNeg(const OperatorParams ¶ms); uint32_t addOperatorOneHot(const OperatorParams ¶ms, int32_t axis); uint32_t addOperatorPad(const OperatorParams ¶ms); @@ -185,6 +194,8 @@ public: uint32_t addOperatorSelect(const OperatorParams ¶ms); uint32_t addOperatorSelectV2(const OperatorParams ¶ms); uint32_t addOperatorSplit(const OperatorParams ¶ms, int32_t num_split); + uint32_t addOperatorSqrt(const OperatorParams ¶ms); + uint32_t addOperatorSquare(const OperatorParams ¶ms); uint32_t addOperatorStridedSlice(const OperatorParams ¶ms, int32_t begin_mask = 0, int32_t end_mask = 0, int32_t ellipsis_mask = 0, int32_t new_axis_mask = 0, int32_t shrink_axis_mask = 0); diff --git a/tests/nnfw_api/src/GenModelTest.h b/tests/nnfw_api/src/GenModelTest.h index 144c379..3583ce0 100644 --- a/tests/nnfw_api/src/GenModelTest.h +++ b/tests/nnfw_api/src/GenModelTest.h @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#ifndef __NNFW_API_TEST_GEN_MODEL_TEST_H__ +#define __NNFW_API_TEST_GEN_MODEL_TEST_H__ + #include #include @@ -224,10 +227,16 @@ public: _backends.push_back(backend); } #endif - if (backend == "cpu") + if (backend == "cpu" || backend == "ruy") { _backends.push_back(backend); } +#ifdef TEST_XNNPACK_BACKEND + if (backend == "xnnpack") + { + _backends.push_back(backend); + } +#endif } } @@ -241,6 +250,11 @@ public: */ void expectFailCompile() { _expected_fail_compile = true; } + /** + * @brief Expect failure while execution + */ + void expectFailExecution() { _expected_fail_execution = true; } + private: CircleBuffer _cbuf; std::vector _test_cases; @@ -248,6 +262,7 @@ private: std::unordered_map _output_sizes; bool _expected_fail_model_load{false}; bool _expected_fail_compile{false}; + bool _expected_fail_execution{false}; }; /** @@ -277,7 +292,7 @@ protected: NNFW_ENSURE_SUCCESS(nnfw_create_session(&_so.session)); auto &cbuf = _context->cbuf(); auto model_load_result = - nnfw_load_circle_from_buffer(_so.session, cbuf.buffer(), cbuf.size()); + nnfw_load_circle_from_buffer(_so.session, cbuf.buffer(), cbuf.size()); if (_context->expected_fail_model_load()) { ASSERT_NE(model_load_result, NNFW_STATUS_NO_ERROR); @@ -290,7 +305,7 @@ protected: if (_context->expected_fail_compile()) { - ASSERT_EQ(nnfw_prepare(_so.session), NNFW_STATUS_ERROR); + ASSERT_NE(nnfw_prepare(_so.session), NNFW_STATUS_NO_ERROR); NNFW_ENSURE_SUCCESS(nnfw_close_session(_so.session)); continue; @@ -362,7 +377,7 @@ protected: if (test_case.expected_fail_run()) { - ASSERT_EQ(nnfw_run(_so.session), NNFW_STATUS_ERROR); + ASSERT_NE(nnfw_run(_so.session), NNFW_STATUS_NO_ERROR); continue; } @@ -447,3 +462,5 @@ protected: SessionObjectGeneric _so; std::unique_ptr _context; }; + +#endif // __NNFW_API_TEST_GEN_MODEL_TEST_H__ diff --git a/tests/nnfw_api/src/ModelTestDynamicTensor.cc b/tests/nnfw_api/src/ModelTestDynamicTensor.cc index 459c2e8..1ed8f95 100644 --- a/tests/nnfw_api/src/ModelTestDynamicTensor.cc +++ b/tests/nnfw_api/src/ModelTestDynamicTensor.cc @@ -21,6 +21,7 @@ #include "fixtures.h" #include "CircleGen.h" #include "GenModelTest.h" +#include "NNPackages.h" // This macro can be used instead of using NNFW_ENSURE_SUCCESS especially with negative test. 
// E.g., setInputOutput() is written with this macro and the following check is available to check diff --git a/tests/nnfw_api/src/NNPackages.cc b/tests/nnfw_api/src/NNPackages.cc index d9b2526..11e0c8e 100644 --- a/tests/nnfw_api/src/NNPackages.cc +++ b/tests/nnfw_api/src/NNPackages.cc @@ -25,11 +25,14 @@ // NOTE Must match `enum TestPackages` const char *TEST_PACKAGE_NAMES[] = { - // for validation test - "add", "add_no_manifest", "add_invalid_manifest", + // for validation test + "add", + "add_no_manifest", + "add_invalid_manifest", - // for dynamic tensor test - "while_dynamic", "if_dynamic", + // for dynamic tensor test + "while_dynamic", + "if_dynamic", }; NNPackages &NNPackages::get() @@ -43,11 +46,11 @@ void NNPackages::init(const char *argv0) char raw_dir[1024]; char cwd[1024]; strncpy(raw_dir, argv0, sizeof(raw_dir) - 1); - dirname(raw_dir); - if (raw_dir[0] == '/') + char *dir_path = dirname(raw_dir); + if (dir_path[0] == '/') { // If it is an absolute path, just use it - _base_path = raw_dir; + _base_path = dir_path; } else { @@ -55,7 +58,7 @@ void NNPackages::init(const char *argv0) getcwd(cwd, sizeof(cwd)); _base_path = cwd; _base_path += "/"; - _base_path += raw_dir; + _base_path += dir_path; } } diff --git a/tests/nnfw_api/src/RegressionTests.cc b/tests/nnfw_api/src/RegressionTests.cc index 10d6e5d..de23339 100644 --- a/tests/nnfw_api/src/RegressionTests.cc +++ b/tests/nnfw_api/src/RegressionTests.cc @@ -116,11 +116,11 @@ TEST_F(RegressionTest, github_11748) uint8_t input_buf[new_dim * sizeof(float)]; NNFW_ENSURE_SUCCESS( - nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float))); + nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float))); uint8_t output_buf[new_dim * sizeof(float)]; NNFW_ENSURE_SUCCESS( - nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float))); + nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float))); NNFW_ENSURE_SUCCESS(nnfw_run(session)); @@ -134,9 +134,9 @@ TEST_F(RegressionTest, github_11748) // seems weird calling but anyway nnstreamer people case calls this again. 
// Anyways, runtime should work NNFW_ENSURE_SUCCESS( - nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float))); + nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float))); NNFW_ENSURE_SUCCESS( - nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float))); + nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float))); NNFW_ENSURE_SUCCESS(nnfw_run(session)); } @@ -166,9 +166,9 @@ TEST_F(RegressionTest, github_4585) std::vector out_buf{-1, -1}; NNFW_ENSURE_SUCCESS( - nnfw_set_input(session, 0, ti_new.dtype, in_buf.data(), in_buf.size() * sizeof(float))); + nnfw_set_input(session, 0, ti_new.dtype, in_buf.data(), in_buf.size() * sizeof(float))); NNFW_ENSURE_SUCCESS( - nnfw_set_output(session, 0, ti_new.dtype, out_buf.data(), out_buf.size() * sizeof(float))); + nnfw_set_output(session, 0, ti_new.dtype, out_buf.data(), out_buf.size() * sizeof(float))); NNFW_ENSURE_SUCCESS(nnfw_run(session)); diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc index fbe7214..5fbb844 100644 --- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc @@ -102,7 +102,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model) { // load model twice ASSERT_EQ(nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), NNFW_STATUS_INVALID_STATE); } diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc index 02bbd0e..d668a1c 100644 --- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc @@ -173,7 +173,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model) { // Load model twice ASSERT_EQ(nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), NNFW_STATUS_INVALID_STATE); } diff --git a/tests/nnfw_api/src/ValidationTestMultipleSessions.cc b/tests/nnfw_api/src/ValidationTestMultipleSessions.cc new file mode 100644 index 0000000..758e1db --- /dev/null +++ b/tests/nnfw_api/src/ValidationTestMultipleSessions.cc @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fixtures.h" + +TEST_F(ValidationTestTwoSessions, neg_two_sessions_create) +{ + ASSERT_EQ(nnfw_create_session(&_session1), NNFW_STATUS_NO_ERROR); + ASSERT_EQ(nnfw_create_session(nullptr), NNFW_STATUS_UNEXPECTED_NULL); + + ASSERT_EQ(nnfw_close_session(_session1), NNFW_STATUS_NO_ERROR); +} + +class AveragePoolModel +{ +public: + AveragePoolModel(int N, int H, int W, int C) + { + CircleGen cgen; + int in = cgen.addTensor({{N, H, W, C}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{N, H / 2, W / 2, C}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, 2, 2, + circle::ActivationFunctionType_NONE); + cgen.setInputsAndOutputs({in}, {out}); + cbuf = cgen.finish(); + }; + + CircleBuffer cbuf; +}; + +TEST_F(ValidationTestTwoSessionsCreated, two_sessions_run_simple_model) +{ + constexpr int N = 64, H = 64, W = 64, C = 3; + AveragePoolModel model(N, H, W, C); + + NNFW_ENSURE_SUCCESS( + nnfw_load_circle_from_buffer(_session1, model.cbuf.buffer(), model.cbuf.size())); + NNFW_ENSURE_SUCCESS( + nnfw_load_circle_from_buffer(_session2, model.cbuf.buffer(), model.cbuf.size())); + + NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session1, "cpu")); + NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session2, "cpu")); + + NNFW_ENSURE_SUCCESS(nnfw_prepare(_session1)); + NNFW_ENSURE_SUCCESS(nnfw_prepare(_session2)); + + constexpr int input_count = N * H * W * C; + constexpr int output_count = N * H / 2 * W / 2 * C; + + std::vector in_buf1(input_count); // any value + std::vector out_buf1(output_count); + + NNFW_ENSURE_SUCCESS(nnfw_set_input(_session1, 0, NNFW_TYPE_TENSOR_FLOAT32, in_buf1.data(), + in_buf1.size() * sizeof(float))); + NNFW_ENSURE_SUCCESS(nnfw_set_output(_session1, 0, NNFW_TYPE_TENSOR_FLOAT32, out_buf1.data(), + out_buf1.size() * sizeof(float))); + + std::vector in_buf2(input_count); // any value + std::vector out_buf2(output_count); + + NNFW_ENSURE_SUCCESS(nnfw_set_input(_session2, 0, NNFW_TYPE_TENSOR_FLOAT32, in_buf2.data(), + in_buf2.size() * sizeof(float))); + NNFW_ENSURE_SUCCESS(nnfw_set_output(_session2, 0, NNFW_TYPE_TENSOR_FLOAT32, out_buf2.data(), + out_buf2.size() * sizeof(float))); + + NNFW_ENSURE_SUCCESS(nnfw_run_async(_session1)); + NNFW_ENSURE_SUCCESS(nnfw_run_async(_session2)); + + NNFW_ENSURE_SUCCESS(nnfw_await(_session1)); + NNFW_ENSURE_SUCCESS(nnfw_await(_session2)); + + SUCCEED(); +} + +// TODO Write two-session-test with large models run by threads diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc index 40d3f93..cb07919 100644 --- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc +++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc @@ -21,7 +21,7 @@ TEST_F(ValidationTestSessionCreated, load_session_001) { // Existing model must ASSERT_EQ(nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), NNFW_STATUS_NO_ERROR); } @@ -36,7 +36,7 @@ TEST_F(ValidationTestSessionCreated, close_and_create_again) TEST_F(ValidationTestSessionCreated, neg_load_session_1) { ASSERT_EQ(nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath("nonexisting_directory").c_str()), + _session, NNPackages::get().getModelAbsolutePath("nonexisting_directory").c_str()), NNFW_STATUS_ERROR); } @@ -50,25 +50,25 @@ TEST_F(ValidationTestSessionCreated, neg_load_session_3) // Too long path 
const std::string long_path(1024, 'x'); ASSERT_EQ(nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath(long_path.c_str()).c_str()), + _session, NNPackages::get().getModelAbsolutePath(long_path.c_str()).c_str()), NNFW_STATUS_ERROR); } TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1) { ASSERT_EQ( - nnfw_load_model_from_file( - _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), - NNFW_STATUS_ERROR); + nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), + NNFW_STATUS_ERROR); ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) { - ASSERT_EQ(nnfw_load_model_from_file( - _session, - NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), - NNFW_STATUS_ERROR); + ASSERT_EQ( + nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), + NNFW_STATUS_ERROR); ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } diff --git a/tests/nnfw_api/src/ValidationTestSingleSession.cc b/tests/nnfw_api/src/ValidationTestSingleSession.cc index b134629..852d5cd 100644 --- a/tests/nnfw_api/src/ValidationTestSingleSession.cc +++ b/tests/nnfw_api/src/ValidationTestSingleSession.cc @@ -89,7 +89,7 @@ TEST_F(ValidationTestSingleSession, neg_load_model) { // Invalid state ASSERT_EQ(nnfw_load_model_from_file( - nullptr, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), + nullptr, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), NNFW_STATUS_UNEXPECTED_NULL); } diff --git a/tests/nnfw_api/src/fixtures.h b/tests/nnfw_api/src/fixtures.h index 21be22f..15f51eb 100644 --- a/tests/nnfw_api/src/fixtures.h +++ b/tests/nnfw_api/src/fixtures.h @@ -23,6 +23,7 @@ #include #include "NNPackages.h" +#include "CircleGen.h" #define NNFW_ENSURE_SUCCESS(EXPR) ASSERT_EQ((EXPR), NNFW_STATUS_NO_ERROR) @@ -68,6 +69,7 @@ protected: { ValidationTestSingleSession::SetUp(); ASSERT_EQ(nnfw_create_session(&_session), NNFW_STATUS_NO_ERROR); + ASSERT_NE(_session, nullptr); } void TearDown() override @@ -77,16 +79,36 @@ protected: } }; +inline CircleBuffer genAddModel() +{ + CircleGen cgen; + std::vector rhs_data{2}; + uint32_t rhs_buf = cgen.addBuffer(rhs_data); + int lhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, 0, "X_input"}); + int rhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, rhs_buf, "y_var"}); + int out = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, 0, "ADD_TOP"}); + cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE); + cgen.setInputsAndOutputs({lhs}, {out}); + return cgen.finish(); +} + template class ValidationTestModelLoaded : public ValidationTestSessionCreated { protected: void SetUp() override { ValidationTestSessionCreated::SetUp(); - ASSERT_EQ(nnfw_load_model_from_file(_session, - NNPackages::get().getModelAbsolutePath(PackageNo).c_str()), - NNFW_STATUS_NO_ERROR); - ASSERT_NE(_session, nullptr); + if (PackageNo == NNPackages::ADD) + { + auto cbuf = genAddModel(); + NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(_session, cbuf.buffer(), cbuf.size())); + } + else + { + // TODO Eventually, downloaded model tests are removed. 
+ NNFW_ENSURE_SUCCESS(nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(PackageNo).c_str())); + } } void TearDown() override { ValidationTestSessionCreated::TearDown(); } @@ -114,8 +136,8 @@ protected: EXPECT_EQ(input_elements, 1); _input.resize(input_elements); ASSERT_EQ( - nnfw_set_input(_session, 0, ti_input.dtype, _input.data(), sizeof(float) * input_elements), - NNFW_STATUS_NO_ERROR); + nnfw_set_input(_session, 0, ti_input.dtype, _input.data(), sizeof(float) * input_elements), + NNFW_STATUS_NO_ERROR); nnfw_tensorinfo ti_output; ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &ti_output), NNFW_STATUS_NO_ERROR); @@ -133,13 +155,13 @@ protected: uint64_t input_elements = num_elems(ti_input); _input.resize(input_elements); ASSERT_EQ( - nnfw_set_input(_session, 0, ti_input->dtype, _input.data(), sizeof(float) * input_elements), - NNFW_STATUS_NO_ERROR); + nnfw_set_input(_session, 0, ti_input->dtype, _input.data(), sizeof(float) * input_elements), + NNFW_STATUS_NO_ERROR); _output.resize(40000); // Give sufficient size for the output - ASSERT_EQ(nnfw_set_output(_session, 0, ti_input->dtype, _output.data(), - sizeof(float) * _output.size()), - NNFW_STATUS_NO_ERROR); + ASSERT_EQ( + nnfw_set_output(_session, 0, ti_input->dtype, _output.data(), sizeof(float) * _output.size()), + NNFW_STATUS_NO_ERROR); } protected: @@ -156,11 +178,12 @@ protected: { ValidationTest::SetUp(); - auto model_path = NNPackages::get().getModelAbsolutePath(NNPackages::ADD); for (auto &obj : _objects) { ASSERT_EQ(nnfw_create_session(&obj.session), NNFW_STATUS_NO_ERROR); - ASSERT_EQ(nnfw_load_model_from_file(obj.session, model_path.c_str()), NNFW_STATUS_NO_ERROR); + + auto cbuf = genAddModel(); + NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(obj.session, cbuf.buffer(), cbuf.size())); ASSERT_EQ(nnfw_prepare(obj.session), NNFW_STATUS_NO_ERROR); uint32_t num_inputs; @@ -206,4 +229,31 @@ protected: std::array _objects; }; +class ValidationTestTwoSessions : public ValidationTest +{ +protected: + nnfw_session *_session1 = nullptr; + nnfw_session *_session2 = nullptr; +}; + +class ValidationTestTwoSessionsCreated : public ValidationTestTwoSessions +{ +protected: + void SetUp() override + { + ValidationTestTwoSessions::SetUp(); + ASSERT_EQ(nnfw_create_session(&_session1), NNFW_STATUS_NO_ERROR); + ASSERT_EQ(nnfw_create_session(&_session2), NNFW_STATUS_NO_ERROR); + ASSERT_NE(_session1, nullptr); + ASSERT_NE(_session2, nullptr); + } + + void TearDown() override + { + ASSERT_EQ(nnfw_close_session(_session1), NNFW_STATUS_NO_ERROR); + ASSERT_EQ(nnfw_close_session(_session2), NNFW_STATUS_NO_ERROR); + ValidationTestTwoSessions::TearDown(); + } +}; + #endif // __NNFW_API_TEST_FIXTURES_H__ diff --git a/tests/nnfw_api/src/main.cc b/tests/nnfw_api/src/main.cc index 741c0fb..ff04eb3 100644 --- a/tests/nnfw_api/src/main.cc +++ b/tests/nnfw_api/src/main.cc @@ -31,8 +31,8 @@ int main(int argc, char **argv) } catch (std::runtime_error &e) { + std::cerr << "[WARNING] Test models are not loaded, so some tests will fail" << std::endl; std::cerr << e.what() << std::endl; - return -1; } return RUN_ALL_TESTS(); diff --git a/tests/nnfw_api/src/one_op_tests/AddN.cc b/tests/nnfw_api/src/one_op_tests/AddN.cc index cdb5295..73fa821 100644 --- a/tests/nnfw_api/src/one_op_tests/AddN.cc +++ b/tests/nnfw_api/src/one_op_tests/AddN.cc @@ -51,7 +51,24 @@ TEST_F(GenModelTest, neg_OneOp_AddN_InvalidType) cgen.setInputsAndOutputs({in1, in2, in3}, {out}); _context = std::make_unique(cgen.finish()); - _context->setBackends({"cpu"}); + 
_context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_AddN_TypeDiff) +{ + CircleGen cgen; + + int in1 = cgen.addTensor({{8}, circle::TensorType::TensorType_FLOAT32}); + int in2 = cgen.addTensor({{8}, circle::TensorType::TensorType_FLOAT32}); + int in3 = cgen.addTensor({{8}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{8}, circle::TensorType::TensorType_INT32}); + + cgen.addOperatorAddN({{in1, in2, in3}, {out}}); + cgen.setInputsAndOutputs({in1, in2, in3}, {out}); + + _context = std::make_unique(cgen.finish()); _context->expectFailModelLoad(); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/ArgMax.cc b/tests/nnfw_api/src/one_op_tests/ArgMax.cc deleted file mode 100644 index 67b02cd..0000000 --- a/tests/nnfw_api/src/one_op_tests/ArgMax.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "GenModelTest.h" - -#include - -TEST_F(GenModelTest, OneOp_ArgMax_AxisToConst) -{ - CircleGen cgen; - const auto output_type = circle::TensorType::TensorType_INT32; - std::vector axis_data{1}; - uint32_t axis_buf = cgen.addBuffer(axis_data); - int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); - int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); - int out = cgen.addTensor({{1, 2, 1}, output_type}); - cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); - cgen.setInputsAndOutputs({in}, {out}); - - _context = std::make_unique(cgen.finish()); - _context->addTestCase(TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); - - SUCCEED(); -} - -TEST_F(GenModelTest, OneOp_ArgMax_Int64_AxisToConst) -{ - CircleGen cgen; - const auto output_type = circle::TensorType::TensorType_INT64; - std::vector axis_data{1}; - uint32_t axis_buf = cgen.addBuffer(axis_data); - int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); - int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); - int out = cgen.addTensor({{1, 2, 1}, output_type}); - cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); - cgen.setInputsAndOutputs({in}, {out}); - - _context = std::make_unique(cgen.finish()); - _context->addTestCase(TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0})); - _context->setBackends({"acl_cl"}); - - SUCCEED(); -} - -TEST_F(GenModelTest, OneOp_ArgMax_AxisToVar) -{ - CircleGen cgen; - const auto output_type = circle::TensorType::TensorType_INT32; - int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32}); - int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); - int out = cgen.addTensor({{1, 2, 1}, output_type}); - cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); - cgen.setInputsAndOutputs({in, axis}, {out}); - - _context = std::make_unique(cgen.finish()); - _context->addTestCase(TestCaseData{} - .addInput({1, 
4, 2, 3}) - .addInput({-3}) - .addOutput({1, 0})); - _context->setBackends({"cpu"}); - - SUCCEED(); -} - -TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis0) -{ - CircleGen cgen; - const auto output_type = circle::TensorType::TensorType_INT32; - std::vector axis_data{4}; - uint32_t axis_buf = cgen.addBuffer(axis_data); - int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); - int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); - int out = cgen.addTensor({{1, 2, 1}, output_type}); - cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); - cgen.setInputsAndOutputs({in}, {out}); - - _context = std::make_unique(cgen.finish()); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); - _context->expectFailCompile(); - - SUCCEED(); -} - -TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis1) -{ - CircleGen cgen; - const auto output_type = circle::TensorType::TensorType_INT32; - std::vector axis_data{-3}; - uint32_t axis_buf = cgen.addBuffer(axis_data); - int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); - int in = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32}); - int out = cgen.addTensor({{2}, output_type}); - cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); - cgen.setInputsAndOutputs({in}, {out}); - - _context = std::make_unique(cgen.finish()); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); - _context->expectFailCompile(); - - SUCCEED(); -} diff --git a/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc b/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc new file mode 100644 index 0000000..3df7e74 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +#include + +struct ArgMinMaxVariationParam +{ + TestCaseData tcd; + bool is_argmax = true; + circle::TensorType input_type = circle::TensorType::TensorType_FLOAT32; + float scale = 0.0f; + int64_t zero_point = 0; +}; + +class ArgMinMaxVariation : public GenModelTest, + public ::testing::WithParamInterface +{ +}; + +// Input shape: {1, 2, 2, 1} +// Reduce axis: 1 +// Output shape: {1, 2, 1} +// Output type: Int32 +TEST_P(ArgMinMaxVariation, Test) +{ + auto ¶m = GetParam(); + + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + std::vector axis_data{1}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + param.is_argmax ? 
cgen.addOperatorArgMax({{in, axis}, {out}}, output_type) + : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(param.tcd); + _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + + SUCCEED(); +} + +// Test with different input type and value +INSTANTIATE_TEST_CASE_P( + GenModelTest, ArgMinMaxVariation, + ::testing::Values( + // ArgMax, float input + ArgMinMaxVariationParam{TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0}), + true}, + // ArgMax, int32 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0}), true, + circle::TensorType::TensorType_INT32}, + // ArgMax, uint8 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0}), true, + circle::TensorType::TensorType_UINT8, 1.0, 1}, + // ArgMax, int8 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0}), true, + circle::TensorType::TensorType_INT8, 1.0, 1}, + // ArgMin, float input + ArgMinMaxVariationParam{TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({0, 1}), + false}, + // ArgMin, int32 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({0, 1}), false, + circle::TensorType::TensorType_INT32}, + // ArgMin, uint8 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({0, 1}), false, + circle::TensorType::TensorType_UINT8, 1.0, 1}, + // ArgMin, int8 input + ArgMinMaxVariationParam{ + TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({0, 1}), false, + circle::TensorType::TensorType_INT8, 1.0, 1})); + +TEST_F(GenModelTest, OneOp_ArgMax_Int64_AxisToConst) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT64; + std::vector axis_data{1}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(TestCaseData{}.addInput({1, 4, 2, 3}).addOutput({1, 0})); + _context->setBackends({"acl_cl", "cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_ArgMax_AxisToVar) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in, axis}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(TestCaseData{} + .addInput({1, 4, 2, 3}) + .addInput({-3}) + .addOutput({1, 0})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis0) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); 
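// The constant axis value 4 is outside the valid range [-4, 3] for the rank-4
// input, so this negative test expects compilation (not model load) to fail.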
+ cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis1) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + std::vector axis_data{-3}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{2}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_InType) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_BOOL}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_AxisType) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_FLOAT32; + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_OutType) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_FLOAT32; + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, output_type); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_ArgMax_paramType) +{ + CircleGen cgen; + const auto output_type = circle::TensorType::TensorType_INT32; + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 1}, output_type}); + cgen.addOperatorArgMax({{in, axis}, {out}}, circle::TensorType::TensorType_INT64); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/Cast.cc b/tests/nnfw_api/src/one_op_tests/Cast.cc index 
5cbe09d..928df2d 100644 --- a/tests/nnfw_api/src/one_op_tests/Cast.cc +++ b/tests/nnfw_api/src/one_op_tests/Cast.cc @@ -34,7 +34,7 @@ TEST_F(GenModelTest, OneOp_Cast_Int32ToFloat32) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 2, 3, 4})); + TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 2, 3, 4})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -46,7 +46,7 @@ TEST_F(GenModelTest, OneOp_Cast_Float32ToInt32) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 2, 3, 4})); + TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 2, 3, 4})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -58,7 +58,7 @@ TEST_F(GenModelTest, OneOp_Cast_BoolToFloat32) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({true, false, true, true}).addOutput({1, 0, 1, 1})); + TestCaseData{}.addInput({true, false, true, true}).addOutput({1, 0, 1, 1})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -70,8 +70,8 @@ TEST_F(GenModelTest, OneOp_Cast_BoolToUInt8) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({true, false, true, true}) - .addOutput(std::vector{1, 0, 1, 1})); + .addInput({true, false, true, true}) + .addOutput(std::vector{1, 0, 1, 1})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -83,7 +83,7 @@ TEST_F(GenModelTest, OneOp_Cast_BoolToInt32) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({true, false, true, true}).addOutput({1, 0, 1, 1})); + TestCaseData{}.addInput({true, false, true, true}).addOutput({1, 0, 1, 1})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Concat.cc b/tests/nnfw_api/src/one_op_tests/Concat.cc index 0f40338..6e24359 100644 --- a/tests/nnfw_api/src/one_op_tests/Concat.cc +++ b/tests/nnfw_api/src/one_op_tests/Concat.cc @@ -37,33 +37,76 @@ TEST_F(GenModelTest, OneOp_Concat_ShareSubTensor) _context = std::make_unique(cgen.finish()); _context->addTestCase(uniformTCD( - {{1, 3, 2, 4}, {5, 4, 7, 4}}, - {{0, 0, 0, 0, 0, 6, 7, 0, 0, 9, 8, 0, 0, 0, 0, 0}, {5, 6, 4, 7, 7, 9, 4, 8}})); + {{1, 3, 2, 4}, {5, 4, 7, 4}}, + {{0, 0, 0, 0, 0, 6, 7, 0, 0, 9, 8, 0, 0, 0, 0, 0}, {5, 6, 4, 7, 7, 9, 4, 8}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); } -TEST_F(GenModelTest, OneOp_Concat) +struct ConcatVariationParam { - CircleGen cgen; + TestCaseData tcd; + circle::TensorType type = circle::TensorType::TensorType_FLOAT32; + float scale = 0.0f; + int64_t zero_point = 0; +}; + +class ConcatVariation : public GenModelTest, + public ::testing::WithParamInterface +{ +}; - int input1 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32}); - int input2 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32}); - int output = cgen.addTensor({{4, 3}, circle::TensorType::TensorType_FLOAT32}); +// Input shape: {2, 3} / {2, 3} +// Output shape: {4, 3} +TEST_P(ConcatVariation, Test) +{ + auto ¶m = GetParam(); + CircleGen cgen; + int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point); + int input2 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point); + int output = cgen.addTensor({{4, 3}, param.type}, param.scale, param.zero_point); cgen.addOperatorConcatenation({{input1, input2}, {output}}, 0, 
circle::ActivationFunctionType_NONE); cgen.setInputsAndOutputs({input1, input2}, {output}); _context = std::make_unique(cgen.finish()); - _context->addTestCase(uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}})); + _context->addTestCase(param.tcd); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); } +INSTANTIATE_TEST_CASE_P( + GenModelTest, ConcatVariation, + ::testing::Values( + // Float + ConcatVariationParam{uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}})}, + // Uint8 + ConcatVariationParam{uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}), + circle::TensorType::TensorType_UINT8, 1.0f, -2}, + // Int8 + ConcatVariationParam{uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}), + circle::TensorType::TensorType_INT8, 1.0f, -2}, + // Int16 + // TODO Enable when nnfw api support int16 type + // ConcatVariationParam{ + // uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + // {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}), + // circle::TensorType::TensorType_INT16, 1.0f, 0}, + // Int32 + ConcatVariationParam{uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}), + circle::TensorType::TensorType_INT32}, + // Int64 + ConcatVariationParam{uniformTCD({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}}, + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}), + circle::TensorType::TensorType_INT64})); + TEST_F(GenModelTest, OneOp_Concat_Subtensor_4D) { CircleGen cgen; @@ -112,26 +155,26 @@ TEST_F(GenModelTest, OneOp_Concat_Subtensor_4D) _context = std::make_unique(cgen.finish()); _context->addTestCase(uniformTCD( - { - // inputs - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, // in1 - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // in2 - }, - { - // outputs - {1, 2, 3, 4, 5}, // s_out1 - {6, 7, 8, 9, 10}, // s_out2 - {11, 12, 13, 14, 15}, // s_out3 - {16, 17, 18, 19, 20}, // s_out4 - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, // c_out1 - {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // c_out2 - {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // c_out3 - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, // a_out1 - {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // a_out2 - {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // a_out3 - {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, - 4, 5, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20} // final_out - })); + { + // inputs + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, // in1 + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // in2 + }, + { + // outputs + {1, 2, 3, 4, 5}, // s_out1 + {6, 7, 8, 9, 10}, // s_out2 + {11, 12, 13, 14, 15}, // s_out3 + {16, 17, 18, 19, 20}, // s_out4 + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, // c_out1 + {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // c_out2 + {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // c_out3 + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, // a_out1 + {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // a_out2 + {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // a_out3 + {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, + 4, 5, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20} // final_out + })); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Conv2D.cc b/tests/nnfw_api/src/one_op_tests/Conv2D.cc new file mode 100644 index 0000000..3822263 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/Conv2D.cc @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +TEST_F(GenModelTest, OneOp_Conv2D) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, + circle::ActivationFunctionType_NONE, 1, 1); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(uniformTCD( + {{4, 0, -5, 1, 0, 4, -1, 1, -1, -3, 3, -2, -4, 1, -2, 2, 4, -4, 2, 2, 0, 4, -1, -2, 4}}, + {{47, -4, -25, 9, 10, 10, -13, 11, -14, -26, -12, 26, 20, 40, 1, 3, 11, 4}})); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "ruy", "xnnpack"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_Conv2D_Stride) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_SAME, 2, 2, + circle::ActivationFunctionType_NONE, 1, 1); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(uniformTCD( + {{4, 0, -5, 1, 0, 4, -1, 1, -1, -3, 3, -2, -4, 1, -2, 2, 4, -4, 2, 2, 0, 4, -1, -2, 4}}, + {{22, 27, -10, -2, 5, -8, 7, 3, -14, -26, -10, 18, 4, -13, -28, 9, 14, 1}})); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "ruy", "xnnpack"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_Conv2D_Dilation) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 1, 1, 2}, 
circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, + circle::ActivationFunctionType_NONE, 2, 2); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(uniformTCD( + {{4, 0, -5, 1, 0, 4, -1, 1, -1, -3, 3, -2, -4, 1, -2, 2, 4, -4, 2, 2, 0, 4, -1, -2, 4}}, + {{-52, 7}})); + _context->setBackends({"cpu", "ruy", "xnnpack"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Conv2D_Type) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT16}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, + circle::ActivationFunctionType_NONE, 1, 1); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Conv2D_Stride) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_SAME, 0, 0, + circle::ActivationFunctionType_NONE, 1, 1); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Conv2D_Dilation) +{ + CircleGen cgen; + std::vector weight_data{-2, 3, -5, 3, 4, 4, 0, 0, -4, -1, -4, -2, 0, 2, 0, -1, 4, 0}; + uint32_t weight_buf = cgen.addBuffer(weight_data); + std::vector bias_data{2, 3}; + uint32_t bias_buf = cgen.addBuffer(bias_data); + int in = cgen.addTensor({{1, 5, 5, 1}, circle::TensorType::TensorType_FLOAT32}); + int weight = cgen.addTensor({{2, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32, weight_buf}); + int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32, bias_buf}); + int out = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, + circle::ActivationFunctionType_NONE, 0, 0); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc b/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc new file mode 100644 index 0000000..9f56340 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +struct DepthToSpaceVariationParam +{ + TestCaseData tcd; + circle::TensorType type = circle::TensorType::TensorType_FLOAT32; + float scale = 0.0f; + int64_t zero_point = 0; +}; + +class DepthToSpaceVariation : public GenModelTest, + public ::testing::WithParamInterface +{ +}; + +INSTANTIATE_TEST_CASE_P( + GenModelTest, DepthToSpaceVariation, + ::testing::Values( + // Float + DepthToSpaceVariationParam{ + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}})}, + // Int32 + DepthToSpaceVariationParam{ + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}}), + circle::TensorType::TensorType_INT32}, + // Int64 + DepthToSpaceVariationParam{ + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}}), + circle::TensorType::TensorType_INT64}, + // Uint8 + DepthToSpaceVariationParam{ + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}}), + circle::TensorType::TensorType_UINT8, 1.0f, -2}, + // Int8 + DepthToSpaceVariationParam{ + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}}), + circle::TensorType::TensorType_INT8, 1.0f, -2})); + +// Input shape: {1, 1, 2, 4} +// Block size: 2 +// Output shape: {1, 2, 4, 1} +TEST_P(DepthToSpaceVariation, Test) +{ + auto ¶m = GetParam(); + + CircleGen cgen; + int in = cgen.addTensor({{1, 1, 2, 4}, param.type}, param.scale, param.zero_point); + int out = cgen.addTensor({{1, 2, 4, 1}, param.type}, param.scale, param.zero_point); + cgen.addOperatorDepthToSpace({{in}, {out}}, 2); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(param.tcd); + _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_DepthToSpace_Blocksize) +{ + CircleGen cgen; + circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32; + int in = cgen.addTensor({{1, 1, 2, 4}, data_type}); + int out = cgen.addTensor({{1, 2, 4, 1}, data_type}); + cgen.addOperatorDepthToSpace({{in}, {out}}, -2); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc index 56ae296..87c67f1 100644 --- a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc +++ b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc @@ -34,7 +34,7 @@ TEST_F(GenModelTest, OneOp_DepthwiseConv2D) _context = std::make_unique(cgen.finish()); _context->addTestCase(uniformTCD({{1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}}, {{71, -34, 99, -20, 91, -26, 127, -4}})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack"}); SUCCEED(); } @@ -56,11 +56,11 @@ TEST_F(GenModelTest, OneOp_DepthwiseConv2D_Dilation) _context = std::make_unique(cgen.finish()); _context->addTestCase(uniformTCD({{ - 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, - 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }}, {{13, 14, 0, 0, 0, 0, 11, 12, 5, 6, 0, 0, 0, 0, 3, 4}})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack"}); SUCCEED(); } @@ -84,7 +84,7 @@ TEST_F(GenModelTest, OneOp_DepthwiseConv2D_Dilation_N_Stride) _context->addTestCase(uniformTCD({{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, {{4, 0, 3, 0, 0, 0, 2, 0, 1}})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack"}); SUCCEED(); } @@ -182,11 +182,11 @@ CircleBuffer genSimpleDepthwiseConv2DQuantizedModel(int stride, int input_depth, CircleGen cgen; uint32_t ker_buf = cgen.addBuffer(std::vector{ - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, - 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, - 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}); + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, + 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, + 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}); uint32_t bias_buf = cgen.addBuffer(std::vector(output_depth, 0)); int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_UINT8}, 0.5, 0); int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_UINT8, ker_buf}, 0.5, 0); @@ -214,14 +214,13 @@ class DepthwiseConv2DVariation : public GenModelTest, TEST_P(DepthwiseConv2DVariation, Test) { // Same input is used for all tests but output differs - static const std::vector input64{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, - 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, - 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, - 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2}; + static const std::vector input64{ + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, + 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2}; auto ¶m = GetParam(); _context = std::make_unique(genSimpleDepthwiseConv2DQuantizedModel( - param.stride, param.input_depth, param.depth_multiplier)); + param.stride, param.input_depth, param.depth_multiplier)); std::vector ref_input(input64.begin(), input64.begin() + param.input_depth * 4); _context->addTestCase(uniformTCD({ref_input}, {param.ref_output})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); @@ -232,45 +231,45 @@ TEST_P(DepthwiseConv2DVariation, Test) // Test with different InputDepth and DepthMultiplier. The values are intended to test optimized CPU // kernels. 
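// DepthwiseConv2DVariation (defined above and instantiated just below), like the
// other *Variation suites in this patch (ArgMinMaxVariation, ConcatVariation,
// DepthToSpaceVariation, FillVariation), follows googletest's value-parameterized
// test pattern: a plain parameter struct, a fixture deriving from
// ::testing::WithParamInterface, one TEST_P body, and a single
// INSTANTIATE_TEST_CASE_P listing the cases. Below is a minimal self-contained
// sketch of that pattern; the names are illustrative only, are not part of this
// patch, and gtest is assumed to be reachable through GenModelTest.h.

struct SquareVariationParam
{
  int input = 0;
  int expected = 0;
};

class SquareVariation : public ::testing::TestWithParam<SquareVariationParam>
{
};

TEST_P(SquareVariation, Test)
{
  // Each instantiated case runs this body once with its own parameter values.
  const auto &param = GetParam();
  EXPECT_EQ(param.input * param.input, param.expected);
}

INSTANTIATE_TEST_CASE_P(Sketch, SquareVariation,
                        ::testing::Values(SquareVariationParam{2, 4},
                                          SquareVariationParam{3, 9}));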
INSTANTIATE_TEST_CASE_P( - GenModelTest, DepthwiseConv2DVariation, - ::testing::Values( - // Stride == 1 - DepthwiseConv2DVariationParam{1, 8, 1, std::vector{0, 3, 5, 8, 0, 3, 5, 8}}, - DepthwiseConv2DVariationParam{1, 4, 2, std::vector{0, 0, 2, 3, 0, 2, 6, 9}}, - DepthwiseConv2DVariationParam{ - 1, 2, 8, std::vector{0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 4, 6, 0, 2, 4, 6}}, - DepthwiseConv2DVariationParam{1, 2, 2, std::vector{0, 1, 4, 6}}, - DepthwiseConv2DVariationParam{1, 2, 1, std::vector{2, 5}}, - DepthwiseConv2DVariationParam{1, 1, 2, std::vector{2, 4}}, - DepthwiseConv2DVariationParam{1, 1, 4, std::vector{0, 2, 3, 5}}, - DepthwiseConv2DVariationParam{1, 4, 1, std::vector{0, 1, 4, 9}}, - DepthwiseConv2DVariationParam{ - 1, 4, 4, std::vector{0, 0, 0, 0, 0, 1, 2, 3, 0, 2, 4, 6, 0, 3, 6, 9}}, - DepthwiseConv2DVariationParam{1, 12, 1, - std::vector{0, 3, 7, 12, 0, 4, 7, 12, 0, 4, 9, 16}}, - // Stride == 2 - DepthwiseConv2DVariationParam{2, 4, 1, std::vector{0, 1, 4, 9}}, - DepthwiseConv2DVariationParam{2, 2, 1, std::vector{2, 5}}, - DepthwiseConv2DVariationParam{2, 1, 8, std::vector{0, 2, 3, 5, 0, 2, 3, 5}}, - DepthwiseConv2DVariationParam{ - 2, 1, 32, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, - 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, - DepthwiseConv2DVariationParam{2, 1, 20, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, - 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, - DepthwiseConv2DVariationParam{ - 2, 1, 16, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, - DepthwiseConv2DVariationParam{2, 8, 1, std::vector{0, 3, 5, 8, 0, 3, 5, 8}}, - DepthwiseConv2DVariationParam{ - 2, 8, 2, std::vector{0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8}}, - DepthwiseConv2DVariationParam{ - 2, 16, 1, std::vector{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 13, 0, 4, 7, 12}})); + GenModelTest, DepthwiseConv2DVariation, + ::testing::Values( + // Stride == 1 + DepthwiseConv2DVariationParam{1, 8, 1, std::vector{0, 3, 5, 8, 0, 3, 5, 8}}, + DepthwiseConv2DVariationParam{1, 4, 2, std::vector{0, 0, 2, 3, 0, 2, 6, 9}}, + DepthwiseConv2DVariationParam{ + 1, 2, 8, std::vector{0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 4, 6, 0, 2, 4, 6}}, + DepthwiseConv2DVariationParam{1, 2, 2, std::vector{0, 1, 4, 6}}, + DepthwiseConv2DVariationParam{1, 2, 1, std::vector{2, 5}}, + DepthwiseConv2DVariationParam{1, 1, 2, std::vector{2, 4}}, + DepthwiseConv2DVariationParam{1, 1, 4, std::vector{0, 2, 3, 5}}, + DepthwiseConv2DVariationParam{1, 4, 1, std::vector{0, 1, 4, 9}}, + DepthwiseConv2DVariationParam{ + 1, 4, 4, std::vector{0, 0, 0, 0, 0, 1, 2, 3, 0, 2, 4, 6, 0, 3, 6, 9}}, + DepthwiseConv2DVariationParam{1, 12, 1, + std::vector{0, 3, 7, 12, 0, 4, 7, 12, 0, 4, 9, 16}}, + // Stride == 2 + DepthwiseConv2DVariationParam{2, 4, 1, std::vector{0, 1, 4, 9}}, + DepthwiseConv2DVariationParam{2, 2, 1, std::vector{2, 5}}, + DepthwiseConv2DVariationParam{2, 1, 8, std::vector{0, 2, 3, 5, 0, 2, 3, 5}}, + DepthwiseConv2DVariationParam{2, 1, 32, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, + 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, + 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, + DepthwiseConv2DVariationParam{ + 2, 1, 20, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, + DepthwiseConv2DVariationParam{ + 2, 1, 16, std::vector{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}}, + DepthwiseConv2DVariationParam{2, 8, 1, std::vector{0, 3, 5, 8, 0, 3, 5, 8}}, + DepthwiseConv2DVariationParam{ + 2, 8, 2, std::vector{0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8}}, + DepthwiseConv2DVariationParam{ + 2, 16, 1, std::vector{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 
13, 0, 4, 7, 12}})); TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_InvalidPaddingType) { _context = std::make_unique(genNegTestDepthwiseConv2DModel( - static_cast(99), 1, 1, 1, circle::ActivationFunctionType_NONE)); + static_cast(99), 1, 1, 1, circle::ActivationFunctionType_NONE)); _context->expectFailModelLoad(); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack"}); SUCCEED(); } diff --git a/tests/nnfw_api/src/one_op_tests/Elu.cc b/tests/nnfw_api/src/one_op_tests/Elu.cc new file mode 100644 index 0000000..a037070 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/Elu.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +TEST_F(GenModelTest, OneOp_Elu) +{ + CircleGen cgen; + int in = cgen.addTensor({{1, 2, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 2, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorElu({{in}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase( + uniformTCD({{0, -6, 2, -4, 3, -2, 10, -0.1}}, + {{0.0, -0.997521, 2.0, -0.981684, 3.0, -0.864665, 10.0, -0.0951626}})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Elu_Type) +{ + CircleGen cgen; + int in = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_UINT8}, 1.0f, 0); + int out = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorElu({{in}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailModelLoad(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/Equal.cc b/tests/nnfw_api/src/one_op_tests/Equal.cc index 26e52fd..9f79575 100644 --- a/tests/nnfw_api/src/one_op_tests/Equal.cc +++ b/tests/nnfw_api/src/one_op_tests/Equal.cc @@ -27,9 +27,9 @@ TEST_F(GenModelTest, OneOp_Equal) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({0.1, 0.3, 0.5, 0.7}) - .addInput({0.1, 0.2, 0.3, 0.4}) - .addOutput({true, false, false, false})); + .addInput({0.1, 0.3, 0.5, 0.7}) + .addInput({0.1, 0.2, 0.3, 0.4}) + .addOutput({true, false, false, false})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/ExpandDims.cc b/tests/nnfw_api/src/one_op_tests/ExpandDims.cc new file mode 100644 index 0000000..280cf73 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/ExpandDims.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +TEST_F(GenModelTest, OneOp_ExpandDims) +{ + CircleGen cgen; + + std::vector axis_data{1}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int in = cgen.addTensor({{1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int out = cgen.addTensor({{1, 1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorExpandDims({{in, axis}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase( + TestCaseData{}.addInput({0.1, 0.3, 0.5, 0.7}).addOutput({0.1, 0.3, 0.5, 0.7})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_ExpandDims_Int64AxisNeg) +{ + CircleGen cgen; + + std::vector axis_data{-1}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int in = cgen.addTensor({{1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT64, axis_buf}); + int out = cgen.addTensor({{1, 4, 1, 1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorExpandDims({{in, axis}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase( + TestCaseData{}.addInput({0.1, 0.3, 0.5, 0.7}).addOutput({0.1, 0.3, 0.5, 0.7})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_neg_ExpandDims_Axis) +{ + CircleGen cgen; + + std::vector axis_data{4}; + uint32_t axis_buf = cgen.addBuffer(axis_data); + int in = cgen.addTensor({{1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf}); + int out = cgen.addTensor({{1, 1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorExpandDims({{in, axis}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->setBackends({"cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} + +TEST_F(GenModelTest, OneOp_neg_ExpandDims_AxisNegInput) +{ + CircleGen cgen; + + int in = cgen.addTensor({{1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32}); + int out = cgen.addTensor({{1, 1, 4, 1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorExpandDims({{in, axis}, {out}}); + cgen.setInputsAndOutputs({in, axis}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(TestCaseData{} + .addInput({0.1, 0.3, 0.5, 0.7}) + .addInput({-5}) + .addOutput({0.1, 0.3, 0.5, 0.7}) + .expectFailRun()); + _context->setBackends({"cpu"}); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/Fill.cc b/tests/nnfw_api/src/one_op_tests/Fill.cc index cf8948d..4d5e4d8 100644 --- a/tests/nnfw_api/src/one_op_tests/Fill.cc +++ b/tests/nnfw_api/src/one_op_tests/Fill.cc @@ -16,61 +16,78 @@ #include "GenModelTest.h" -TEST_F(GenModelTest, OneOp_Fill_Int32) +struct FillVariationParam { - CircleGen cgen; - std::vector value_data{13}; - uint32_t value_buf = 
cgen.addBuffer(value_data); - - int in = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32}); - int value = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, value_buf}); - int out = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_INT32}); - cgen.addOperatorFill({{in, value}, {out}}); - cgen.setInputsAndOutputs({in}, {out}); + TestCaseData tcd; + const uint8_t *value_data = nullptr; + circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32; +}; - _context = std::make_unique(cgen.finish()); - _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); - _context->setBackends({"cpu"}); - - SUCCEED(); -} +class FillVariation : public GenModelTest, public ::testing::WithParamInterface +{ +}; -TEST_F(GenModelTest, OneOp_Fill_Int64) +// value is constant +TEST_P(FillVariation, Test) { + auto ¶m = GetParam(); + CircleGen cgen; - std::vector value_data{13}; - uint32_t value_buf = cgen.addBuffer(value_data); - int in = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32}); - int value = cgen.addTensor({{1}, circle::TensorType::TensorType_INT64, value_buf}); - int out = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_INT64}); - cgen.addOperatorFill({{in, value}, {out}}); - cgen.setInputsAndOutputs({in}, {out}); + size_t value_size = + (param.data_type == circle::TensorType::TensorType_INT64) ? sizeof(int64_t) : sizeof(int32_t); + uint32_t value_buf = cgen.addBuffer(param.value_data, value_size); + + int dims = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32}); + int value = cgen.addTensor({{1}, param.data_type, value_buf}); + int out = cgen.addTensor({{2, 3}, param.data_type}); + cgen.addOperatorFill({{dims, value}, {out}}); + cgen.setInputsAndOutputs({dims}, {out}); _context = std::make_unique(cgen.finish()); - _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); + _context->addTestCase(param.tcd); _context->setBackends({"cpu"}); SUCCEED(); } -TEST_F(GenModelTest, OneOp_Fill_Float32) +const int32_t test_int32 = 13; +const int64_t test_int64 = 1052; +const float test_float = 5.2; + +// Test with different value type +INSTANTIATE_TEST_CASE_P( + GenModelTest, FillVariation, + ::testing::Values( + // float value + FillVariationParam{ + TestCaseData{}.addInput({2, 3}).addOutput({5.2, 5.2, 5.2, 5.2, 5.2, 5.2}), + reinterpret_cast(&test_float)}, + // int32 value + FillVariationParam{ + TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13}), + reinterpret_cast(&test_int32), circle::TensorType::TensorType_INT32}, + // uint8 value + FillVariationParam{ + TestCaseData{}.addInput({2, 3}).addOutput({1052, 1052, 1052, 1052, 1052, + 1052}), + reinterpret_cast(&test_int64), circle::TensorType::TensorType_INT64})); + +TEST_F(GenModelTest, OneOp_Fill_Int64_Shape) { CircleGen cgen; std::vector value_data{1.3}; uint32_t value_buf = cgen.addBuffer(value_data); - int in = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32}); + int dims = cgen.addTensor({{2}, circle::TensorType::TensorType_INT64}); int value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, value_buf}); int out = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32}); - cgen.addOperatorFill({{in, value}, {out}}); - cgen.setInputsAndOutputs({in}, {out}); + cgen.addOperatorFill({{dims, value}, {out}}); + cgen.setInputsAndOutputs({dims}, {out}); _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({1.3, 1.3, 1.3, 1.3, 1.3, 
1.3})); + TestCaseData{}.addInput({2, 3}).addOutput({1.3, 1.3, 1.3, 1.3, 1.3, 1.3})); _context->setBackends({"cpu"}); SUCCEED(); @@ -87,7 +104,7 @@ TEST_F(GenModelTest, neg_OneOp_Fill_Int32_oneoperand) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); + TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); _context->setBackends({"cpu"}); _context->expectFailModelLoad(); @@ -105,7 +122,7 @@ TEST_F(GenModelTest, neg_OneOp_Fill_Int64_oneoperand) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); + TestCaseData{}.addInput({2, 3}).addOutput({13, 13, 13, 13, 13, 13})); _context->setBackends({"cpu"}); _context->expectFailModelLoad(); @@ -123,7 +140,7 @@ TEST_F(GenModelTest, neg_OneOp_Fill_Float32_oneoperand) _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{}.addInput({2, 3}).addOutput({1.3, 1.3, 1.3, 1.3, 1.3, 1.3})); + TestCaseData{}.addInput({2, 3}).addOutput({1.3, 1.3, 1.3, 1.3, 1.3, 1.3})); _context->setBackends({"cpu"}); _context->expectFailModelLoad(); diff --git a/tests/nnfw_api/src/one_op_tests/FullyConnected.cc b/tests/nnfw_api/src/one_op_tests/FullyConnected.cc index a7c01e1..791787f 100644 --- a/tests/nnfw_api/src/one_op_tests/FullyConnected.cc +++ b/tests/nnfw_api/src/one_op_tests/FullyConnected.cc @@ -51,8 +51,8 @@ TEST_F(GenModelTest, OneOp_FullyConnected) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); - _context->setBackends({"cpu", "acl_neon"}); + uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); + _context->setBackends({"cpu", "acl_neon", "xnnpack", "ruy"}); SUCCEED(); } @@ -80,7 +80,7 @@ TEST_F(GenModelTest, OneOp_FullyConnectedShuffled16x1Float32) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); + uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); _context->setBackends({"cpu"}); SUCCEED(); @@ -129,12 +129,12 @@ TEST_F(GenModelTest, OneOp_FullyConnected16x1Sparse) uint32_t bias_buf = cgen.addBuffer(bias_data); int input = cgen.addTensor({{1, 4}, circle::TensorType::TensorType_FLOAT32}); CircleGen::SparsityParams sp{ - {0, 1, 2, 3}, - {0, 1}, - {{CircleGen::SparseDimensionType::DimensionType_DENSE, 1}, - {CircleGen::SparseDimensionType::DimensionType_SPARSE_CSR, {0, 2}, {0, 3}}, - {CircleGen::SparseDimensionType::DimensionType_DENSE, 16}, - {CircleGen::SparseDimensionType::DimensionType_DENSE, 1}}}; + {0, 1, 2, 3}, + {0, 1}, + {{CircleGen::SparseDimensionType::DimensionType_DENSE, 1}, + {CircleGen::SparseDimensionType::DimensionType_SPARSE_CSR, {0, 2}, {0, 3}}, + {CircleGen::SparseDimensionType::DimensionType_DENSE, 16}, + {CircleGen::SparseDimensionType::DimensionType_DENSE, 1}}}; int weight = cgen.addTensor({{16, 4}, circle::TensorType::TensorType_FLOAT32, weight_buf}, sp); int bias = cgen.addTensor({{16}, circle::TensorType::TensorType_FLOAT32, bias_buf}); int output = cgen.addTensor({{1, 16}, circle::TensorType::TensorType_FLOAT32}); @@ -143,7 +143,7 @@ TEST_F(GenModelTest, OneOp_FullyConnected16x1Sparse) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); + uniformTCD({{1, 3, 2, 1}}, {{2, 1, 5, 5, 
2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}})); _context->setBackends({"cpu"}); SUCCEED(); @@ -171,9 +171,9 @@ TEST_F(GenModelTest, OneOp_FullyConnected_OptionalBias) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{3, -1, -1, 1, -2, 0, -2, 1}}, - {{-4, -2, 9, -6, 8, 13, 5, 18, 5, -3, -7, -2, -16, -5, -1, -1}})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + uniformTCD({{3, -1, -1, 1, -2, 0, -2, 1}}, + {{-4, -2, 9, -6, 8, 13, 5, 18, 5, -3, -7, -2, -16, -5, -1, -1}})); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack", "ruy"}); SUCCEED(); } @@ -200,9 +200,9 @@ TEST_F(GenModelTest, neg_OneOp_FullyConnected_NoBias) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{3, -1, -1, 1, -2, 0, -2, 1}}, - {{-4, -2, 9, -6, 8, 13, 5, 18, 5, -3, -7, -2, -16, -5, -1, -1}})); - _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + uniformTCD({{3, -1, -1, 1, -2, 0, -2, 1}}, + {{-4, -2, 9, -6, 8, 13, 5, 18, 5, -3, -7, -2, -16, -5, -1, -1}})); + _context->setBackends({"acl_cl", "acl_neon", "cpu", "xnnpack", "ruy"}); _context->expectFailCompile(); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/L2Normalization.cc b/tests/nnfw_api/src/one_op_tests/L2Normalization.cc index 8e0ae6d..f825fec 100644 --- a/tests/nnfw_api/src/one_op_tests/L2Normalization.cc +++ b/tests/nnfw_api/src/one_op_tests/L2Normalization.cc @@ -27,9 +27,9 @@ TEST_F(GenModelTest, OneOp_L2Normalization) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{0, 3, 4, 0, 5, 12, 0, 8, 15, 0, 7, 24}}, - {{0, 0.6, 0.8, 0, 0.38461539149284363, 0.92307698726654053, 0, - 0.47058823704719543, 0.88235294818878174, 0, 0.28, 0.96}})); + uniformTCD({{0, 3, 4, 0, 5, 12, 0, 8, 15, 0, 7, 24}}, + {{0, 0.6, 0.8, 0, 0.38461539149284363, 0.92307698726654053, 0, + 0.47058823704719543, 0.88235294818878174, 0, 0.28, 0.96}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc b/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc index db1a375..cb3af4e 100644 --- a/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc +++ b/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc @@ -26,8 +26,8 @@ TEST_F(GenModelTest, OneOp_LeakyRelu) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{0, 1.0, 3.0, 1.0, -1.0, -2.0f}}, {{0, 1.0, 3.0, 1.0, -0.5, -1.0}})); - _context->setBackends({"acl_cl", "acl_neon"}); + uniformTCD({{0, 1.0, 3.0, 1.0, -1.0, -2.0f}}, {{0, 1.0, 3.0, 1.0, -0.5, -1.0}})); + _context->setBackends({"cpu", "acl_cl", "acl_neon"}); SUCCEED(); } @@ -41,7 +41,7 @@ TEST_F(GenModelTest, neg_OneOp_LeakyRelu_InvalidType) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); - _context->setBackends({"acl_cl", "acl_neon"}); + _context->setBackends({"cpu", "acl_cl", "acl_neon"}); _context->expectFailModelLoad(); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc b/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc index b34b2e8..5834fa5 100644 --- a/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc +++ b/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc @@ -30,8 +30,8 @@ TEST_F(GenModelTest, OneOp_LogSoftmax) _context = std::make_unique(cgen.finish()); _context->setBackends({"cpu"}); _context->addTestCase(uniformTCD( - {{0, -6, 2, 4, 3, -2, 10, 1}}, - {{-.00247565, -6.00247, -2.12692, -.126928, -.00671534, -5.00671, -.000123374, -9.00012}})); + {{0, -6, 2, 4, 3, -2, 10, 1}}, + {{-.00247565, -6.00247, -2.12692, -.126928, -.00671534, -5.00671, 
-.000123374, -9.00012}})); SUCCEED(); } diff --git a/tests/nnfw_api/src/one_op_tests/Mean.cc b/tests/nnfw_api/src/one_op_tests/Mean.cc new file mode 100644 index 0000000..6293d38 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/Mean.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenModelTest.h" + +#include + +CircleBuffer genSimpleMeanModel() +{ + CircleGen cgen; + uint32_t axis_buf = cgen.addBuffer(std::vector{1, 2}); + int in = cgen.addTensor({{1, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32}); + int axis = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, axis_buf}); + int out = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32}); + cgen.addOperatorMean({{in, axis}, {out}}, true); + cgen.setInputsAndOutputs({in}, {out}); + return cgen.finish(); +} + +TEST_F(GenModelTest, OneOp_Mean) +{ + auto model = genSimpleMeanModel(); + _context = std::make_unique(std::move(model)); + _context->addTestCase(uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8, 9}}, {{5}})); + _context->setBackends({"acl_cl", "acl_neon", "cpu"}); + + SUCCEED(); +} + +CircleBuffer genWrongMeanModel() +{ + CircleGen cgen; + uint32_t axis_buf = cgen.addBuffer(std::vector{1, 2}); + int in = cgen.addTensor({{1, 3, 3, 1}, circle::TensorType::TensorType_BOOL}); + int axis = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, axis_buf}); + int out = cgen.addTensor({{1}, circle::TensorType::TensorType_BOOL}); + cgen.addOperatorMean({{in, axis}, {out}}, true); + cgen.setInputsAndOutputs({in}, {out}); + return cgen.finish(); +} + +TEST_F(GenModelTest, neg_OneOp_Mean) +{ + auto model = genWrongMeanModel(); + _context = std::make_unique(std::move(model)); + _context->addTestCase(uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8, 9}}, {{5}})); + _context->setBackends({"cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/OneHot.cc b/tests/nnfw_api/src/one_op_tests/OneHot.cc index 11df5bc..78ad35b 100644 --- a/tests/nnfw_api/src/one_op_tests/OneHot.cc +++ b/tests/nnfw_api/src/one_op_tests/OneHot.cc @@ -36,9 +36,9 @@ TEST_F(GenModelTest, OneOp_OneHot_OffValueToConst) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 0, 2}) - .addInput({1}) - .addOutput({0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1})); + .addInput({1, 2, 0, 2}) + .addInput({1}) + .addOutput({0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -60,10 +60,10 @@ TEST_F(GenModelTest, OneOp_OneHot_OffValueToNotZero) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 0, 2}) - .addInput({1}) - .addInput({-1}) - .addOutput({-1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1})); + .addInput({1, 2, 0, 2}) + .addInput({1}) + .addInput({-1}) + .addOutput({-1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ 
-87,9 +87,9 @@ TEST_F(GenModelTest, OneOp_OneHot_IndicesValueToNeg_OffValueToConst) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 0, -1}) - .addInput({1}) - .addOutput({0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0})); + .addInput({1, 2, 0, -1}) + .addInput({1}) + .addOutput({0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -111,10 +111,10 @@ TEST_F(GenModelTest, OneOp_OneHot_IndicesValueToNeg_OffValueToVar) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 0, -1}) - .addInput({1}) - .addInput({0}) - .addOutput({0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0})); + .addInput({1, 2, 0, -1}) + .addInput({1}) + .addInput({0}) + .addOutput({0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Pad.cc b/tests/nnfw_api/src/one_op_tests/Pad.cc index 63d02ab..380c1a3 100644 --- a/tests/nnfw_api/src/one_op_tests/Pad.cc +++ b/tests/nnfw_api/src/one_op_tests/Pad.cc @@ -29,7 +29,7 @@ TEST_F(GenModelTest, OneOp_Pad) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}})); + uniformTCD({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/PadV2.cc b/tests/nnfw_api/src/one_op_tests/PadV2.cc index e613fe2..f9fe5f6 100644 --- a/tests/nnfw_api/src/one_op_tests/PadV2.cc +++ b/tests/nnfw_api/src/one_op_tests/PadV2.cc @@ -26,7 +26,7 @@ TEST_F(GenModelTest, OneOp_PadV2) std::vector padding_value_data{3.0}; uint32_t padding_value_buf = cgen.addBuffer(padding_value_data); int padding_value = - cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); + cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32}); @@ -35,7 +35,7 @@ TEST_F(GenModelTest, OneOp_PadV2) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 2, 3, 4}}, {{3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3}})); + uniformTCD({{1, 2, 3, 4}}, {{3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3}})); _context->setBackends({"cpu"}); SUCCEED(); @@ -51,7 +51,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadRank) std::vector padding_value_data{3.0}; uint32_t padding_value_buf = cgen.addBuffer(padding_value_data); int padding_value = - cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); + cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32}); @@ -75,7 +75,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadDim0) std::vector padding_value_data{3.0}; uint32_t padding_value_buf = cgen.addBuffer(padding_value_data); int padding_value = - cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); + cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32}); @@ -99,7 +99,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadDim1) std::vector padding_value_data{3.0}; uint32_t padding_value_buf = cgen.addBuffer(padding_value_data); int padding_value = - 
cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); + cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, padding_value_buf}); int out = cgen.addTensor({{2, 2, 2, 2}, circle::TensorType::TensorType_FLOAT32}); diff --git a/tests/nnfw_api/src/one_op_tests/Rank.cc b/tests/nnfw_api/src/one_op_tests/Rank.cc index 02e76ba..60ec193 100644 --- a/tests/nnfw_api/src/one_op_tests/Rank.cc +++ b/tests/nnfw_api/src/one_op_tests/Rank.cc @@ -27,9 +27,9 @@ TEST_F(GenModelTest, OneOp_Rank) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) - .addOutput({4})); + TestCaseData{} + .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) + .addOutput({4})); _context->setBackends({"cpu"}); SUCCEED(); @@ -45,8 +45,8 @@ TEST_F(GenModelTest, OneOp_Rank_Int32) cgen.addOperatorRank({{in}, {out}}); cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); - _context->addTestCase(uniformTCD( - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}}, {{4}})); + _context->addTestCase( + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}}, {{4}})); _context->setBackends({"cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc index 437cfd1..20320a0 100644 --- a/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc +++ b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc @@ -30,8 +30,8 @@ TEST_F(GenModelTest, OneOp_ResizeBilinear_SizeToConst) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); - _context->addTestCase(uniformTCD( - {{1, 1, 2, 2}}, {{1, 1, 1, 1.666666667, 1.666666667, 1.666666667, 2, 2, 2}})); + _context->addTestCase( + uniformTCD({{1, 1, 2, 2}}, {{1, 1, 1, 1.666666667, 1.666666667, 1.666666667, 2, 2, 2}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc b/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc index d1617c3..1dd6584 100644 --- a/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc +++ b/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc @@ -31,8 +31,8 @@ TEST_F(GenModelTest, OneOp_ResizeNearestNeighbor) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{3, 4, 6, 10, 9, 10, 12, 16}}, - {{3, 4, 3, 4, 6, 10, 3, 4, 3, 4, 6, 10, 9, 10, 9, 10, 12, 16}})); + uniformTCD({{3, 4, 6, 10, 9, 10, 12, 16}}, + {{3, 4, 3, 4, 6, 10, 3, 4, 3, 4, 6, 10, 9, 10, 9, 10, 12, 16}})); _context->setBackends({"acl_cl"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Reverse.cc b/tests/nnfw_api/src/one_op_tests/Reverse.cc index ef0c5fe..4168b21 100644 --- a/tests/nnfw_api/src/one_op_tests/Reverse.cc +++ b/tests/nnfw_api/src/one_op_tests/Reverse.cc @@ -32,8 +32,8 @@ TEST_F(GenModelTest, OneOp_ReverseV2_3D) _context = std::make_unique(cgen.finish()); _context->setBackends({"acl_cl", "cpu"}); _context->addTestCase(uniformTCD( - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}}, - {{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}})); + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}}, + {{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}})); SUCCEED(); } diff --git 
a/tests/nnfw_api/src/one_op_tests/Shape.cc b/tests/nnfw_api/src/one_op_tests/Shape.cc index 9a48aa7..2a73db9 100644 --- a/tests/nnfw_api/src/one_op_tests/Shape.cc +++ b/tests/nnfw_api/src/one_op_tests/Shape.cc @@ -27,9 +27,9 @@ TEST_F(GenModelTest, OneOp_Shape) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) - .addOutput({1, 3, 3, 2})); + TestCaseData{} + .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) + .addOutput({1, 3, 3, 2})); _context->setBackends({"cpu"}); SUCCEED(); @@ -46,9 +46,9 @@ TEST_F(GenModelTest, OneOp_Shape_Int64) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); _context->addTestCase( - TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) - .addOutput({1, 3, 3, 2})); + TestCaseData{} + .addInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) + .addOutput({1, 3, 3, 2})); _context->setBackends({"cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Split.cc b/tests/nnfw_api/src/one_op_tests/Split.cc index 2120164..32be9a7 100644 --- a/tests/nnfw_api/src/one_op_tests/Split.cc +++ b/tests/nnfw_api/src/one_op_tests/Split.cc @@ -32,7 +32,7 @@ TEST_F(GenModelTest, OneOp_Split) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6}, {3, 4, 7, 8}})); + uniformTCD({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6}, {3, 4, 7, 8}})); _context->setBackends({"cpu", "acl_cl", "acl_neon"}); SUCCEED(); @@ -52,10 +52,10 @@ TEST_F(GenModelTest, OneOp_SplitNonConstAxis) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1}) - .addInput({1, 2, 3, 4, 5, 6, 7, 8}) - .addOutput({1, 2, 5, 6}) - .addOutput({3, 4, 7, 8})); + .addInput({1}) + .addInput({1, 2, 3, 4, 5, 6, 7, 8}) + .addOutput({1, 2, 5, 6}) + .addOutput({3, 4, 7, 8})); _context->setBackends({"cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Sqrt.cc b/tests/nnfw_api/src/one_op_tests/Sqrt.cc new file mode 100644 index 0000000..01f3133 --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/Sqrt.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "GenModelTest.h" + +#include + +CircleGen genSimpleSqrtModel(circle::TensorType type) +{ + CircleGen cgen; + int in = cgen.addTensor({{1, 2, 2, 1}, type}); + int out = cgen.addTensor({{1, 2, 2, 1}, type}); + cgen.addOperatorSqrt({{in}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + return cgen; +} + +TEST_F(GenModelTest, OneOp_Sqrt_f32) +{ + CircleGen cgen = genSimpleSqrtModel(circle::TensorType::TensorType_FLOAT32); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase( + TestCaseData{}.addInput({1, 4, 9, 16}).addOutput({1, 2, 3, 4})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Sqrt_i32) +{ + CircleGen cgen = genSimpleSqrtModel(circle::TensorType::TensorType_INT32); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(TestCaseData{}.addInput({1, 4, 9, 16}).addOutput({1, 2, 3, 4})); + _context->setBackends({"cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/Square.cc b/tests/nnfw_api/src/one_op_tests/Square.cc new file mode 100644 index 0000000..2ec9bad --- /dev/null +++ b/tests/nnfw_api/src/one_op_tests/Square.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "GenModelTest.h" + +#include + +CircleGen genSimpleSquareModel(circle::TensorType type) +{ + CircleGen cgen; + int in = cgen.addTensor({{1, 2, 2, 1}, type}); + int out = cgen.addTensor({{1, 2, 2, 1}, type}); + cgen.addOperatorSquare({{in}, {out}}); + cgen.setInputsAndOutputs({in}, {out}); + return cgen; +} + +TEST_F(GenModelTest, OneOp_Square_f32) +{ + CircleGen cgen = genSimpleSquareModel(circle::TensorType::TensorType_FLOAT32); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase( + TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 4, 9, 16})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_Square_i32) +{ + CircleGen cgen = genSimpleSquareModel(circle::TensorType::TensorType_INT32); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(TestCaseData{}.addInput({1, 2, 3, 4}).addOutput({1, 4, 9, 16})); + _context->setBackends({"cpu"}); + _context->expectFailCompile(); + + SUCCEED(); +} diff --git a/tests/nnfw_api/src/one_op_tests/Tile.cc b/tests/nnfw_api/src/one_op_tests/Tile.cc index aa36ba2..3f193d5 100644 --- a/tests/nnfw_api/src/one_op_tests/Tile.cc +++ b/tests/nnfw_api/src/one_op_tests/Tile.cc @@ -29,7 +29,7 @@ TEST_F(GenModelTest, OneOp_Tile_ConstMul) _context = std::make_unique(cgen.finish()); _context->addTestCase( - uniformTCD({{1, 2, 3, 4, 5, 6}}, {{1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}})); + uniformTCD({{1, 2, 3, 4, 5, 6}}, {{1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}})); _context->setBackends({"cpu"}); SUCCEED(); @@ -47,10 +47,10 @@ TEST_F(GenModelTest, OneOp_Tile_MulToConst) cgen.setInputsAndOutputs({in}, {out}); _context = std::make_unique(cgen.finish()); - _context->addTestCase(uniformTCD( - {{11, 12, 13, 21, 22, 23}}, - {{11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, - 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23}})); + _context->addTestCase( + uniformTCD({{11, 12, 13, 21, 22, 23}}, + {{11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, + 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23}})); _context->setBackends({"cpu"}); SUCCEED(); @@ -66,12 +66,12 @@ TEST_F(GenModelTest, OneOp_Tile_MulToVar) cgen.setInputsAndOutputs({in, multiplies}, {out}); _context = std::make_unique(cgen.finish()); - _context->addTestCase(TestCaseData{} - .addInput({11, 12, 13, 21, 22, 23}) - .addInput({2, 3, 1}) - .addOutput({11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, - 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, - 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23})); + _context->addTestCase( + TestCaseData{} + .addInput({11, 12, 13, 21, 22, 23}) + .addInput({2, 3, 1}) + .addOutput({11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, + 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23})); _context->setBackends({"cpu"}); SUCCEED(); @@ -88,9 +88,9 @@ TEST_F(GenModelTest, OneOp_Tile_VarMul) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6}) - .addInput({1, 2}) - .addOutput({1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6})); + .addInput({1, 2, 3, 4, 5, 6}) + .addInput({1, 2}) + .addOutput({1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6})); _context->setBackends({"cpu"}); SUCCEED(); diff --git a/tests/nnfw_api/src/one_op_tests/Transpose.cc b/tests/nnfw_api/src/one_op_tests/Transpose.cc index ecfb159..5a92c73 100644 --- a/tests/nnfw_api/src/one_op_tests/Transpose.cc +++ b/tests/nnfw_api/src/one_op_tests/Transpose.cc @@ -31,20 +31,19 @@ 
TEST_F(GenModelTest, OneOp_Transpose_PermsToConst) _context = std::make_unique(cgen.finish()); _context->addTestCase(uniformTCD( - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, - 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, - 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119}}, - {{0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44, 60, 61, 62, - 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104, 5, 6, 7, 8, 9, 25, - 26, 27, 28, 29, 45, 46, 47, 48, 49, 65, 66, 67, 68, 69, 85, 86, 87, 88, - 89, 105, 106, 107, 108, 109, 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, - 52, 53, 54, 70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114, - 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55, 56, 57, 58, 59, 75, 76, 77, - 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}})); + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119}}, + {{0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44, 60, 61, 62, 63, 64, + 80, 81, 82, 83, 84, 100, 101, 102, 103, 104, 5, 6, 7, 8, 9, 25, 26, 27, 28, 29, + 45, 46, 47, 48, 49, 65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109, + 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, 52, 53, 54, 70, 71, 72, 73, 74, + 90, 91, 92, 93, 94, 110, 111, 112, 113, 114, 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, + 55, 56, 57, 58, 59, 75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); @@ -61,9 +60,9 @@ TEST_F(GenModelTest, OneOp_Transpose_PermsToVar) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6}) - .addInput({0, 2, 1, 3}) - .addOutput({1, 4, 2, 5, 3, 6})); + .addInput({1, 2, 3, 4, 5, 6}) + .addInput({0, 2, 1, 3}) + .addOutput({1, 4, 2, 5, 3, 6})); _context->setBackends({"cpu"}); SUCCEED(); @@ -80,9 +79,9 @@ TEST_F(GenModelTest, OneOp_Transpose_RegularTranspose) _context = std::make_unique(cgen.finish()); _context->addTestCase(TestCaseData{} - .addInput({1, 2, 3, 4, 5, 6}) - .addInput({}) - .addOutput({1, 4, 2, 5, 3, 6})); + .addInput({1, 2, 3, 4, 5, 6}) + .addInput({}) + .addOutput({1, 4, 2, 5, 3, 6})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); SUCCEED(); diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh index af79728..6799923 100755 --- a/tests/scripts/benchmark_nnapi.sh +++ b/tests/scripts/benchmark_nnapi.sh @@ -104,7 +104,7 @@ function profile_for_he_shed() $RUN_TEST_SH --driverbin=$BENCHMARK_DRIVER_BIN $MODEL > $LOG_FILE 2>&1 RET=$? if [[ $RET -ne 0 ]]; then - echo "Profiling $MODEL aborted in run#$j... exit code: $RET"xX + echo "Profiling $MODEL aborted in run#$j... 
exit code: $RET" exit $RET fi echo "finished" diff --git a/tests/scripts/test_scheduler_with_profiling.sh b/tests/scripts/test_scheduler_with_profiling.sh index c34e836..639cf3f 100755 --- a/tests/scripts/test_scheduler_with_profiling.sh +++ b/tests/scripts/test_scheduler_with_profiling.sh @@ -82,7 +82,7 @@ function run_benchmark_test() $RUN_TEST_SH --driverbin=$BENCHMARK_DRIVER_BIN $MODEL > $LOG_FILE 2>&1 RET=$? if [[ $RET -ne 0 ]]; then - echo "Profiling $MODEL aborted in run#$j... exit code: $RET"xX + echo "Profiling $MODEL aborted in run#$j... exit code: $RET" exit $RET fi echo "finished" diff --git a/tests/scripts/test_scheduler_with_profiling_android.sh b/tests/scripts/test_scheduler_with_profiling_android.sh index 48576a9..8c12423 100644 --- a/tests/scripts/test_scheduler_with_profiling_android.sh +++ b/tests/scripts/test_scheduler_with_profiling_android.sh @@ -128,7 +128,7 @@ function run_benchmark_test() $SHELL_CMD $RUN_TEST_SH --driverbin=$BENCHMARK_DRIVER_BIN $MODEL > $LOG_FILE 2>&1 RET=$? if [[ $RET -ne 0 ]]; then - echo "Profiling $MODEL aborted in run#$j... exit code: $RET"xX + echo "Profiling $MODEL aborted in run#$j... exit code: $RET" exit $RET fi echo "finished" diff --git a/tests/tools/nnpackage_run/src/allocation.h b/tests/tools/nnpackage_run/src/allocation.h index ea4672f..e7f1a9c 100644 --- a/tests/tools/nnpackage_run/src/allocation.h +++ b/tests/tools/nnpackage_run/src/allocation.h @@ -29,6 +29,7 @@ public: ~Allocation() { free(data_); } void *data() const { return data_; } void *alloc(uint64_t sz) { return data_ = malloc(sz); } + private: void *data_; }; diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc index 3929c8d..eeedcb7 100644 --- a/tests/tools/nnpackage_run/src/h5formatter.cc +++ b/tests/tools/nnpackage_run/src/h5formatter.cc @@ -135,7 +135,7 @@ void H5Formatter::loadInputs(const std::string &filename, std::vectordtype] * num_elems(ti); diff --git a/tests/tools/nnpackage_run/src/nnpackage_run.cc b/tests/tools/nnpackage_run/src/nnpackage_run.cc index 64623a8..5bde74f 100644 --- a/tests/tools/nnpackage_run/src/nnpackage_run.cc +++ b/tests/tools/nnpackage_run/src/nnpackage_run.cc @@ -70,7 +70,7 @@ int main(const int argc, char **argv) // TODO Apply verbose level to phases const int verbose = args.getVerboseLevel(); benchmark::Phases phases( - benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); + benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); nnfw_session *session = nullptr; NNPR_ENSURE_STATUS(nnfw_create_session(&session)); @@ -223,7 +223,7 @@ int main(const int argc, char **argv) } outputs[i].alloc(output_size_in_bytes); NNPR_ENSURE_STATUS( - nnfw_set_output(session, i, ti.dtype, outputs[i].data(), output_size_in_bytes)); + nnfw_set_output(session, i, ti.dtype, outputs[i].data(), output_size_in_bytes)); NNPR_ENSURE_STATUS(nnfw_set_output_layout(session, i, NNFW_LAYOUT_CHANNELS_LAST)); } @@ -231,31 +231,35 @@ int main(const int argc, char **argv) // only warmup. 
if (verbose == 0) { - phases.run("WARMUP", - [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, - args.getWarmupRuns()); - phases.run("EXECUTE", - [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, - args.getNumRuns(), true); + phases.run( + "WARMUP", + [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, + args.getWarmupRuns()); + phases.run( + "EXECUTE", + [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, + args.getNumRuns(), true); } else { - phases.run("WARMUP", - [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getWarmupRuns()); - phases.run("EXECUTE", - [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getNumRuns(), true); + phases.run( + "WARMUP", + [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... " + << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" + << std::endl; + }, + args.getWarmupRuns()); + phases.run( + "EXECUTE", + [&](const benchmark::Phase &, uint32_t) { NNPR_ENSURE_STATUS(nnfw_run(session)); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... " + << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" + << std::endl; + }, + args.getNumRuns(), true); } #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 diff --git a/tests/tools/nnpackage_run/src/randomgen.cc b/tests/tools/nnpackage_run/src/randomgen.cc index 3432420..a1fcf82 100644 --- a/tests/tools/nnpackage_run/src/randomgen.cc +++ b/tests/tools/nnpackage_run/src/randomgen.cc @@ -66,7 +66,7 @@ void RandomGenerator::generate(std::vector &inputs) std::exit(-1); } NNPR_ENSURE_STATUS( - nnfw_set_input(session_, i, ti.dtype, inputs[i].data(), input_size_in_bytes)); + nnfw_set_input(session_, i, ti.dtype, inputs[i].data(), input_size_in_bytes)); NNPR_ENSURE_STATUS(nnfw_set_input_layout(session_, i, NNFW_LAYOUT_CHANNELS_LAST)); } }; diff --git a/tests/tools/tflite_loader/CMakeLists.txt b/tests/tools/tflite_loader/CMakeLists.txt index 0fe1c69..6be3158 100644 --- a/tests/tools/tflite_loader/CMakeLists.txt +++ b/tests/tools/tflite_loader/CMakeLists.txt @@ -16,7 +16,7 @@ nnfw_find_package(Boost REQUIRED program_options system filesystem) add_executable(tflite_loader_test_tool ${SOURCES}) target_include_directories(tflite_loader_test_tool PRIVATE ${Boost_INCLUDE_DIRS}) -target_link_libraries(tflite_loader_test_tool onert_core onert tflite_loader) +target_link_libraries(tflite_loader_test_tool nnfw-dev) target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite nnfw_lib_misc) target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY}) diff --git a/tests/tools/tflite_loader/src/tflite_loader.cc b/tests/tools/tflite_loader/src/tflite_loader.cc index ce09921..f77570c 100644 --- a/tests/tools/tflite_loader/src/tflite_loader.cc +++ b/tests/tools/tflite_loader/src/tflite_loader.cc @@ -14,23 +14,20 @@ * limitations under the License. 
*/ -#include "tflite/ext/kernels/register.h" - #include "args.h" -#include "tflite/InterpreterSession.h" -#include "tflite/Assert.h" -#include "tflite/Diff.h" -#include "misc/tensor/IndexIterator.h" -#include -#include +#include +#include -#include "compiler/Compiler.h" -#include "exec/Execution.h" -#include "ir/Graph.h" +#include +#include -#include "tflite_loader.h" +#include +#include +#include +#include +#include #include const int RUN_FAILED = 1; @@ -41,8 +38,15 @@ using namespace nnfw::tflite; const int FILE_ERROR = 2; const float DIFFERENCE_THRESHOLD = 10e-5; +#define NNFW_ASSERT_FAIL(expr, msg) \ + if ((expr) != NNFW_STATUS_NO_ERROR) \ + { \ + std::cerr << msg << std::endl; \ + exit(-1); \ + } + // Read vector of floats from selected file -std::vector readData(const string &path) +void readData(const string &path, std::vector &dest) { std::ifstream in(path); if (!in.good()) @@ -53,100 +57,104 @@ std::vector readData(const string &path) in.seekg(0, std::ifstream::end); size_t len = in.tellg(); in.seekg(0, std::ifstream::beg); - assert(len % sizeof(float) == 0); - size_t size = len / sizeof(float); - std::vector vec(size); - for (size_t i = 0; i < size; ++i) + + assert(dest.size() == len); + in.read(reinterpret_cast(dest.data()), len); +} + +template +void randomData(nnfw::misc::RandomGenerator &randgen, std::vector &dest) +{ + size_t elements = dest.size() / sizeof(T); + assert(dest.size() % sizeof(T) == 0); + + std::vector vec(elements); + for (uint64_t i = 0; i < elements; i++) { - in.read(reinterpret_cast(&vec[i]), sizeof(float)); + vec[i] = randgen.generate(); } - return vec; + memcpy(dest.data(), vec.data(), elements * sizeof(T)); } -std::vector randomData(nnfw::misc::RandomGenerator &randgen, const uint64_t size) +void randomBoolData(nnfw::misc::RandomGenerator &randgen, std::vector &dest) { - std::vector vec(size); - for (uint64_t i = 0; i < size; i++) + size_t elements = dest.size(); + std::vector vec(elements); + for (uint64_t i = 0; i < elements; i++) { - vec[i] = randgen.generate(); + bool value = randgen.generate(); + dest[i] = value ? 1 : 0; } - return vec; } -void executeGraph(const std::shared_ptr &g, - const std::vector> &inputs, - std::vector> &outputs) +inline uint64_t num_elems(const nnfw_tensorinfo *ti) { - auto subgs = std::make_shared(); - subgs->push(onert::ir::SubgraphIndex{0}, g); - auto compiler = new onert::compiler::Compiler(subgs); - std::shared_ptr executors; - // Compilation - try + uint64_t n = 1; + for (uint32_t i = 0; i < ti->rank; ++i) { - executors = compiler->compile(); + n *= ti->dims[i]; } - catch (const std::exception &e) + return n; +} + +inline size_t sizeOfNnfwType(NNFW_TYPE type) +{ + switch (type) { - std::cerr << "[Execution] Can't compile model" << std::endl; - std::cerr << e.what() << std::endl; - exit(-1); + case NNFW_TYPE_TENSOR_BOOL: + case NNFW_TYPE_TENSOR_UINT8: + case NNFW_TYPE_TENSOR_QUANT8_ASYMM: + case NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED: + return 1; + case NNFW_TYPE_TENSOR_FLOAT32: + case NNFW_TYPE_TENSOR_INT32: + return 4; + case NNFW_TYPE_TENSOR_INT64: + return 8; + default: + throw std::runtime_error{"Invalid tensor type"}; } +} - std::cout << "[Execution] Graph compiled!" 
<< std::endl; - - auto execution = std::make_shared(executors); - - // Setting IO - try +template +bool compareBuffersExact(const T *ref_buf, const std::vector &act_buf, uint32_t index) +{ + bool match = true; + for (uint32_t e = 0; e < act_buf.size() / sizeof(T); e++) { - // Verify input shapes - auto num_inputs = inputs.size(); - for (size_t i = 0; i < num_inputs; i++) - { - auto input_operand_idx = g->getInputs().at(i); - auto input_shape = g->operands().at(input_operand_idx).shape(); - assert(inputs[i].size() == input_shape.num_elements()); - } + T ref = ref_buf[e]; + T act = reinterpret_cast(act_buf.data())[e]; - // Set output shapes - auto num_outputs = g->getOutputs().size(); - outputs.resize(num_outputs); - for (uint32_t i = 0; i < num_outputs; i++) + if (ref != act) { - auto output_operand_idx = g->getOutputs().at(i); - auto output_shape = g->operands().at(output_operand_idx).shape(); - outputs[i].resize(output_shape.num_elements()); + std::cerr << "Output #" << index << ", Element Index : " << e << ", ref: " << ref + << ", act: " << act << std::endl; + match = false; } - - for (size_t i = 0; i < num_inputs; i++) - execution->setInput(onert::ir::IOIndex(i), inputs[i].data(), - inputs[i].size() * sizeof(float)); - for (uint32_t i = 0; i < num_outputs; i++) - execution->setOutput(onert::ir::IOIndex(i), outputs[i].data(), - outputs[i].size() * sizeof(float)); - } - catch (const std::exception &e) - { - std::cerr << "[Execution] Can't set model IO" << std::endl; - std::cerr << e.what() << '\n'; - exit(-1); } - try - { - execution->execute(); - } - catch (const std::exception &e) + return match; +} + +bool compareBuffersExactBool(const uint8_t *ref_buf, const std::vector &act_buf, + uint32_t index) +{ + bool match = true; + for (uint32_t e = 0; e < act_buf.size() / sizeof(uint8_t); e++) { - std::cerr << "[Execution] Can't execute" << std::endl; - std::cerr << e.what() << '\n'; - exit(-1); + uint8_t ref_raw = ref_buf[e]; + bool ref = (ref_raw != 0 ? true : false); + uint8_t act_raw = reinterpret_cast(act_buf.data())[e]; + bool act = (act_raw != 0 ? true : false); + if (ref != act) + { + std::cerr << "Output #" << index << ", Element Index : " << e << ", ref: " << ref + << ", act: " << act << std::endl; + match = false; + } } - std::cout << "[Execution] Done!" << std::endl; - - delete compiler; + return match; } int main(const int argc, char **argv) @@ -163,44 +171,38 @@ int main(const int argc, char **argv) } std::cout << "[Execution] Stage start!" 
<< std::endl; - std::shared_ptr test_graph; // Loading - try + nnfw_session *onert_session = nullptr; + NNFW_ASSERT_FAIL(nnfw_create_session(&onert_session), "[ ERROR ] Failure during model load"); + if (onert_session == nullptr) { - test_graph = - onert::tflite_loader::loadModel(tflite_file.c_str())->at(onert::ir::SubgraphIndex{0}); - } - catch (std::exception &e) - { - std::cerr << "[ ERROR ] " - << "Failure during model load" << std::endl; - std::cerr << e.what() << std::endl; + std::cerr << "[ ERROR ] Failure to open session" << std::endl; exit(-1); } - // TODO: Support another input/output types - for (const auto &input_idx : test_graph->getInputs()) - { - const auto input_type = test_graph->operands().at(input_idx).typeInfo().type(); - assert(input_type == onert::ir::DataType::FLOAT32 && "Only FLOAT32 inputs are supported"); - } - for (const auto &output_idx : test_graph->getOutputs()) - { - const auto output_type = test_graph->operands().at(output_idx).typeInfo().type(); - assert(output_type == onert::ir::DataType::FLOAT32 && "Only FLOAT32 outputs are supported"); - } + NNFW_ASSERT_FAIL(nnfw_load_model_from_modelfile(onert_session, tflite_file.c_str()), + "[ ERROR ] Failure during model load"); + + uint32_t num_inputs; + uint32_t num_outputs; + NNFW_ASSERT_FAIL(nnfw_input_size(onert_session, &num_inputs), + "[ ERROR ] Failure during get model inputs"); + NNFW_ASSERT_FAIL(nnfw_output_size(onert_session, &num_outputs), + "[ ERROR ] Failure during get model outputs"); std::cout << "[Execution] Model is deserialized!" << std::endl; - auto num_inputs = test_graph->getInputs().size(); - std::vector> inputs(num_inputs); + + // Compile + nnfw_prepare(onert_session); + + std::cout << "[Execution] Model compiled!" << std::endl; + + // Prepare input/output data + std::vector> inputs(num_inputs); + std::vector> outputs(num_outputs); + bool generate_data = data_files.empty(); bool read_data = data_files.size() == num_inputs; - if (num_inputs == 0) - { - std::cerr << "[ ERROR ] " - << "No inputs in model => execution is not possible" << std::endl; - exit(1); - } if (!generate_data && !read_data) { std::cerr << "[ ERROR ] " @@ -210,32 +212,75 @@ int main(const int argc, char **argv) const int seed = 1; /* TODO Add an option for seed value */ nnfw::misc::RandomGenerator randgen{seed, 0.0f, 2.0f}; - try + + for (uint32_t i = 0; i < num_inputs; i++) { - for (uint32_t i = 0; i < num_inputs; i++) + nnfw_tensorinfo ti_input; + NNFW_ASSERT_FAIL(nnfw_input_tensorinfo(onert_session, i, &ti_input), + "[ ERROR ] Failure during get input data info"); + size_t input_size = num_elems(&ti_input) * sizeOfNnfwType(ti_input.dtype); + + inputs[i].resize(input_size); + + if (generate_data) { - if (generate_data) + switch (ti_input.dtype) { - uint64_t sz = - test_graph->operands().at(test_graph->getInputs().at(i)).shape().num_elements(); - inputs[i] = randomData(randgen, sz); + case NNFW_TYPE_TENSOR_BOOL: + randomBoolData(randgen, inputs[i]); + break; + case NNFW_TYPE_TENSOR_UINT8: + case NNFW_TYPE_TENSOR_QUANT8_ASYMM: + randomData(randgen, inputs[i]); + break; + case NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED: + randomData(randgen, inputs[i]); + break; + case NNFW_TYPE_TENSOR_FLOAT32: + randomData(randgen, inputs[i]); + break; + case NNFW_TYPE_TENSOR_INT32: + randomData(randgen, inputs[i]); + break; + case NNFW_TYPE_TENSOR_INT64: + randomData(randgen, inputs[i]); + break; + default: + std::cerr << "[ ERROR ] " + << "Unspported input data type" << std::endl; + exit(-1); + break; } - else /* read_data */ - inputs[i] = 
readData(data_files[i]); } + else /* read_data */ + readData(data_files[i], inputs[i]); + + NNFW_ASSERT_FAIL(nnfw_set_input(onert_session, i, ti_input.dtype, inputs[i].data(), input_size), + "[ ERROR ] Failure to set input tensor buffer"); } - catch (std::exception &e) + + std::cout << "[Execution] Input data is defined!" << std::endl; + + for (uint32_t i = 0; i < num_outputs; i++) { - std::cerr << "[ ERROR ] " - << "Failure during input data generation" << std::endl; - std::cerr << e.what() << std::endl; - exit(-1); + nnfw_tensorinfo ti_output; + NNFW_ASSERT_FAIL(nnfw_output_tensorinfo(onert_session, i, &ti_output), + "[ ERROR ] Failure during get output tensor info"); + + uint64_t output_elements = num_elems(&ti_output); + size_t output_size = output_elements * sizeOfNnfwType(ti_output.dtype); + outputs[i].resize(output_size); + + NNFW_ASSERT_FAIL( + nnfw_set_output(onert_session, i, ti_output.dtype, outputs[i].data(), output_size), + "[ ERROR ] Failure to set output tensor buffer"); } - std::cout << "[Execution] Input data is defined!" << std::endl; - std::vector> outputs; - // Run graph - executeGraph(test_graph, inputs, outputs); + // Execute + NNFW_ASSERT_FAIL(nnfw_run(onert_session), "[Execution] Can't execute"); + + std::cout << "[Execution] Done!" << std::endl; + // Compare with tflite std::cout << "[Comparison] Stage start!" << std::endl; // Read tflite model @@ -255,7 +300,7 @@ int main(const int argc, char **argv) std::cerr << e.what() << std::endl; exit(FILE_ERROR); } - interpreter->SetNumThreads(2); + interpreter->SetNumThreads(nnfw::misc::EnvVar("THREAD").asInt(-1)); auto sess = std::make_shared(interpreter.get()); sess->prepare(); @@ -263,7 +308,7 @@ int main(const int argc, char **argv) for (uint32_t i = 0; i < num_inputs; i++) { auto input_tensor = interpreter->tensor(interpreter->inputs().at(i)); - memcpy(input_tensor->data.f, inputs[i].data(), inputs[i].size() * sizeof(float)); + memcpy(input_tensor->data.uint8, inputs[i].data(), inputs[i].size()); } if (!sess->run()) { @@ -273,32 +318,69 @@ int main(const int argc, char **argv) std::cout << "[Comparison] TFLite run done!" << std::endl; // Calculate max difference over all outputs - float max_difference = 0.0f; - auto num_outputs = test_graph->getOutputs().size(); + float max_float_difference = 0.0f; + bool find_unmatched_output = false; + for (uint32_t out_idx = 0; out_idx < num_outputs; out_idx++) { - const auto &tflite_output_tensor = interpreter->tensor(interpreter->outputs().at(out_idx)); - const auto &nnfw_output_tensor = outputs[out_idx]; - - if (nnfw_output_tensor.size() != tflite_output_tensor->bytes / sizeof(float)) - std::cout << "[Comparison] Different size of outputs!" 
<< std::endl; - // Check max difference - float *tflite_out_ptr = tflite_output_tensor->data.f; - for (const auto &nnfw_out : nnfw_output_tensor) - { - if (std::abs(nnfw_out - *tflite_out_ptr) > max_difference) - max_difference = std::abs(nnfw_out - *tflite_out_ptr); + nnfw_tensorinfo ti; + nnfw_output_tensorinfo(onert_session, out_idx, &ti); + + bool matched = true; + // Check output tensor values + + const auto &ref_output = interpreter->tensor(interpreter->outputs().at(out_idx))->data; + const auto &output = outputs[out_idx]; - tflite_out_ptr++; + switch (ti.dtype) + { + case NNFW_TYPE_TENSOR_BOOL: + matched = compareBuffersExactBool(ref_output.uint8, output, out_idx); + break; + case NNFW_TYPE_TENSOR_UINT8: + case NNFW_TYPE_TENSOR_QUANT8_ASYMM: + matched = compareBuffersExact(ref_output.uint8, output, out_idx); + break; + case NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED: + matched = compareBuffersExact(ref_output.int8, output, out_idx); + break; + case NNFW_TYPE_TENSOR_INT32: + matched = compareBuffersExact(ref_output.i32, output, out_idx); + break; + case NNFW_TYPE_TENSOR_FLOAT32: + // TODO better way for handling FP error? + for (uint32_t e = 0; e < num_elems(&ti); e++) + { + float refval = ref_output.f[e]; + float val = reinterpret_cast(output.data())[e]; + if (std::abs(refval - val) > max_float_difference) + max_float_difference = std::abs(refval - val); + + if (max_float_difference > DIFFERENCE_THRESHOLD) + matched = false; + } + break; + case NNFW_TYPE_TENSOR_INT64: + matched = compareBuffersExact(ref_output.i64, output, out_idx); + break; + default: + throw std::runtime_error{"Invalid tensor type"}; } + + if (!matched) + find_unmatched_output = true; } // Print results - std::cout << "[Comparison] Max difference: " << max_difference << std::endl; + std::cout << "[Comparison] Max float difference: " << max_float_difference << std::endl; int ret = 0; - if (max_difference > DIFFERENCE_THRESHOLD) + if (find_unmatched_output) { - std::cout << "[Comparison] Outputs is not equal!" << std::endl; + std::cout << "[Comparison] outputs is not equal!" << std::endl; + if (max_float_difference > DIFFERENCE_THRESHOLD) + { + std::cout << "[Comparison] Float outputs is not equal!" << std::endl; + } ret = 1; } else @@ -307,5 +389,7 @@ int main(const int argc, char **argv) } std::cout << "[Comparison] Done!" 
<< std::endl; + nnfw_close_session(onert_session); + return ret; } diff --git a/tests/tools/tflite_run/src/bin_image.cc b/tests/tools/tflite_run/src/bin_image.cc index 16d4c94..fadece0 100644 --- a/tests/tools/tflite_run/src/bin_image.cc +++ b/tests/tools/tflite_run/src/bin_image.cc @@ -20,7 +20,7 @@ #include "bin_image.h" BinImage::BinImage(unsigned int width, unsigned int height, unsigned int channels) - : _width(width), _height(height), _channels(channels) + : _width(width), _height(height), _channels(channels) { } diff --git a/tests/tools/tflite_run/src/tensor_loader.cc b/tests/tools/tflite_run/src/tensor_loader.cc index 93d9e2f..a1a9433 100644 --- a/tests/tools/tflite_run/src/tensor_loader.cc +++ b/tests/tools/tflite_run/src/tensor_loader.cc @@ -26,7 +26,7 @@ namespace TFLiteRun { TensorLoader::TensorLoader(tflite::Interpreter &interpreter) - : _interpreter(interpreter), _raw_data(nullptr) + : _interpreter(interpreter), _raw_data(nullptr) { } diff --git a/tests/tools/tflite_run/src/tflite_run.cc b/tests/tools/tflite_run/src/tflite_run.cc index e72966d..d42f992 100644 --- a/tests/tools/tflite_run/src/tflite_run.cc +++ b/tests/tools/tflite_run/src/tflite_run.cc @@ -86,7 +86,7 @@ int main(const int argc, char **argv) // TODO Apply verbose level to phases const int verbose = args.getVerboseLevel(); benchmark::Phases phases( - benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); + benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); std::unique_ptr model; std::unique_ptr interpreter; @@ -156,7 +156,7 @@ int main(const int argc, char **argv) for (uint32_t axis = 0; axis < tensor->dims->size; axis++, offset++) { new_dim[axis] = - ((offset < dim_values) ? args.getInputShapes()[offset] : tensor->dims->data[axis]); + ((offset < dim_values) ? args.getInputShapes()[offset] : tensor->dims->data[axis]); } interpreter->ResizeInputTensor(id, new_dim); @@ -208,12 +208,12 @@ int main(const int argc, char **argv) int32_t value = 0; nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - // TODO Generate random values - // Gather operation: index should be within input coverage. - tensor_view.at(ind) = value; - value++; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + // TODO Generate random values + // Gather operation: index should be within input coverage. 
+ tensor_view.at(ind) = value; + value++; + }; } else if (tensor->type == kTfLiteUInt8) { @@ -221,16 +221,16 @@ int main(const int argc, char **argv) auto tensor_view = nnfw::tflite::TensorView::make(*interpreter, o); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tensor_view.shape(), std::bind(fp, randgen, _1, _2)); nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); - tensor_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); + tensor_view.at(ind) = value; + }; } else if (tensor->type == kTfLiteBool) { @@ -238,16 +238,16 @@ int main(const int argc, char **argv) auto tensor_view = nnfw::tflite::TensorView::make(*interpreter, o); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tensor_view.shape(), std::bind(fp, randgen, _1, _2)); nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); - tensor_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); + tensor_view.at(ind) = value; + }; } else { @@ -277,27 +277,28 @@ int main(const int argc, char **argv) // only warmup. if (verbose == 0) { - phases.run("WARMUP", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, - args.getWarmupRuns()); - phases.run("EXECUTE", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, - args.getNumRuns(), true); + phases.run( + "WARMUP", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, args.getWarmupRuns()); + phases.run( + "EXECUTE", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, args.getNumRuns(), true); } else { - phases.run("WARMUP", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getWarmupRuns()); - phases.run("EXECUTE", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getNumRuns(), true); + phases.run( + "WARMUP", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... " + << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" + << std::endl; + }, + args.getWarmupRuns()); + phases.run( + "EXECUTE", [&](const benchmark::Phase &, uint32_t) { sess->run(); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... 
" + << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" << std::endl; + }, + args.getNumRuns(), true); } sess->teardown(); diff --git a/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc index d44ea60..e9fb04c 100644 --- a/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc +++ b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc @@ -86,7 +86,7 @@ int main(const int argc, char **argv) // TODO Apply verbose level to phases const int verbose = args.getVerboseLevel(); benchmark::Phases phases( - benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); + benchmark::PhaseOption{args.getMemoryPoll(), args.getGpuMemoryPoll(), args.getRunDelay()}); std::unique_ptr model; std::unique_ptr interpreter; @@ -102,8 +102,8 @@ int main(const int argc, char **argv) } else { - model = tflite::FlatBufferModel::BuildFromFile(args.getTFLiteFilename().c_str(), - &error_reporter); + model = + tflite::FlatBufferModel::BuildFromFile(args.getTFLiteFilename().c_str(), &error_reporter); } if (model == nullptr) { @@ -153,12 +153,12 @@ int main(const int argc, char **argv) int32_t value = 0; nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - // TODO Generate random values - // Gather operation: index should be within input coverage. - tensor_view.at(ind) = value; - value++; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + // TODO Generate random values + // Gather operation: index should be within input coverage. + tensor_view.at(ind) = value; + value++; + }; } else if (tensor->type == kTfLiteUInt8) { @@ -168,11 +168,11 @@ int main(const int argc, char **argv) uint8_t value = 0; nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - // TODO Generate random values - tensor_view.at(ind) = value; - value = (value + 1) & 0xFF; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + // TODO Generate random values + tensor_view.at(ind) = value; + value = (value + 1) & 0xFF; + }; } else if (tensor->type == kTfLiteBool) { @@ -180,16 +180,16 @@ int main(const int argc, char **argv) auto tensor_view = TFLiteVanillaRun::TensorView::make(*interpreter, o); auto fp = static_cast( - &nnfw::misc::RandomGenerator::generate); + const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>( + &nnfw::misc::RandomGenerator::generate); const nnfw::misc::tensor::Object data(tensor_view.shape(), std::bind(fp, randgen, _1, _2)); nnfw::misc::tensor::iterate(tensor_view.shape()) - << [&](const nnfw::misc::tensor::Index &ind) { - const auto value = data.at(ind); - tensor_view.at(ind) = value; - }; + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); + tensor_view.at(ind) = value; + }; } else { @@ -214,27 +214,30 @@ int main(const int argc, char **argv) // only warmup. 
if (verbose == 0) { - phases.run("WARMUP", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, - args.getWarmupRuns()); - phases.run("EXECUTE", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, - args.getNumRuns(), true); + phases.run( + "WARMUP", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, + args.getWarmupRuns()); + phases.run( + "EXECUTE", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, + args.getNumRuns(), true); } else { - phases.run("WARMUP", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getWarmupRuns()); - phases.run("EXECUTE", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, - [&](const benchmark::Phase &phase, uint32_t nth) { - std::cout << "... " - << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" - << std::endl; - }, - args.getNumRuns(), true); + phases.run( + "WARMUP", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... " + << "warmup " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" + << std::endl; + }, + args.getWarmupRuns()); + phases.run( + "EXECUTE", [&](const benchmark::Phase &, uint32_t) { interpreter->Invoke(); }, + [&](const benchmark::Phase &phase, uint32_t nth) { + std::cout << "... " + << "run " << nth + 1 << " takes " << phase.time[nth] / 1e3 << " ms" << std::endl; + }, + args.getNumRuns(), true); } std::cout << "output tensor indices = ["; diff --git a/tools/.clang-format b/tools/.clang-format new file mode 120000 index 0000000..0ff66f3 --- /dev/null +++ b/tools/.clang-format @@ -0,0 +1 @@ +../.clang-format.8 \ No newline at end of file diff --git a/tools/cross/install_rootfs.sh b/tools/cross/install_rootfs.sh index 223d675..5a65dac 100755 --- a/tools/cross/install_rootfs.sh +++ b/tools/cross/install_rootfs.sh @@ -29,7 +29,7 @@ __UbuntuPackages="build-essential" # other development supports __UbuntuPackages+=" ocl-icd-opencl-dev" __UbuntuPackages+=" libhdf5-dev" -__UbuntuBoostPackages=" llibboost-all-dev" +__UbuntuBoostPackages=" libboost-all-dev" # symlinks fixer __UbuntuPackages+=" symlinks" diff --git a/tools/kbenchmark/kernels/acl_cl/Convolution.cpp b/tools/kbenchmark/kernels/acl_cl/Convolution.cpp index 37d179a..31cda05 100644 --- a/tools/kbenchmark/kernels/acl_cl/Convolution.cpp +++ b/tools/kbenchmark/kernels/acl_cl/Convolution.cpp @@ -230,12 +230,11 @@ inline nonius::benchmark_registry &local_benchmark_registry() } // namespace -#define NONIUS_LOCAL_BENCHMARK(name, ...) \ - namespace \ - { \ - static ::nonius::benchmark_registrar \ - NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, \ - __VA_ARGS__); \ +#define NONIUS_LOCAL_BENCHMARK(name, ...) 
\ + namespace \ + { \ + static ::nonius::benchmark_registrar \ + NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, __VA_ARGS__); \ } NONIUS_LOCAL_BENCHMARK("CLDirectConvolutionLayer_NCHW", [](nonius::chronometer meter) { diff --git a/tools/kbenchmark/kernels/acl_cl/TransposeConv.cpp b/tools/kbenchmark/kernels/acl_cl/TransposeConv.cpp index 8278a61..c2ac305 100644 --- a/tools/kbenchmark/kernels/acl_cl/TransposeConv.cpp +++ b/tools/kbenchmark/kernels/acl_cl/TransposeConv.cpp @@ -207,12 +207,11 @@ inline nonius::benchmark_registry &local_benchmark_registry() } // namespace -#define NONIUS_LOCAL_BENCHMARK(name, ...) \ - namespace \ - { \ - static ::nonius::benchmark_registrar \ - NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, \ - __VA_ARGS__); \ +#define NONIUS_LOCAL_BENCHMARK(name, ...) \ + namespace \ + { \ + static ::nonius::benchmark_registrar \ + NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, __VA_ARGS__); \ } NONIUS_LOCAL_BENCHMARK("CLDeconvolutionLayer_NCHW", [](nonius::chronometer meter) { diff --git a/tools/kbenchmark/kernels/acl_neon/Convolution.cpp b/tools/kbenchmark/kernels/acl_neon/Convolution.cpp index 2d19cb2..1656186 100644 --- a/tools/kbenchmark/kernels/acl_neon/Convolution.cpp +++ b/tools/kbenchmark/kernels/acl_neon/Convolution.cpp @@ -223,12 +223,11 @@ inline nonius::benchmark_registry &local_benchmark_registry() } // namespace -#define NONIUS_LOCAL_BENCHMARK(name, ...) \ - namespace \ - { \ - static ::nonius::benchmark_registrar \ - NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, \ - __VA_ARGS__); \ +#define NONIUS_LOCAL_BENCHMARK(name, ...) \ + namespace \ + { \ + static ::nonius::benchmark_registrar \ + NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, __VA_ARGS__); \ } NONIUS_LOCAL_BENCHMARK("NEDirectConvolutionLayer_NCHW", [](nonius::chronometer meter) { diff --git a/tools/kbenchmark/kernels/acl_neon/TransposeConv.cpp b/tools/kbenchmark/kernels/acl_neon/TransposeConv.cpp index 0878499..892547d 100644 --- a/tools/kbenchmark/kernels/acl_neon/TransposeConv.cpp +++ b/tools/kbenchmark/kernels/acl_neon/TransposeConv.cpp @@ -199,12 +199,11 @@ inline nonius::benchmark_registry &local_benchmark_registry() } // namespace -#define NONIUS_LOCAL_BENCHMARK(name, ...) \ - namespace \ - { \ - static ::nonius::benchmark_registrar \ - NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, \ - __VA_ARGS__); \ +#define NONIUS_LOCAL_BENCHMARK(name, ...) \ + namespace \ + { \ + static ::nonius::benchmark_registrar \ + NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, __VA_ARGS__); \ } NONIUS_LOCAL_BENCHMARK("NEDeconvolutionLayer_NCHW", [](nonius::chronometer meter) { diff --git a/tools/kernel_report/kernel_report.py b/tools/kernel_report/kernel_report.py index b8a601e..8940e88 100755 --- a/tools/kernel_report/kernel_report.py +++ b/tools/kernel_report/kernel_report.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import argparse +from os.path import dirname, realpath, join class Backend: @@ -28,11 +28,8 @@ class Backend: class KernelReporter(object): def __init__(self, args): - # TODO: Remove os defendency - '/' - if args.base[0] != '/': - self.onertBase = os.getcwd() + '/' + args.base - else: - self.onertBase = args.base + root_path = dirname(dirname(dirname(realpath(__file__)))) + self.onertBase = join(root_path, "runtime", "onert") if args.md5: self.printMD5 = True else: @@ -181,7 +178,6 @@ if __name__ == '__main__': default='cpu,acl_cl,acl_neon', help="backend list to report (use comma)") arg_parser.add_argument("--md5", action='store_true', help="Print for md5") - arg_parser.add_argument("base", type=str, help="onert base directory") args = arg_parser.parse_args() report = KernelReporter(args) diff --git a/tools/nnapi_quickcheck/CMakeLists.txt b/tools/nnapi_quickcheck/CMakeLists.txt deleted file mode 100644 index c88155a..0000000 --- a/tools/nnapi_quickcheck/CMakeLists.txt +++ /dev/null @@ -1,82 +0,0 @@ -if(NOT BUILD_NNAPI_QUICKCHECK) - return() -endif(NOT BUILD_NNAPI_QUICKCHECK) - -file(GLOB_RECURSE NNAPI_QUICKCHECK_LIB_SOURCES "lib/*.cpp") -file(GLOB_RECURSE NNAPI_QUICKCHECK_LIB_TESTS "lib/*.test.cpp") -list(REMOVE_ITEM NNAPI_QUICKCHECK_LIB_SOURCES ${NNAPI_QUICKCHECK_LIB_TESTS}) - -add_library(nnapi_quickcheck_common ${NNAPI_QUICKCHECK_LIB_SOURCES}) -target_include_directories(nnapi_quickcheck_common PUBLIC "inc") -target_link_libraries(nnapi_quickcheck_common nnfw_lib_misc) -target_link_libraries(nnapi_quickcheck_common nnfw_lib_tflite) - -add_executable(nnapi_quickcheck_lib_env_test "lib/env.test.cpp") -target_link_libraries(nnapi_quickcheck_lib_env_test nnapi_quickcheck_common) - -function(add_nnapi_quickcheck NAME) - add_executable(nnapi_quickcheck_${NAME} "tests/${NAME}.cpp") - nnfw_find_package(GTest) - target_link_libraries(nnapi_quickcheck_${NAME} gtest gtest_main pthread) - target_link_libraries(nnapi_quickcheck_${NAME} nnapi_quickcheck_common) -endfunction(add_nnapi_quickcheck) - -add_nnapi_quickcheck(add_1) -add_nnapi_quickcheck(add_2) -add_nnapi_quickcheck(add_3) -add_nnapi_quickcheck(add_4) -add_nnapi_quickcheck(add_5) -add_nnapi_quickcheck(add_6) -add_nnapi_quickcheck(add_7) -add_nnapi_quickcheck(add_8) -add_nnapi_quickcheck(add_9) -add_nnapi_quickcheck(add_quan_1) -add_nnapi_quickcheck(div_1) -add_nnapi_quickcheck(div_2) -add_nnapi_quickcheck(sub_1) -add_nnapi_quickcheck(sub_2) -add_nnapi_quickcheck(sub_3) -add_nnapi_quickcheck(sub_4) -add_nnapi_quickcheck(sub_5) -add_nnapi_quickcheck(sub_6) -add_nnapi_quickcheck(mul_1) -add_nnapi_quickcheck(mul_2) -add_nnapi_quickcheck(mul_quan_1) -add_nnapi_quickcheck(relu_1) -add_nnapi_quickcheck(relu_quan_1) -add_nnapi_quickcheck(relu_2) -add_nnapi_quickcheck(relu_3) -add_nnapi_quickcheck(relu6_1) -add_nnapi_quickcheck(relu6_quan_1) -add_nnapi_quickcheck(relu1_1) -add_nnapi_quickcheck(conv_1) -add_nnapi_quickcheck(conv_quan_1) -add_nnapi_quickcheck(dconv_1) -add_nnapi_quickcheck(dconv_quan_1) -add_nnapi_quickcheck(max_pool_1) -add_nnapi_quickcheck(max_pool_quan_1) -add_nnapi_quickcheck(avg_pool_1) -add_nnapi_quickcheck(avg_pool_quan_1) -add_nnapi_quickcheck(concat_1) -add_nnapi_quickcheck(concat_quan_1) -add_nnapi_quickcheck(reshape_1) -add_nnapi_quickcheck(reshape_quan_1) -add_nnapi_quickcheck(fully_connected_1) -add_nnapi_quickcheck(fully_connected_quan_1) -add_nnapi_quickcheck(softmax_1) -add_nnapi_quickcheck(softmax_2) -add_nnapi_quickcheck(softmax_quan_1) -add_nnapi_quickcheck(resize_bilinear_1) 
-add_nnapi_quickcheck(topk_v2_1) -add_nnapi_quickcheck(cast_1) -add_nnapi_quickcheck(cast_q_to_f_1) -add_nnapi_quickcheck(cast_2) -add_nnapi_quickcheck(gather_1) -add_nnapi_quickcheck(gather_2) -add_nnapi_quickcheck(dequantize_1) -add_nnapi_quickcheck(tanh_1) -add_nnapi_quickcheck(logistic_quan_1) -add_nnapi_quickcheck(split_1) -add_nnapi_quickcheck(split_2) -add_nnapi_quickcheck(split_3) -add_nnapi_quickcheck(split_4) diff --git a/tools/nnapi_quickcheck/inc/env.h b/tools/nnapi_quickcheck/inc/env.h deleted file mode 100644 index c2efceb..0000000 --- a/tools/nnapi_quickcheck/inc/env.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ENV_UTILS_H__ -#define __ENV_UTILS_H__ - -#include - -#include - -class IntVar -{ -public: - IntVar(const std::string &name, int32_t value); - -public: - int32_t operator()(void) const { return _value; } - -private: - int32_t _value; -}; - -class FloatVar -{ -public: - FloatVar(const std::string &name, float value); - -public: - float operator()(void) const { return _value; } - -private: - float _value; -}; - -class StrVar -{ -public: - StrVar(const std::string &name, const std::string &value); - -public: - const std::string &operator()(void) const { return _value; } - -private: - std::string _value; -}; - -#endif // __ENV_UTILS_H__ diff --git a/tools/nnapi_quickcheck/lib/env.cpp b/tools/nnapi_quickcheck/lib/env.cpp deleted file mode 100644 index 005e876..0000000 --- a/tools/nnapi_quickcheck/lib/env.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "env.h" - -#include "misc/environment.h" - -// -// Integer variable -// -IntVar::IntVar(const std::string &name, int32_t value) : _value{value} -{ - nnfw::misc::env::IntAccessor{name}.access(_value); -} - -// -// Float variable -// -FloatVar::FloatVar(const std::string &name, float value) : _value{value} -{ - nnfw::misc::env::FloatAccessor{name}.access(_value); -} - -// -// String variable -// -#include - -StrVar::StrVar(const std::string &name, const std::string &value) : _value{value} -{ - auto env = std::getenv(name.c_str()); - - if (env) - { - _value = std::string{env}; - } -} diff --git a/tools/nnapi_quickcheck/tests/add_1.cpp b/tools/nnapi_quickcheck/tests/add_1.cpp deleted file mode 100644 index f5363f9..0000000 --- a/tools/nnapi_quickcheck/tests/add_1.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_1.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); 
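// --- Illustrative sketch, not part of the original add_1.cpp ----------------
// The *_Value() objects printed here come from the INT_VALUE entries in
// add_1.lst: each IntVar (see the deleted env.h/env.cpp above) reads an
// environment variable of the same name, so a tensor shape can be overridden
// at run time, e.g. LEFT_H=32 ./nnapi_quickcheck_add_1, without rebuilding.
// A minimal, self-contained sketch of that lookup pattern (generic, not the
// nnfw::misc::env API itself):
#include <cstdint>
#include <cstdlib>
#include <string>

static int32_t env_int(const std::string &name, int32_t fallback)
{
  const char *s = std::getenv(name.c_str()); // nullptr when the variable is unset
  return s ? static_cast<int32_t>(std::strtol(s, nullptr, 10)) : fallback;
}
// e.g. const int32_t LEFT_H = env_int("LEFT_H", 16);
// -----------------------------------------------------------------------------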
- PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_1.lst b/tools/nnapi_quickcheck/tests/add_1.lst deleted file mode 100644 index fa17cae..0000000 --- a/tools/nnapi_quickcheck/tests/add_1.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 16) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/add_2.cpp b/tools/nnapi_quickcheck/tests/add_2.cpp deleted file mode 100644 index fe4d12f..0000000 --- a/tools/nnapi_quickcheck/tests/add_2.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_2.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure left data - const uint32_t left_size = LEFT_N * LEFT_C * LEFT_H * LEFT_W; - float left_data[left_size] = { - 0.0f, - }; - - // Fill left data with random data - { - std::normal_distribution left_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < left_size; ++off) - { - left_data[off++] = left_dist(random); - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
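// Note (illustrative, not part of the original file): for kTfLiteFloat32
// tensors the scale/zero_point fields are not consulted by the float kernels;
// they only matter for quantized tensor types such as kTfLiteUInt8, so a
// default-initialized TfLiteQuantizationParams is sufficient here. See
// add_quan_1.cpp further below for a case where the parameters take effect.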
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadOnly(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization, - reinterpret_cast(left_data), - left_size * sizeof(float)); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read LHS from Tensor #1 - // - Read RHS from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_2.lst b/tools/nnapi_quickcheck/tests/add_2.lst deleted file mode 100644 index fa17cae..0000000 --- a/tools/nnapi_quickcheck/tests/add_2.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 16) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/add_3.cpp b/tools/nnapi_quickcheck/tests/add_3.cpp deleted file mode 100644 index ce409cc..0000000 --- a/tools/nnapi_quickcheck/tests/add_3.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" - -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/TensorShapeUtils.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_3, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - -#define STR_VALUE(NAME, VALUE) StrVar NAME##_Value(#NAME, VALUE); -#include "add_3.lst" -#undef STR_VALUE - - const auto LHS_SHAPE = nnfw::misc::tensor::Shape::from(LHS_SHAPE_Value()); - const auto RHS_SHAPE = nnfw::misc::tensor::Shape::from(RHS_SHAPE_Value()); - const auto OUT_SHAPE = nnfw::tflite::broadcast(LHS_SHAPE, RHS_SHAPE); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LHS_SHAPE); - PRINT_VALUE(RHS_SHAPE); - PRINT_VALUE(OUT_SHAPE); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - using nnfw::tflite::as_dims; - - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - as_dims(OUT_SHAPE), quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - as_dims(LHS_SHAPE), quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - as_dims(RHS_SHAPE), quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = 0; - param.tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(param.verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(param.tolerance); - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_3.lst b/tools/nnapi_quickcheck/tests/add_3.lst deleted file mode 100644 index 1981db4..0000000 --- a/tools/nnapi_quickcheck/tests/add_3.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef STR_VALUE -#error "STR_VALUE should be 
defined" -#endif // STR_VALUE - -STR_VALUE(LHS_SHAPE, "1,3,16,16") -STR_VALUE(RHS_SHAPE, "1,3,16,16") diff --git a/tools/nnapi_quickcheck/tests/add_4.cpp b/tools/nnapi_quickcheck/tests/add_4.cpp deleted file mode 100644 index b1231dd..0000000 --- a/tools/nnapi_quickcheck/tests/add_4.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_4, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_4.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
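// --- Illustrative sketch, not part of the original add_4.cpp ----------------
// add_4 exercises broadcasting: RIGHT_H is 1 in add_4.lst, and the expected
// output shape is computed above as the element-wise std::max of the two input
// shapes. That rule is only valid when, for every axis, the two extents are
// equal or one of them is 1; a hedged sketch of the per-axis check:
#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t broadcast_dim(int32_t lhs, int32_t rhs)
{
  assert(lhs == rhs || lhs == 1 || rhs == 1); // otherwise the ADD is ill-formed
  return std::max(lhs, rhs);
}
// e.g. OFM_H = broadcast_dim(LEFT_H /* 16 */, RIGHT_H /* 1 */) yields 16
// -----------------------------------------------------------------------------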
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_4.lst b/tools/nnapi_quickcheck/tests/add_4.lst deleted file mode 100644 index 6b28900..0000000 --- a/tools/nnapi_quickcheck/tests/add_4.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 2) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 8) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 2) -INT_VALUE(RIGHT_H, 1) -INT_VALUE(RIGHT_W, 8) diff --git a/tools/nnapi_quickcheck/tests/add_5.cpp b/tools/nnapi_quickcheck/tests/add_5.cpp deleted file mode 100644 index f900153c..0000000 --- a/tools/nnapi_quickcheck/tests/add_5.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_5, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_5.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT = RIGHT_Value(); - - const int32_t OFM_N = LEFT_N; - const int32_t OFM_C = LEFT_C; - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT} /* dims */, quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_5.lst b/tools/nnapi_quickcheck/tests/add_5.lst deleted file mode 100644 index eb316b6..0000000 --- a/tools/nnapi_quickcheck/tests/add_5.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 8) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT, 1) diff --git a/tools/nnapi_quickcheck/tests/add_6.cpp b/tools/nnapi_quickcheck/tests/add_6.cpp deleted file mode 100644 index 83b87ef..0000000 --- a/tools/nnapi_quickcheck/tests/add_6.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_6, simple_test) -{ - int verbose = 1; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_6.lst" -#undef INT_VALUE - - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT = RIGHT_Value(); - - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_H, LEFT_W} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT} /* dims */, quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_6.lst b/tools/nnapi_quickcheck/tests/add_6.lst deleted file mode 100644 index 75db4c8..0000000 --- a/tools/nnapi_quickcheck/tests/add_6.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_H, 8) -INT_VALUE(LEFT_W, 2) - -INT_VALUE(RIGHT, 1) diff --git a/tools/nnapi_quickcheck/tests/add_7.cpp b/tools/nnapi_quickcheck/tests/add_7.cpp deleted file mode 100644 index 732320f..0000000 --- a/tools/nnapi_quickcheck/tests/add_7.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_7, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_7.lst" -#undef INT_VALUE - - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_C = LEFT_C; - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_7.lst b/tools/nnapi_quickcheck/tests/add_7.lst deleted file mode 100644 index 1dc8b61..0000000 --- a/tools/nnapi_quickcheck/tests/add_7.lst +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 8) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 8) -INT_VALUE(RIGHT_W, 1) diff --git a/tools/nnapi_quickcheck/tests/add_8.cpp b/tools/nnapi_quickcheck/tests/add_8.cpp deleted file mode 100644 index d89e977..0000000 --- a/tools/nnapi_quickcheck/tests/add_8.cpp +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_8, simple_test) -{ - int verbose = 1; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_8.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure left data - const uint32_t left_size = LEFT_N * LEFT_C * LEFT_H * LEFT_W; - const uint32_t right_size = RIGHT_N * RIGHT_C * RIGHT_H * RIGHT_W; - float left_data[left_size] = { - 0.0f, - }; - float right_data[right_size] = { - 0.0f, - }; - - // Fill left data with random data - { - std::normal_distribution left_dist(-1.0f, +1.0f); - int value = 10; - for (uint32_t off = 0; off < left_size; ++off) - { - left_data[off] = value; - std::cout << left_data[off] << std::endl; - } - value = 1; - for (uint32_t off = 0; off < right_size; ++off) - { - right_data[off] = value++; - std::cout << right_data[off] << std::endl; - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadOnly(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization, - reinterpret_cast(left_data), - left_size * sizeof(float)); - - // Configure input(s) - interp.SetTensorParametersReadOnly( - 2, kTfLiteFloat32 /* type */, "right" /* name */, {RIGHT_C} /* dims */, quantization, - //{RIGHT_W, RIGHT_C} /* dims */, quantization, - reinterpret_cast(right_data), right_size * sizeof(float)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read LHS from Tensor #1 - // - Read RHS from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_8.lst b/tools/nnapi_quickcheck/tests/add_8.lst deleted file mode 100644 index 3119c7f..0000000 --- a/tools/nnapi_quickcheck/tests/add_8.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_H, 3) -INT_VALUE(LEFT_W, 2) -INT_VALUE(LEFT_C, 4) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_H, 1) -INT_VALUE(RIGHT_W, 1) -INT_VALUE(RIGHT_C, 4) diff --git a/tools/nnapi_quickcheck/tests/add_9.cpp b/tools/nnapi_quickcheck/tests/add_9.cpp deleted file mode 100644 index fd4e1f9..0000000 --- a/tools/nnapi_quickcheck/tests/add_9.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_9, simple_test) -{ - int verbose = 1; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_9.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_VALUE(LEFT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_VALUE(RIGHT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); - PRINT_VALUE(OFM_C); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure left data - const uint32_t left_size = LEFT_N * LEFT_C * LEFT_H * LEFT_W; - const uint32_t right_size = RIGHT_N * RIGHT_C * RIGHT_H * RIGHT_W; - float left_data[left_size] = { - 0.0f, - }; - float right_data[right_size] = { - 0.0f, - }; - - // Fill left data with random data - { - std::normal_distribution left_dist(-1.0f, +1.0f); - float value = 10.0f; - for (uint32_t off = 0; off < left_size; ++off) - { - left_data[off] = value; - } - value = 1.0f; - for (uint32_t off = 0; off < right_size; ++off) - { - right_data[off] = value++; - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadOnly( - 1, kTfLiteFloat32 /* type */, "left" /* name */, {LEFT_W, LEFT_C} /* dims */, quantization, - reinterpret_cast(left_data), left_size * sizeof(float)); - - // Configure input(s) - interp.SetTensorParametersReadOnly(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization, reinterpret_cast(right_data), - right_size * sizeof(float)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read LHS from Tensor #1 - // - Read RHS from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_9.lst b/tools/nnapi_quickcheck/tests/add_9.lst deleted file mode 100644 index 52a1f1a..0000000 --- a/tools/nnapi_quickcheck/tests/add_9.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_H, 1) -INT_VALUE(LEFT_W, 3) -INT_VALUE(LEFT_C, 4) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_H, 2) -INT_VALUE(RIGHT_W, 3) -INT_VALUE(RIGHT_C, 4) diff --git a/tools/nnapi_quickcheck/tests/add_quan_1.cpp b/tools/nnapi_quickcheck/tests/add_quan_1.cpp deleted file mode 100644 index e3d8512..0000000 --- a/tools/nnapi_quickcheck/tests/add_quan_1.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_add_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "add_quan_1.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
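// --- Illustrative sketch, not part of the original add_quan_1.cpp -----------
// Here the parameters do matter: the tensors are kTfLiteUInt8, and the comment
// above gives the asymmetric scheme  real = scale * (q - zero_point).  With the
// input scale 1.0 and output scale 2.0 chosen below, two uint8 inputs whose
// real-valued sum can reach 2 * 255 still requantize into the uint8 output
// range. Minimal helper sketch of that round trip:
#include <algorithm>
#include <cmath>
#include <cstdint>

static float dequantize(uint8_t q, float scale, int32_t zero_point)
{
  return scale * (static_cast<int32_t>(q) - zero_point);
}

static uint8_t quantize(float real, float scale, int32_t zero_point)
{
  const float q = std::round(real / scale) + static_cast<float>(zero_point);
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}
// e.g. quantize(dequantize(200, 1.0f, 0) + dequantize(150, 1.0f, 0), 2.0f, 0) == 175
// -----------------------------------------------------------------------------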
- TfLiteQuantizationParams quantization; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - quantization.scale = 2.0f; - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - quantization.scale = 1.0f; - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteUInt8 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_ADD, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/add_quan_1.lst b/tools/nnapi_quickcheck/tests/add_quan_1.lst deleted file mode 100644 index fa17cae..0000000 --- a/tools/nnapi_quickcheck/tests/add_quan_1.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 16) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/avg_pool_1.cpp b/tools/nnapi_quickcheck/tests/avg_pool_1.cpp deleted file mode 100644 index 052c689..0000000 --- a/tools/nnapi_quickcheck/tests/avg_pool_1.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_avg_pool_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "avg_pool_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = (IFM_H - KER_H) + 1; - const int32_t OFM_W = (IFM_W - KER_W) + 1; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
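// --- Illustrative sketch, not part of the original avg_pool_1.cpp -----------
// The OFM_H/OFM_W values computed above are the kTfLitePaddingValid output
// size for the stride-1 pooling configured below; the general VALID-padding
// rule is out = (in - filter) / stride + 1 (integer division):
#include <cstdint>

static int32_t valid_output_size(int32_t in, int32_t filter, int32_t stride)
{
  return (in - filter) / stride + 1;
}
// e.g. with IFM_H = 3, KER_H = 3, stride 1: valid_output_size(3, 3, 1) == 1
// -----------------------------------------------------------------------------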
- TfLiteQuantizationParams quantization = make_default_quantization(); - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Max Pooling Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = 1; - param->stride_height = 1; - param->filter_width = KER_W; - param->filter_height = KER_H; - param->activation = kTfLiteActNone; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_AVERAGE_POOL_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/avg_pool_1.lst b/tools/nnapi_quickcheck/tests/avg_pool_1.lst deleted file mode 100644 index 02d86d4..0000000 --- a/tools/nnapi_quickcheck/tests/avg_pool_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) diff --git a/tools/nnapi_quickcheck/tests/avg_pool_quan_1.cpp b/tools/nnapi_quickcheck/tests/avg_pool_quan_1.cpp deleted file mode 100644 index 86f35f7..0000000 --- a/tools/nnapi_quickcheck/tests/avg_pool_quan_1.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_avg_pool_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "avg_pool_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = (IFM_H - KER_H) + 1; - const int32_t OFM_W = (IFM_W - KER_W) + 1; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Max Pooling Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = 1; - param->stride_height = 1; - param->filter_width = KER_W; - param->filter_height = KER_H; - param->activation = kTfLiteActNone; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_AVERAGE_POOL_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/avg_pool_quan_1.lst b/tools/nnapi_quickcheck/tests/avg_pool_quan_1.lst deleted file mode 100644 index 02d86d4..0000000 --- a/tools/nnapi_quickcheck/tests/avg_pool_quan_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) diff --git a/tools/nnapi_quickcheck/tests/cast_1.cpp b/tools/nnapi_quickcheck/tests/cast_1.cpp deleted file mode 100644 index 788cd57..0000000 --- a/tools/nnapi_quickcheck/tests/cast_1.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_cast_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "cast_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Cast Node - // Run CAST and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_CAST, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/cast_1.lst b/tools/nnapi_quickcheck/tests/cast_1.lst deleted file mode 100644 index a0077cb..0000000 --- a/tools/nnapi_quickcheck/tests/cast_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_H, 320) -INT_VALUE(IFM_W, 320) diff --git a/tools/nnapi_quickcheck/tests/cast_2.cpp b/tools/nnapi_quickcheck/tests/cast_2.cpp deleted file mode 100644 index a9e99ee..0000000 --- a/tools/nnapi_quickcheck/tests/cast_2.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_cast_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "cast_2.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteInt32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Cast Node - // Run CAST and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_CAST, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/cast_2.lst b/tools/nnapi_quickcheck/tests/cast_2.lst deleted file mode 100644 index a0077cb..0000000 --- a/tools/nnapi_quickcheck/tests/cast_2.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_H, 320) -INT_VALUE(IFM_W, 320) diff --git a/tools/nnapi_quickcheck/tests/cast_q_to_f_1.cpp b/tools/nnapi_quickcheck/tests/cast_q_to_f_1.cpp deleted file mode 100644 index 4af6c77..0000000 --- a/tools/nnapi_quickcheck/tests/cast_q_to_f_1.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_cast_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "cast_q_to_f_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Cast Node - // Run CAST and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_CAST, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/cast_q_to_f_1.lst b/tools/nnapi_quickcheck/tests/cast_q_to_f_1.lst deleted file mode 100644 index a0077cb..0000000 --- a/tools/nnapi_quickcheck/tests/cast_q_to_f_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_H, 320) -INT_VALUE(IFM_W, 320) diff --git a/tools/nnapi_quickcheck/tests/concat_1.cpp b/tools/nnapi_quickcheck/tests/concat_1.cpp deleted file mode 100644 index d2cb1aa..0000000 --- a/tools/nnapi_quickcheck/tests/concat_1.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_concat_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "concat_1.lst" -#undef INT_VALUE - - // TODO Allow users to set concat axis! 
- const int32_t CONCAT_COUNT = CONCAT_COUNT_Value(); - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - int32_t OFM_C = 0; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(CONCAT_COUNT); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Randomize IFM depth - std::default_random_engine generator(SEED); - std::uniform_int_distribution distribution(1, 8); - - std::vector depths; - - for (int32_t n = 0; n < CONCAT_COUNT; ++n) - { - const auto depth = distribution(generator); - - OFM_C += depth; - depths.emplace_back(depth); - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(depths.size() + 1); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM(s) - std::vector ifm_indexes; - - for (uint32_t n = 0; n < depths.size(); ++n) - { - const auto ifm_index = 1 + n; - const auto IFM_C = depths.at(n); - - interp.SetTensorParametersReadWrite(ifm_index, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - ifm_indexes.emplace_back(ifm_index); - } - - // Add Concat Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - param->axis = 3; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters(ifm_indexes, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_CONCATENATION, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs(ifm_indexes); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/concat_1.lst b/tools/nnapi_quickcheck/tests/concat_1.lst deleted file mode 100644 index db70d4c..0000000 --- a/tools/nnapi_quickcheck/tests/concat_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(CONCAT_COUNT, 3) - -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) diff --git a/tools/nnapi_quickcheck/tests/concat_quan_1.cpp b/tools/nnapi_quickcheck/tests/concat_quan_1.cpp deleted file mode 100644 index f861ac8..0000000 --- a/tools/nnapi_quickcheck/tests/concat_quan_1.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_concat_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "concat_quan_1.lst" -#undef INT_VALUE - - // TODO Allow users to set concat axis! - const int32_t CONCAT_COUNT = CONCAT_COUNT_Value(); - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - int32_t OFM_C = 0; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(CONCAT_COUNT); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Randomize IFM depth - std::default_random_engine generator(SEED); - std::uniform_int_distribution distribution(1, 8); - - std::vector depths; - - for (int32_t n = 0; n < CONCAT_COUNT; ++n) - { - const auto depth = distribution(generator); - - OFM_C += depth; - depths.emplace_back(depth); - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
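// Sketch of how the output depth follows from the depths randomized above when the
// inputs are concatenated along axis 3 (channels). The depth values below are
// invented for illustration; the test itself draws CONCAT_COUNT depths uniformly
// from [1, 8] at runtime.
std::vector<int32_t> example_depths{2, 5, 3}; // e.g. CONCAT_COUNT == 3
int32_t example_ofm_c = 0;
for (auto d : example_depths)
  example_ofm_c += d; // OFM_C == 2 + 5 + 3 == 10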
- TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(depths.size() + 1); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM(s) - std::vector ifm_indexes; - - for (uint32_t n = 0; n < depths.size(); ++n) - { - const auto ifm_index = 1 + n; - const auto IFM_C = depths.at(n); - - interp.SetTensorParametersReadWrite(ifm_index, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - ifm_indexes.emplace_back(ifm_index); - } - - // Add Concat Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - param->axis = 3; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters(ifm_indexes, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_CONCATENATION, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs(ifm_indexes); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/concat_quan_1.lst b/tools/nnapi_quickcheck/tests/concat_quan_1.lst deleted file mode 100644 index db70d4c..0000000 --- a/tools/nnapi_quickcheck/tests/concat_quan_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(CONCAT_COUNT, 3) - -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) diff --git a/tools/nnapi_quickcheck/tests/conv_1.cpp b/tools/nnapi_quickcheck/tests/conv_1.cpp deleted file mode 100644 index b5b145c..0000000 --- a/tools/nnapi_quickcheck/tests/conv_1.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_conv_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "conv_1.lst" -#undef INT_VALUE - - const int32_t STRIDE_H = STRIDE_H_Value(); - const int32_t STRIDE_W = STRIDE_W_Value(); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_N = KER_N_Value(); - const int32_t KER_C = IFM_C_Value(); - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = KER_N; - const int32_t OFM_H = (IFM_H - KER_H) / STRIDE_H + 1; - const int32_t OFM_W = (IFM_W - KER_W) / STRIDE_W + 1; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(STRIDE_H); - PRINT_VALUE(STRIDE_W); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_N); - PRINT_VALUE(KER_C); - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure Kernel Data - const uint32_t kernel_size = KER_N * KER_C * KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off++] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_N; - float bias_data[bias_size] = { - 0.0f, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = bias_dist(random); - } - } - - // Assumption on this example - assert(IFM_C == KER_C); - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
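// The output size computed earlier in this test follows the VALID-padding formula
//   OFM = (IFM - KER) / STRIDE + 1
// For the default conv_1.lst values (IFM_H = 3, IFM_W = 4, KER_H = 3, KER_W = 4,
// STRIDE_H = STRIDE_W = 1) this gives a 1x1 output plane:
static_assert((3 - 3) / 1 + 1 == 1, "OFM_H for the default conv_1.lst parameters");
static_assert((4 - 4) / 1 + 1 == 1, "OFM_W for the default conv_1.lst parameters");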
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(5); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! - interp.SetTensorParametersReadOnly( - 2, kTfLiteFloat32 /* type */, "filter" /* name */, {KER_N, KER_H, KER_W, KER_C} /* dims */, - quantization, reinterpret_cast(kernel_data), kernel_size * sizeof(float)); - - interp.SetTensorParametersReadOnly( - 3, kTfLiteFloat32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(float)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = STRIDE_W; - param->stride_height = STRIDE_H; - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_CONV_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/conv_1.lst b/tools/nnapi_quickcheck/tests/conv_1.lst deleted file mode 100644 index c01fc90..0000000 --- a/tools/nnapi_quickcheck/tests/conv_1.lst +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_N, 1) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(STRIDE_H, 1) -INT_VALUE(STRIDE_W, 1) diff --git a/tools/nnapi_quickcheck/tests/conv_quan_1.cpp b/tools/nnapi_quickcheck/tests/conv_quan_1.cpp deleted file mode 100644 index 2824547..0000000 --- a/tools/nnapi_quickcheck/tests/conv_quan_1.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_conv_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "conv_quan_1.lst" -#undef INT_VALUE - - const int32_t STRIDE_H = STRIDE_H_Value(); - const int32_t STRIDE_W = STRIDE_W_Value(); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_N = KER_N_Value(); - const int32_t KER_C = IFM_C_Value(); - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = KER_N; - const int32_t OFM_H = (IFM_H - KER_H) / STRIDE_H + 1; - const int32_t OFM_W = (IFM_W - KER_W) / STRIDE_W + 1; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(STRIDE_H); - PRINT_VALUE(STRIDE_W); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_N); - PRINT_VALUE(KER_C); - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure Kernel Data - const uint32_t kernel_size = KER_N * KER_C * KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off++] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_N; - int32_t bias_data[bias_size] = { - 0, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = static_cast(bias_dist(random)); - } - } - - // Assumption on this example - assert(IFM_C == KER_C); - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(5); - - // Configure OFM - float max_scale = (KER_N, KER_C * KER_H * KER_W) * - std::numeric_limits::max(); // * IFM_scale(1.0f) * kernel_scale(1.0f) - quantization.scale = max_scale; - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - quantization.scale = 1.0f; - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! - interp.SetTensorParametersReadOnly( - 2, kTfLiteUInt8 /* type */, "filter" /* name */, {KER_N, KER_H, KER_W, KER_C} /* dims */, - quantization, reinterpret_cast(kernel_data), kernel_size * sizeof(uint8_t)); - - quantization.scale *= quantization.scale; - interp.SetTensorParametersReadOnly( - 3, kTfLiteInt32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(int32_t)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = STRIDE_W; - param->stride_height = STRIDE_H; - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_CONV_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/conv_quan_1.lst b/tools/nnapi_quickcheck/tests/conv_quan_1.lst deleted file mode 100644 index c01fc90..0000000 --- a/tools/nnapi_quickcheck/tests/conv_quan_1.lst +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_N, 1) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(STRIDE_H, 1) -INT_VALUE(STRIDE_W, 1) diff --git a/tools/nnapi_quickcheck/tests/dconv_1.cpp b/tools/nnapi_quickcheck/tests/dconv_1.cpp deleted file mode 100644 index 36ec7a9..0000000 --- a/tools/nnapi_quickcheck/tests/dconv_1.cpp +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_dconv_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "dconv_1.lst" -#undef INT_VALUE - - const int32_t STRIDE_H = STRIDE_H_Value(); - const int32_t STRIDE_W = STRIDE_W_Value(); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_C = KER_C_Value(); - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = KER_C; - const int32_t OFM_H = (IFM_H - KER_H) / STRIDE_H + 1; - const int32_t OFM_W = (IFM_W - KER_W) / STRIDE_W + 1; - - const int32_t MULTIPLIER = MULTIPLIER_Value(); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_C); - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(STRIDE_H); - PRINT_VALUE(STRIDE_W); - PRINT_NEWLINE(); - - PRINT_VALUE(MULTIPLIER); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - assert(MULTIPLIER * IFM_C == KER_C); - - // Configure Kernel Data - const uint32_t kernel_size = KER_C * KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_C; - float bias_data[bias_size] = { - 0.0f, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = bias_dist(random); - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
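// The node-parameter struct allocated later in this setup must come from malloc,
// because AddNodeWithParameters takes ownership and releases it with free (see the
// NOTE below). A minimal helper with that behaviour could look like the sketch
// here; the actual make_alloc shipped in this tool's "memory.h" may differ in detail.
//
//   #include <cstdlib>
//   template <typename T> T *make_alloc(void) { return reinterpret_cast<T *>(std::malloc(sizeof(T))); }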
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(4); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! - interp.SetTensorParametersReadOnly( - 2, kTfLiteFloat32 /* type */, "filter" /* name */, {1, KER_H, KER_W, KER_C} /* dims */, - quantization, reinterpret_cast(kernel_data), kernel_size * sizeof(float)); - - interp.SetTensorParametersReadOnly( - 3, kTfLiteFloat32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(float)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = STRIDE_W; - param->stride_height = STRIDE_H; - param->depth_multiplier = MULTIPLIER; - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_DEPTHWISE_CONV_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/dconv_1.lst b/tools/nnapi_quickcheck/tests/dconv_1.lst deleted file mode 100644 index da851ae..0000000 --- a/tools/nnapi_quickcheck/tests/dconv_1.lst +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_C, 2) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(MULTIPLIER, 1) - -INT_VALUE(STRIDE_H, 1) -INT_VALUE(STRIDE_W, 1) diff --git a/tools/nnapi_quickcheck/tests/dconv_quan_1.cpp b/tools/nnapi_quickcheck/tests/dconv_quan_1.cpp deleted file mode 100644 index 8305ad1..0000000 --- a/tools/nnapi_quickcheck/tests/dconv_quan_1.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_dconv_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "dconv_quan_1.lst" -#undef INT_VALUE - - const int32_t STRIDE_H = STRIDE_H_Value(); - const int32_t STRIDE_W = STRIDE_W_Value(); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_C = KER_C_Value(); - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = KER_C; - const int32_t OFM_H = (IFM_H - KER_H) / STRIDE_H + 1; - const int32_t OFM_W = (IFM_W - KER_W) / STRIDE_W + 1; - - const int32_t MULTIPLIER = MULTIPLIER_Value(); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_C); - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(STRIDE_H); - PRINT_VALUE(STRIDE_W); - PRINT_NEWLINE(); - - PRINT_VALUE(MULTIPLIER); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - assert(MULTIPLIER * IFM_C == KER_C); - - // Configure Kernel Data - const uint32_t kernel_size = KER_C * KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_C; - int32_t bias_data[bias_size] = { - 0, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = static_cast(bias_dist(random)); - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(4); - - // Configure OFM - float max_scale = (1 * KER_C * KER_H * KER_W) * - std::numeric_limits::max(); // * IFM_scale(1.0f) * kernel_scale(1.0f) - quantization.scale = max_scale; - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - quantization.scale = 1.0f; - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! - interp.SetTensorParametersReadOnly( - 2, kTfLiteUInt8 /* type */, "filter" /* name */, {1, KER_H, KER_W, KER_C} /* dims */, - quantization, reinterpret_cast(kernel_data), kernel_size * sizeof(uint8_t)); - - quantization.scale *= quantization.scale; - interp.SetTensorParametersReadOnly( - 3, kTfLiteInt32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(int32_t)); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = kTfLitePaddingValid; - param->stride_width = STRIDE_W; - param->stride_height = STRIDE_H; - param->depth_multiplier = MULTIPLIER; - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_DEPTHWISE_CONV_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/dconv_quan_1.lst b/tools/nnapi_quickcheck/tests/dconv_quan_1.lst deleted file mode 100644 index da851ae..0000000 --- a/tools/nnapi_quickcheck/tests/dconv_quan_1.lst +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_C, 2) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(MULTIPLIER, 1) - -INT_VALUE(STRIDE_H, 1) -INT_VALUE(STRIDE_W, 1) diff --git a/tools/nnapi_quickcheck/tests/dequantize_1.cpp b/tools/nnapi_quickcheck/tests/dequantize_1.cpp deleted file mode 100644 index e725fa2..0000000 --- a/tools/nnapi_quickcheck/tests/dequantize_1.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_dequantize_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "dequantize_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add DEQUANTIZE Node - // Run DEQUANTIZE and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_DEQUANTIZE, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/dequantize_1.lst b/tools/nnapi_quickcheck/tests/dequantize_1.lst deleted file mode 100644 index a0077cb..0000000 --- a/tools/nnapi_quickcheck/tests/dequantize_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_H, 320) -INT_VALUE(IFM_W, 320) diff --git a/tools/nnapi_quickcheck/tests/div_1.cpp b/tools/nnapi_quickcheck/tests/div_1.cpp deleted file mode 100644 index 26dfbbe..0000000 --- a/tools/nnapi_quickcheck/tests/div_1.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_div_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "div_1.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
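// [Editor's illustration, not part of the original test] The 'context.h' comment
// above gives the asymmetric-quantization mapping. With assumed values scale = 0.5f
// and zero_point = 3, a stored quantized value of 7 decodes to:
//
//   float real_value = 0.5f * (7 - 3);   // == 2.0f, per real_value = scale * (quantized_value - zero_point)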
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Division Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Div and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_DIV, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/div_1.lst b/tools/nnapi_quickcheck/tests/div_1.lst deleted file mode 100644 index fa17cae..0000000 --- a/tools/nnapi_quickcheck/tests/div_1.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 16) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/div_2.cpp b/tools/nnapi_quickcheck/tests/div_2.cpp deleted file mode 100644 index df4efa4..0000000 --- a/tools/nnapi_quickcheck/tests/div_2.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_div_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "div_2.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT = RIGHT_Value(); - - const int32_t OFM_N = LEFT_N; - const int32_t OFM_C = LEFT_C; - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT} /* dims */, quantization); - - // Add Division Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Div and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_DIV, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/div_2.lst b/tools/nnapi_quickcheck/tests/div_2.lst deleted file mode 100644 index cd36ac1..0000000 --- a/tools/nnapi_quickcheck/tests/div_2.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT, 1) diff --git a/tools/nnapi_quickcheck/tests/fully_connected_1.cpp b/tools/nnapi_quickcheck/tests/fully_connected_1.cpp deleted file mode 100644 index 43cd0a4..0000000 --- a/tools/nnapi_quickcheck/tests/fully_connected_1.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -template T *make_malloc(void) { return reinterpret_cast(malloc(sizeof(T))); } - -TEST(NNAPI_Quickcheck_fully_connected_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "conv_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_N_Value(); - const int32_t KER_W = IFM_C_Value() * IFM_H_Value() * IFM_W_Value(); - - const int32_t OUT_LEN = KER_H; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OUT_LEN); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure Kernel Data - const uint32_t kernel_size = KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off++] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_H; - float bias_data[bias_size] = { - 0.0f, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = bias_dist(random); - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(4); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, KER_H} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! 
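// [Editor's sketch, assumptions noted] The NOTE above applies because read-only
// tensors reference the caller's buffer rather than copying it, so kernel_data and
// bias_data must stay alive for the interpreter's whole lifetime. One way to make
// that explicit (the std::vector below is an assumption, not the original code):
//
//   std::vector<float> weights(kernel_size, 0.0f);   // declared in the enclosing scope,
//                                                    // so it outlives 'interp'
//   interp.SetTensorParametersReadOnly(
//       2, kTfLiteFloat32, "filter", {KER_H, KER_W}, quantization,
//       reinterpret_cast<const char *>(weights.data()), weights.size() * sizeof(float));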
- interp.SetTensorParametersReadOnly( - 2, kTfLiteFloat32 /* type */, "filter" /* name */, {KER_H, KER_W} /* dims */, quantization, - reinterpret_cast(kernel_data), kernel_size * sizeof(float)); - - interp.SetTensorParametersReadOnly( - 3, kTfLiteFloat32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(float)); - - // Add Fully Connected Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_malloc(); - - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_FULLY_CONNECTED, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/fully_connected_1.lst b/tools/nnapi_quickcheck/tests/fully_connected_1.lst deleted file mode 100644 index 22acb9f..0000000 --- a/tools/nnapi_quickcheck/tests/fully_connected_1.lst +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_H, 1) diff --git a/tools/nnapi_quickcheck/tests/fully_connected_quan_1.cpp b/tools/nnapi_quickcheck/tests/fully_connected_quan_1.cpp deleted file mode 100644 index 2c68835..0000000 --- a/tools/nnapi_quickcheck/tests/fully_connected_quan_1.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -template T *make_malloc(void) { return reinterpret_cast(malloc(sizeof(T))); } - -TEST(NNAPI_Quickcheck_fully_connected_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "fully_connected_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = IFM_C_Value() * IFM_H_Value() * IFM_W_Value(); - - const int32_t OUT_LEN = KER_H; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OUT_LEN); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure Kernel Data - const uint32_t kernel_size = KER_H * KER_W; - float kernel_data[kernel_size] = { - 0.0f, - }; - - // Fill kernel data with random data - { - std::normal_distribution kernel_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < kernel_size; ++off) - { - kernel_data[off++] = kernel_dist(random); - } - } - - // Configure Bias Data - const auto bias_size = KER_H; - int32_t bias_data[bias_size] = { - 0, - }; - - // Fill bias data with random data - { - std::normal_distribution bias_dist(-1.0f, +1.0f); - - for (uint32_t off = 0; off < bias_size; ++off) - { - bias_data[off] = static_cast(bias_dist(random)); - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization = make_default_quantization(); - quantization.scale = FLOAT_NEAREST_TO_1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(4); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, KER_H} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // NOTE kernel_data & bias_data should live longer than interpreter! 
- interp.SetTensorParametersReadOnly( - 2, kTfLiteUInt8 /* type */, "filter" /* name */, {KER_H, KER_W} /* dims */, quantization, - reinterpret_cast(kernel_data), kernel_size * sizeof(uint8_t)); - - interp.SetTensorParametersReadOnly( - 3, kTfLiteInt32 /* type */, "bias" /* name */, {bias_size} /* dims */, quantization, - reinterpret_cast(bias_data), bias_size * sizeof(int32_t)); - - // Add Fully Connected Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_malloc(); - - param->activation = kTfLiteActRelu; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - // - Read Filter from Tensor #2, - // - Read Bias from Tensor #3 - interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_FULLY_CONNECTED, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/fully_connected_quan_1.lst b/tools/nnapi_quickcheck/tests/fully_connected_quan_1.lst deleted file mode 100644 index 22acb9f..0000000 --- a/tools/nnapi_quickcheck/tests/fully_connected_quan_1.lst +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_H, 1) diff --git a/tools/nnapi_quickcheck/tests/gather_1.cpp b/tools/nnapi_quickcheck/tests/gather_1.cpp deleted file mode 100644 index 4ab164e..0000000 --- a/tools/nnapi_quickcheck/tests/gather_1.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_gather_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "gather_1.lst" -#undef INT_VALUE - - const int32_t INPUT_DATA = INPUT_DATA_Value(); - const int32_t INDEX_DATA = INDEX_DATA_Value(); - - const int32_t OUTPUT_DATA = INDEX_DATA; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(INPUT_DATA); - PRINT_VALUE(INDEX_DATA); - PRINT_NEWLINE(); - - PRINT_VALUE(OUTPUT_DATA); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure INPUT_DATA - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "input" /* name */, - {INPUT_DATA} /* dims */, quantization); - - // Configure INDEX_DATA - interp.SetTensorParametersReadWrite(1, kTfLiteInt32 /* type */, "index" /* name */, - {INDEX_DATA} /* dims */, quantization); - - // Configure OUTPUT_VALUES - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "output_data" /* name */, - {OUTPUT_DATA} /* dims */, quantization); - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteGatherParams))); - - param->axis = 0; - - // Add GATHER Node - // Run GATHER and store its result into Tensor #2 - // - Read input data and index_data from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, {2}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_GATHER, 1)); - - // Set Tensor #0 and #1 as Input, and Tensor #2 as Output - interp.SetInputs({0, 1}); - interp.SetOutputs({2}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/gather_1.lst b/tools/nnapi_quickcheck/tests/gather_1.lst deleted file mode 100644 index 923a056..0000000 --- a/tools/nnapi_quickcheck/tests/gather_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(INPUT_DATA, 8192) -INT_VALUE(INDEX_DATA, 300) diff --git 
a/tools/nnapi_quickcheck/tests/gather_2.cpp b/tools/nnapi_quickcheck/tests/gather_2.cpp deleted file mode 100644 index ac9ec8b..0000000 --- a/tools/nnapi_quickcheck/tests/gather_2.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_gather_2, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "gather_2.lst" -#undef INT_VALUE - - const int32_t INPUT_DATA_H = INPUT_DATA_H_Value(); - const int32_t INPUT_DATA_W = INPUT_DATA_W_Value(); - const int32_t INDEX_DATA = INDEX_DATA_Value(); - - const int32_t OUTPUT_DATA_H = INPUT_DATA_H; - const int32_t OUTPUT_DATA_W = INDEX_DATA; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(INPUT_DATA_H); - PRINT_VALUE(INPUT_DATA_W); - PRINT_VALUE(INDEX_DATA); - PRINT_NEWLINE(); - - PRINT_VALUE(OUTPUT_DATA_H); - PRINT_VALUE(OUTPUT_DATA_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure INPUT_DATA - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "input" /* name */, - {INPUT_DATA_H, INPUT_DATA_W} /* dims */, quantization); - - // Configure INDEX_DATA - interp.SetTensorParametersReadWrite(1, kTfLiteInt32 /* type */, "index" /* name */, - {INDEX_DATA} /* dims */, quantization); - - // Configure OUTPUT_VALUES - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "output_data" /* name */, - {OUTPUT_DATA_H, OUTPUT_DATA_W} /* dims */, quantization); - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteGatherParams))); - - param->axis = 0; - - // Add GATHER Node - // Run GATHER and store its result into Tensor #2 - // - Read input data and index_data from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, {2}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_GATHER, 1)); - - // Set Tensor #0 and #1 as Input, and Tensor #2 as Output - interp.SetInputs({0, 1}); - interp.SetOutputs({2}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/gather_2.lst b/tools/nnapi_quickcheck/tests/gather_2.lst deleted file mode 100644 index 5bf6bd3..0000000 --- a/tools/nnapi_quickcheck/tests/gather_2.lst +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(INPUT_DATA_H, 128192) -INT_VALUE(INPUT_DATA_W, 4) -INT_VALUE(INDEX_DATA, 300) diff --git a/tools/nnapi_quickcheck/tests/logistic_quan_1.cpp b/tools/nnapi_quickcheck/tests/logistic_quan_1.cpp deleted file mode 100644 index 0b0a690..0000000 --- a/tools/nnapi_quickcheck/tests/logistic_quan_1.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_logistic_quan_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "logistic_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams in_quantization; - in_quantization.scale = 0.5f; - in_quantization.zero_point = 0; - - TfLiteQuantizationParams out_quantization; - out_quantization.scale = 1.f / 256; - out_quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, out_quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, in_quantization); - - // Add Logistic Node - // Run Logistic and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_LOGISTIC, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/logistic_quan_1.lst b/tools/nnapi_quickcheck/tests/logistic_quan_1.lst deleted file mode 100644 index 9b3d8eb..0000000 --- a/tools/nnapi_quickcheck/tests/logistic_quan_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 1) -INT_VALUE(IFM_H, 2) -INT_VALUE(IFM_W, 2) diff --git a/tools/nnapi_quickcheck/tests/max_pool_1.cpp b/tools/nnapi_quickcheck/tests/max_pool_1.cpp deleted file mode 100644 index 62f985d..0000000 --- a/tools/nnapi_quickcheck/tests/max_pool_1.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_max_pool_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "max_pool_1.lst" -#undef INT_VALUE - - const TfLitePadding PADDING_TYPE = static_cast(PADDING_TYPE_Value()); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = OFM_H_Value(); - const int32_t OFM_W = OFM_W_Value(); - - assert((OFM_H >= (IFM_H - KER_H))); - assert((OFM_W >= (IFM_W - KER_W))); - assert((kTfLitePaddingSame == PADDING_TYPE) || (kTfLitePaddingValid == PADDING_TYPE)); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(PADDING_TYPE); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
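// [Editor's sketch; the helper shown is assumed, not verified against memory.h]
// The pooling parameters built below are allocated with malloc because
// AddNodeWithParameters takes ownership of the block and releases it with free().
// A minimal malloc-based helper in that spirit:
//
//   template <typename T> T *make_alloc() { return reinterpret_cast<T *>(malloc(sizeof(T))); }
//
//   auto *pool = make_alloc<TfLitePoolParams>();   // later freed by the interpreter, not by the test
//   pool->padding = kTfLitePaddingValid;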
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Max Pooling Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = PADDING_TYPE; - param->stride_width = 1; - param->stride_height = 1; - param->filter_width = KER_W; - param->filter_height = KER_H; - param->activation = kTfLiteActNone; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_MAX_POOL_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/max_pool_1.lst b/tools/nnapi_quickcheck/tests/max_pool_1.lst deleted file mode 100644 index 4b5c130..0000000 --- a/tools/nnapi_quickcheck/tests/max_pool_1.lst +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_N, 1) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(OFM_H, 1) -INT_VALUE(OFM_W, 1) - -// Default is kTfLitePaddingValid (= 2) -INT_VALUE(PADDING_TYPE, 2) diff --git a/tools/nnapi_quickcheck/tests/max_pool_quan_1.cpp b/tools/nnapi_quickcheck/tests/max_pool_quan_1.cpp deleted file mode 100644 index 2c05a7d..0000000 --- a/tools/nnapi_quickcheck/tests/max_pool_quan_1.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_max_pool_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "max_pool_quan_1.lst" -#undef INT_VALUE - - const TfLitePadding PADDING_TYPE = static_cast(PADDING_TYPE_Value()); - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t KER_H = KER_H_Value(); - const int32_t KER_W = KER_W_Value(); - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = OFM_H_Value(); - const int32_t OFM_W = OFM_W_Value(); - - assert((OFM_H >= (IFM_H - KER_H))); - assert((OFM_W >= (IFM_W - KER_W))); - assert((kTfLitePaddingSame == PADDING_TYPE) || (kTfLitePaddingValid == PADDING_TYPE)); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(PADDING_TYPE); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(KER_H); - PRINT_VALUE(KER_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Max Pooling Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->padding = PADDING_TYPE; - param->stride_width = 1; - param->stride_height = 1; - param->filter_width = KER_W; - param->filter_height = KER_H; - param->activation = kTfLiteActNone; - - // Run Convolution and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_MAX_POOL_2D, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/max_pool_quan_1.lst b/tools/nnapi_quickcheck/tests/max_pool_quan_1.lst deleted file mode 100644 index 4b5c130..0000000 --- a/tools/nnapi_quickcheck/tests/max_pool_quan_1.lst +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(KER_N, 1) -INT_VALUE(KER_H, 3) -INT_VALUE(KER_W, 4) - -INT_VALUE(OFM_H, 1) -INT_VALUE(OFM_W, 1) - -// Default is kTfLitePaddingValid (= 2) -INT_VALUE(PADDING_TYPE, 2) diff --git a/tools/nnapi_quickcheck/tests/mul_1.cpp b/tools/nnapi_quickcheck/tests/mul_1.cpp deleted file mode 100644 index 57ab713..0000000 --- a/tools/nnapi_quickcheck/tests/mul_1.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_mul_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "mul_1.lst" -#undef INT_VALUE - - const int32_t LEFT_1D = LEFT_1D_Value(); - const int32_t LEFT_2D = LEFT_2D_Value(); - const int32_t LEFT_3D = LEFT_3D_Value(); - - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_1D = LEFT_1D_Value(); - const int32_t OFM_2D = LEFT_2D_Value(); - const int32_t OFM_3D = LEFT_3D_Value(); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_1D); - PRINT_VALUE(LEFT_2D); - PRINT_VALUE(LEFT_3D); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_1D); - PRINT_VALUE(OFM_2D); - PRINT_VALUE(OFM_3D); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
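// [Editor's illustration only] With the shapes from mul_1.lst (left {3, 1, 4},
// right {4}), MUL broadcasts the right-hand vector across the leading dimensions;
// conceptually the reference computation is:
//
//   for (int i = 0; i < 3; ++i)
//     for (int j = 0; j < 1; ++j)
//       for (int k = 0; k < 4; ++k)
//         out[i][j][k] = left[i][j][k] * right[k];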
- TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_1D, OFM_2D, OFM_3D} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_1D, LEFT_2D, LEFT_3D} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_W} /* dims */, quantization); - - // Add MUL Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run MUL and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_MUL, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - param.tensor_logging = 1; - param.log_path = "report/tensor_mul_1.log"; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/mul_1.lst b/tools/nnapi_quickcheck/tests/mul_1.lst deleted file mode 100644 index 1d42159..0000000 --- a/tools/nnapi_quickcheck/tests/mul_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -// (3, 1, 4) -INT_VALUE(LEFT_1D, 3) -INT_VALUE(LEFT_2D, 1) -INT_VALUE(LEFT_3D, 4) - -INT_VALUE(RIGHT_W, 4) diff --git a/tools/nnapi_quickcheck/tests/mul_2.cpp b/tools/nnapi_quickcheck/tests/mul_2.cpp deleted file mode 100644 index a692616..0000000 --- a/tools/nnapi_quickcheck/tests/mul_2.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_mul_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "mul_2.lst" -#undef INT_VALUE - - const int32_t LEFT_D1 = LEFT_D1_Value(); - const int32_t LEFT_D2 = LEFT_D2_Value(); - const int32_t LEFT_D3 = LEFT_D3_Value(); - - const int32_t RIGHT_D1 = RIGHT_D1_Value(); - - const int32_t OFM_D1 = LEFT_D1; - const int32_t OFM_D2 = LEFT_D2; - const int32_t OFM_D3 = LEFT_D3; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_D1); - PRINT_VALUE(LEFT_D2); - PRINT_VALUE(LEFT_D3); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_D1); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_D1); - PRINT_VALUE(OFM_D2); - PRINT_VALUE(OFM_D3); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization; - - quantization.scale = 1; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_D1, OFM_D2, OFM_D3} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_D1, LEFT_D2, LEFT_D3} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_D1} /* dims */, quantization); - - // Add Convolution Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Add and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Left from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_MUL, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/mul_2.lst b/tools/nnapi_quickcheck/tests/mul_2.lst deleted file mode 100644 index da53e7e..0000000 --- a/tools/nnapi_quickcheck/tests/mul_2.lst +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_D1, 5) -INT_VALUE(LEFT_D2, 3) -INT_VALUE(LEFT_D3, 12) - -INT_VALUE(RIGHT_D1, 12) diff --git a/tools/nnapi_quickcheck/tests/mul_quan_1.cpp b/tools/nnapi_quickcheck/tests/mul_quan_1.cpp deleted file mode 100644 index 5f0061e..0000000 --- a/tools/nnapi_quickcheck/tests/mul_quan_1.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_mul_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "mul_1.lst" -#undef INT_VALUE - - const int32_t LEFT_1D = LEFT_1D_Value(); - const int32_t LEFT_2D = LEFT_2D_Value(); - const int32_t LEFT_3D = LEFT_3D_Value(); - - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_1D = LEFT_1D_Value(); - const int32_t OFM_2D = LEFT_2D_Value(); - const int32_t OFM_3D = LEFT_3D_Value(); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_1D); - PRINT_VALUE(LEFT_2D); - PRINT_VALUE(LEFT_3D); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_1D); - PRINT_VALUE(OFM_2D); - PRINT_VALUE(OFM_3D); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - float max_scale = - std::numeric_limits::max(); // * input1_scale(1.0f) * input2_scale(1.0f) - quantization.scale = max_scale; - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_1D, OFM_2D, OFM_3D} /* dims */, quantization); - - // Configure input(s) - quantization.scale = 1.0f; - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "left" /* name */, - {LEFT_1D, LEFT_2D, LEFT_3D} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteUInt8 /* type */, "right" /* name */, - {RIGHT_W} /* dims */, quantization); - - // Add MUL Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run MUL and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_MUL, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/mul_quan_1.lst b/tools/nnapi_quickcheck/tests/mul_quan_1.lst deleted file mode 100644 index d850f37..0000000 --- a/tools/nnapi_quickcheck/tests/mul_quan_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -// (300, 1, 4) -INT_VALUE(LEFT_1D, 300) -INT_VALUE(LEFT_2D, 1) -INT_VALUE(LEFT_3D, 4) - -INT_VALUE(RIGHT_W, 4) diff --git a/tools/nnapi_quickcheck/tests/relu1_1.cpp b/tools/nnapi_quickcheck/tests/relu1_1.cpp deleted file mode 100644 index 25e71dc..0000000 --- a/tools/nnapi_quickcheck/tests/relu1_1.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -int main(int argc, char **argv) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu1_1.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU_N1_TO_1, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - return RandomTestRunner{SEED, param}.run(builder); -} diff --git a/tools/nnapi_quickcheck/tests/relu1_1.lst b/tools/nnapi_quickcheck/tests/relu1_1.lst deleted file mode 100644 index 4f61845..0000000 --- a/tools/nnapi_quickcheck/tests/relu1_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) diff --git a/tools/nnapi_quickcheck/tests/relu6_1.cpp b/tools/nnapi_quickcheck/tests/relu6_1.cpp deleted file mode 100644 index 43e8383..0000000 --- a/tools/nnapi_quickcheck/tests/relu6_1.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_relu6_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu6_1.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU6, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/relu6_1.lst b/tools/nnapi_quickcheck/tests/relu6_1.lst deleted file mode 100644 index 4f61845..0000000 --- a/tools/nnapi_quickcheck/tests/relu6_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) diff --git a/tools/nnapi_quickcheck/tests/relu6_quan_1.cpp b/tools/nnapi_quickcheck/tests/relu6_quan_1.cpp deleted file mode 100644 index 8356442..0000000 --- a/tools/nnapi_quickcheck/tests/relu6_quan_1.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -int main(int argc, char **argv) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu6_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU6, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - return RandomTestRunner{SEED, param}.run(builder); -} diff --git a/tools/nnapi_quickcheck/tests/relu6_quan_1.lst b/tools/nnapi_quickcheck/tests/relu6_quan_1.lst deleted file mode 100644 index 4f61845..0000000 --- a/tools/nnapi_quickcheck/tests/relu6_quan_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) diff --git a/tools/nnapi_quickcheck/tests/relu_1.cpp b/tools/nnapi_quickcheck/tests/relu_1.cpp deleted file mode 100644 index decd0dd..0000000 --- a/tools/nnapi_quickcheck/tests/relu_1.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_relu_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu_1.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/relu_1.lst b/tools/nnapi_quickcheck/tests/relu_1.lst deleted file mode 100644 index 4f61845..0000000 --- a/tools/nnapi_quickcheck/tests/relu_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) diff --git a/tools/nnapi_quickcheck/tests/relu_2.cpp b/tools/nnapi_quickcheck/tests/relu_2.cpp deleted file mode 100644 index ccb9f06..0000000 --- a/tools/nnapi_quickcheck/tests/relu_2.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_relu_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu_2.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/relu_2.lst b/tools/nnapi_quickcheck/tests/relu_2.lst deleted file mode 100644 index 343bff8..0000000 --- a/tools/nnapi_quickcheck/tests/relu_2.lst +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) -INT_VALUE(IFM_C, 3) diff --git a/tools/nnapi_quickcheck/tests/relu_3.cpp b/tools/nnapi_quickcheck/tests/relu_3.cpp deleted file mode 100644 index 59a8560..0000000 --- a/tools/nnapi_quickcheck/tests/relu_3.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_relu_3, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu_3.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/relu_3.lst b/tools/nnapi_quickcheck/tests/relu_3.lst deleted file mode 100644 index a3a405c..0000000 --- a/tools/nnapi_quickcheck/tests/relu_3.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_N, 1) diff --git a/tools/nnapi_quickcheck/tests/relu_quan_1.cpp b/tools/nnapi_quickcheck/tests/relu_quan_1.cpp deleted file mode 100644 index 303080e..0000000 --- a/tools/nnapi_quickcheck/tests/relu_quan_1.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -int main(int argc, char **argv) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "relu_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Add ReLU Node - // Run ReLU and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_RELU, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - return RandomTestRunner{SEED, param}.run(builder); -} diff --git a/tools/nnapi_quickcheck/tests/relu_quan_1.lst b/tools/nnapi_quickcheck/tests/relu_quan_1.lst deleted file mode 100644 index 4f61845..0000000 --- a/tools/nnapi_quickcheck/tests/relu_quan_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 16) -INT_VALUE(IFM_W, 16) diff --git a/tools/nnapi_quickcheck/tests/reshape_1.cpp b/tools/nnapi_quickcheck/tests/reshape_1.cpp deleted file mode 100644 index 54cfce2..0000000 --- a/tools/nnapi_quickcheck/tests/reshape_1.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_reshape_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "max_pool_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OUT_L = IFM_C * IFM_H * IFM_W; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OUT_L); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t dims[2] = {1, OUT_L}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - // A: This may be necessary, because quantization values(scale, zero_point) of TENSOR_INT32 and - // TENSOR_QUANT8_ASYMM are passed on to the runtime. 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OUT_L} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Configure Shape - interp.SetTensorParametersReadOnly(2, kTfLiteInt32 /* type */, "shape" /* name */, - {2} /* dims */, quantization, - reinterpret_cast(dims), 2 * sizeof(int32_t)); - - // Add Reshape Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->num_dimensions = 2; - param->shape[0] = 1; - param->shape[1] = OUT_L; - - // Run Reshapeand store its result into Tensor #0 - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_RESHAPE, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/reshape_1.lst b/tools/nnapi_quickcheck/tests/reshape_1.lst deleted file mode 100644 index fcaaff0..0000000 --- a/tools/nnapi_quickcheck/tests/reshape_1.lst +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 4) -INT_VALUE(IFM_W, 8) diff --git a/tools/nnapi_quickcheck/tests/reshape_quan_1.cpp b/tools/nnapi_quickcheck/tests/reshape_quan_1.cpp deleted file mode 100644 index 8eb0bf3..0000000 --- a/tools/nnapi_quickcheck/tests/reshape_quan_1.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_reshape_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "reshape_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OUT_L = IFM_C * IFM_H * IFM_W; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OUT_L); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t dims[2] = {1, OUT_L}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - // A: This may be necessary, because quantization values(scale, zero_point) of TENSOR_INT32 and - // TENSOR_QUANT8_ASYMM are passed on to the runtime. 
- TfLiteQuantizationParams quantization; - quantization.scale = 1.0f; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1 /*N*/, OUT_L} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Configure Shape - interp.SetTensorParametersReadOnly(2, kTfLiteInt32 /* type */, "shape" /* name */, - {2} /* dims */, quantization, - reinterpret_cast(dims), 2 * sizeof(int32_t)); - - // Add Reshape Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->num_dimensions = 2; - param->shape[0] = 1; - param->shape[1] = OUT_L; - - // Run Reshapeand store its result into Tensor #0 - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_RESHAPE, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/reshape_quan_1.lst b/tools/nnapi_quickcheck/tests/reshape_quan_1.lst deleted file mode 100644 index fcaaff0..0000000 --- a/tools/nnapi_quickcheck/tests/reshape_quan_1.lst +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 4) -INT_VALUE(IFM_W, 8) diff --git a/tools/nnapi_quickcheck/tests/resize_bilinear_1.cpp b/tools/nnapi_quickcheck/tests/resize_bilinear_1.cpp deleted file mode 100644 index 5b2d7b6..0000000 --- a/tools/nnapi_quickcheck/tests/resize_bilinear_1.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_resize_bilinear_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "resize_bilinear_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = OFM_H_Value(); - const int32_t OFM_W = OFM_W_Value(); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - int32_t size_data[2] = {OFM_H, OFM_W}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - // A: This may be necessary, because quantization values(scale, zero_point) of TENSOR_INT32 and - // TENSOR_QUANT8_ASYMM are passed on to the runtime. - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure OFM - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure IFM - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Configure Size - interp.SetTensorParametersReadOnly( - 2, kTfLiteInt32 /* type */, "size" /* name */, {2} /* dims */, quantization, - reinterpret_cast(size_data), 2 * sizeof(int32_t)); - - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - // NOTE What is this? 
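// (Not in the original source -- roughly, align_corners selects the coordinate
// mapping used by bilinear resize: when true, the corner pixels of the input
// and output grids are aligned exactly; when false, coordinates are scaled by
// the plain output/input size ratio. The test below simply sets it to false.)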
- param->align_corners = false; - - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_RESIZE_BILINEAR, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/resize_bilinear_1.lst b/tools/nnapi_quickcheck/tests/resize_bilinear_1.lst deleted file mode 100644 index cc3dbd5..0000000 --- a/tools/nnapi_quickcheck/tests/resize_bilinear_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_C, 2) -INT_VALUE(IFM_H, 3) -INT_VALUE(IFM_W, 4) - -INT_VALUE(OFM_H, 30) -INT_VALUE(OFM_W, 40) diff --git a/tools/nnapi_quickcheck/tests/softmax_1.cpp b/tools/nnapi_quickcheck/tests/softmax_1.cpp deleted file mode 100644 index 7142475..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_1.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_softmax_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "softmax_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = 1; - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - const nnfw::misc::feature::Shape ifm_shape{IFM_C, IFM_H, IFM_W}; - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1, IFM_H * IFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1, IFM_H * IFM_W} /* batch_size, input_size */, - quantization); - - // Add Softmax Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->beta = 1.0; - - // Run Softmax and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SOFTMAX, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/softmax_1.lst b/tools/nnapi_quickcheck/tests/softmax_1.lst deleted file mode 100644 index 1ef9da0..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 2) -INT_VALUE(IFM_W, 2) diff --git a/tools/nnapi_quickcheck/tests/softmax_2.cpp b/tools/nnapi_quickcheck/tests/softmax_2.cpp deleted file mode 100644 index df1ff27..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_2.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_softmax_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define FLOAT_VALUE(NAME, VALUE) FloatVar NAME##_Value(#NAME, VALUE); -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "softmax_2.lst" -#undef INT_VALUE -#undef FLOAT_VALUE - - const int32_t IFM_C = 1; - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - const float BETA = BETA_Value(); - -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_VALUE(BETA); - PRINT_NEWLINE(); - -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - const nnfw::misc::feature::Shape ifm_shape{IFM_C, IFM_H, IFM_W}; - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {1, IFM_H * IFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {1, IFM_H * IFM_W} /* batch_size, input_size */, - quantization); - - // Add Softmax Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->beta = BETA; - - // Run Softmax and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SOFTMAX, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/softmax_2.lst b/tools/nnapi_quickcheck/tests/softmax_2.lst deleted file mode 100644 index 1c381bf..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_2.lst +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -#ifndef FLOAT_VALUE -#error "FLOAT_VALUE should be defined" -#endif // FLOAT_VALUE - -INT_VALUE(IFM_H, 2) -INT_VALUE(IFM_W, 2) -FLOAT_VALUE(BETA, 0.1) diff --git a/tools/nnapi_quickcheck/tests/softmax_quan_1.cpp b/tools/nnapi_quickcheck/tests/softmax_quan_1.cpp deleted file mode 100644 index 5d38f77..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_quan_1.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_softmax_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "softmax_quan_1.lst" -#undef INT_VALUE - - const int32_t IFM_C = 1; - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - const nnfw::misc::feature::Shape ifm_shape{IFM_C, IFM_H, IFM_W}; - - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - TfLiteQuantizationParams quantization; - quantization.scale = 1.0f / 256; - quantization.zero_point = 0; - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure Output Tensor - interp.SetTensorParametersReadWrite(0, kTfLiteUInt8 /* type */, "output" /* name */, - {1, IFM_H * IFM_W} /* dims */, quantization); - - // Configure Input Tensor - interp.SetTensorParametersReadWrite(1, kTfLiteUInt8 /* type */, "input" /* name */, - {1, IFM_H * IFM_W} /* batch_size, input_size */, - quantization); - - // Add Softmax Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->beta = 1.0; - - // Run Softmax and store its result into Tensor #0 - // - Read IFM from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SOFTMAX, 1)); - - // Set Tensor #1 as Input #0, and Tensor #0 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/softmax_quan_1.lst b/tools/nnapi_quickcheck/tests/softmax_quan_1.lst deleted file mode 100644 index 1ef9da0..0000000 --- a/tools/nnapi_quickcheck/tests/softmax_quan_1.lst +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 2) -INT_VALUE(IFM_W, 2) diff --git a/tools/nnapi_quickcheck/tests/split_1.cpp b/tools/nnapi_quickcheck/tests/split_1.cpp deleted file mode 100644 index 95a7aa8..0000000 --- a/tools/nnapi_quickcheck/tests/split_1.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2018 
Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_split_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "split_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - const int32_t NUM_SPLIT = NUM_SPLIT_Value(); - const int32_t AXIS = AXIS_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_VALUE(NUM_SPLIT); - PRINT_VALUE(AXIS); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - const int32_t axis[1] = {AXIS}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(NUM_SPLIT + 2); - - // Configure Input Tensor(s) - interp.SetTensorParametersReadOnly(0, kTfLiteInt32 /* type */, "axis" /* name */, - {1} /* dims */, quantization, - reinterpret_cast(axis), 1 * sizeof(int32_t)); - - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Configure Output Tensor - std::vector ofm_indexes; - - for (uint32_t n = 0; n < NUM_SPLIT; ++n) - { - const auto ofm_index = 2 + n; - - interp.SetTensorParametersReadWrite(ofm_index, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - ofm_indexes.emplace_back(ofm_index); - } - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteSplitParams))); - - param->num_splits = NUM_SPLIT; - - // Add SPLIT Node - // Run SPLIT and store its result into Tensor #0 - // - Read axis and IFM from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, ofm_indexes, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SPLIT, 1)); - - // Set Tensor #1 as Input #0, and Tensor #2 ~ #NUM_SPLIT+1 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs(ofm_indexes); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/split_1.lst b/tools/nnapi_quickcheck/tests/split_1.lst deleted file mode 100644 index 823bf24..0000000 --- a/tools/nnapi_quickcheck/tests/split_1.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 1) -INT_VALUE(IFM_H, 5) -INT_VALUE(IFM_W, 30) -INT_VALUE(NUM_SPLIT, 5) -INT_VALUE(AXIS, 1) diff --git a/tools/nnapi_quickcheck/tests/split_2.cpp b/tools/nnapi_quickcheck/tests/split_2.cpp deleted file mode 100644 index eb06ea0..0000000 --- a/tools/nnapi_quickcheck/tests/split_2.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_split_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "split_2.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - const int32_t NUM_SPLIT = NUM_SPLIT_Value(); - const int32_t AXIS = AXIS_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_VALUE(NUM_SPLIT); - PRINT_VALUE(AXIS); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - const int32_t axis[1] = {AXIS}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(NUM_SPLIT + 2); - - // Configure Input Tensor(s) - interp.SetTensorParametersReadOnly(0, kTfLiteInt32 /* type */, "axis" /* name */, - {1} /* dims */, quantization, - reinterpret_cast(axis), 1 * sizeof(int32_t)); - - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Configure Output Tensor - std::vector ofm_indexes; - - for (uint32_t n = 0; n < NUM_SPLIT; ++n) - { - const auto ofm_index = 2 + n; - - interp.SetTensorParametersReadWrite(ofm_index, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - ofm_indexes.emplace_back(ofm_index); - } - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteSplitParams))); - - param->num_splits = NUM_SPLIT; - - // Add SPLIT Node - // Run SPLIT and store its result into Tensor #0 - // - Read axis and IFM from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, ofm_indexes, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SPLIT, 1)); - - // Set Tensor #1 as Input #0, and Tensor #2 ~ #NUM_SPLIT+1 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs(ofm_indexes); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/split_2.lst b/tools/nnapi_quickcheck/tests/split_2.lst deleted file mode 100644 index ebfbab2..0000000 --- a/tools/nnapi_quickcheck/tests/split_2.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 1) -INT_VALUE(IFM_H, 5) -INT_VALUE(IFM_W, 30) -INT_VALUE(NUM_SPLIT, 3) -INT_VALUE(AXIS, 2) diff --git a/tools/nnapi_quickcheck/tests/split_3.cpp b/tools/nnapi_quickcheck/tests/split_3.cpp deleted file mode 100644 index e3beb5b..0000000 --- a/tools/nnapi_quickcheck/tests/split_3.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_split_3, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "split_3.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - const int32_t NUM_SPLIT = NUM_SPLIT_Value(); - const int32_t AXIS = AXIS_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_VALUE(NUM_SPLIT); - PRINT_VALUE(AXIS); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - const int32_t axis[1] = {AXIS}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(NUM_SPLIT + 2); - - // Configure Input Tensor(s) - interp.SetTensorParametersReadOnly(0, kTfLiteInt32 /* type */, "axis" /* name */, - {1} /* dims */, quantization, - reinterpret_cast(axis), 1 * sizeof(int32_t)); - - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Configure Output Tensor - std::vector ofm_indexes; - - for (uint32_t n = 0; n < NUM_SPLIT; ++n) - { - const auto ofm_index = 2 + n; - - interp.SetTensorParametersReadWrite(ofm_index, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - ofm_indexes.emplace_back(ofm_index); - } - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteSplitParams))); - - param->num_splits = NUM_SPLIT; - - // Add SPLIT Node - // Run SPLIT and store its result into Tensor #0 - // - Read axis and IFM from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, ofm_indexes, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SPLIT, 1)); - - // Set Tensor #1 as Input #0, and Tensor #2 ~ #NUM_SPLIT+1 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs(ofm_indexes); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/split_3.lst b/tools/nnapi_quickcheck/tests/split_3.lst deleted file mode 100644 index 300bb02..0000000 --- a/tools/nnapi_quickcheck/tests/split_3.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 5) -INT_VALUE(IFM_W, 30) -INT_VALUE(NUM_SPLIT, 3) -INT_VALUE(AXIS, 1) diff --git a/tools/nnapi_quickcheck/tests/split_4.cpp b/tools/nnapi_quickcheck/tests/split_4.cpp deleted file mode 100644 index e098973..0000000 --- a/tools/nnapi_quickcheck/tests/split_4.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" -#include "misc/feature/Shape.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_split_4, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "split_4.lst" -#undef INT_VALUE - - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - const int32_t NUM_SPLIT = NUM_SPLIT_Value(); - const int32_t AXIS = AXIS_Value(); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_VALUE(NUM_SPLIT); - PRINT_VALUE(AXIS); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - const int32_t axis[1] = {AXIS}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(NUM_SPLIT + 2); - - // Configure Input Tensor(s) - interp.SetTensorParametersReadOnly(0, kTfLiteInt32 /* type */, "axis" /* name */, - {1} /* dims */, quantization, - reinterpret_cast(axis), 1 * sizeof(int32_t)); - - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_H, IFM_W} /* dims */, quantization); - - // Configure Output Tensor - std::vector ofm_indexes; - - for (uint32_t n = 0; n < NUM_SPLIT; ++n) - { - const auto ofm_index = 2 + n; - - interp.SetTensorParametersReadWrite(ofm_index, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - ofm_indexes.emplace_back(ofm_index); - } - - auto *param = reinterpret_cast(malloc(sizeof(TfLiteSplitParams))); - - param->num_splits = NUM_SPLIT; - - // Add SPLIT Node - // Run SPLIT and store its result into Tensor #0 - // - Read axis and IFM from Tensor #0 and #1, respectively - interp.AddNodeWithParameters({0, 1}, ofm_indexes, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SPLIT, 1)); - - // Set Tensor #1 as Input #0, and Tensor #2 ~ #NUM_SPLIT+1 as Output #0 - interp.SetInputs({1}); - interp.SetOutputs(ofm_indexes); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/split_4.lst b/tools/nnapi_quickcheck/tests/split_4.lst deleted file mode 100644 index 5b28828..0000000 --- a/tools/nnapi_quickcheck/tests/split_4.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_H, 5) -INT_VALUE(IFM_W, 30) -INT_VALUE(NUM_SPLIT, 5) -INT_VALUE(AXIS, 0) diff --git a/tools/nnapi_quickcheck/tests/sub_1.cpp b/tools/nnapi_quickcheck/tests/sub_1.cpp deleted file mode 100644 index 8bc4208..0000000 --- a/tools/nnapi_quickcheck/tests/sub_1.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_1.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_1.lst b/tools/nnapi_quickcheck/tests/sub_1.lst deleted file mode 100644 index fa17cae..0000000 --- a/tools/nnapi_quickcheck/tests/sub_1.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 16) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/sub_2.cpp b/tools/nnapi_quickcheck/tests/sub_2.cpp deleted file mode 100644 index 423e105..0000000 --- a/tools/nnapi_quickcheck/tests/sub_2.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_2, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_2.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT = RIGHT_Value(); - - const int32_t OFM_N = LEFT_N; - const int32_t OFM_C = LEFT_C; - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT} /* dims */, quantization); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_2.lst b/tools/nnapi_quickcheck/tests/sub_2.lst deleted file mode 100644 index cd36ac1..0000000 --- a/tools/nnapi_quickcheck/tests/sub_2.lst +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 16) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT, 1) diff --git a/tools/nnapi_quickcheck/tests/sub_3.cpp b/tools/nnapi_quickcheck/tests/sub_3.cpp deleted file mode 100644 index 7bb6ab4..0000000 --- a/tools/nnapi_quickcheck/tests/sub_3.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_3, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_3.lst" -#undef INT_VALUE - - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT = RIGHT_Value(); - - const int32_t OFM_H = LEFT_H; - const int32_t OFM_W = LEFT_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_H, LEFT_W} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT, LEFT_W} /* dims */, quantization); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_3.lst b/tools/nnapi_quickcheck/tests/sub_3.lst deleted file mode 100644 index c568750..0000000 --- a/tools/nnapi_quickcheck/tests/sub_3.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_H, 8) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT, 1) diff --git a/tools/nnapi_quickcheck/tests/sub_4.cpp b/tools/nnapi_quickcheck/tests/sub_4.cpp deleted file mode 100644 index 7fc8577..0000000 --- a/tools/nnapi_quickcheck/tests/sub_4.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_4, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_1.lst" -#undef INT_VALUE - - const int32_t LEFT_C = LEFT_C_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - - const int32_t RIGHT_C = RIGHT_C_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_C); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_C); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization); - - interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, quantization); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({1, 2}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_4.lst b/tools/nnapi_quickcheck/tests/sub_4.lst deleted file mode 100644 index ce6128f..0000000 --- a/tools/nnapi_quickcheck/tests/sub_4.lst +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_C, 3) -INT_VALUE(LEFT_H, 8) -INT_VALUE(LEFT_W, 16) - -INT_VALUE(RIGHT_C, 3) -INT_VALUE(RIGHT_H, 1) -INT_VALUE(RIGHT_W, 16) diff --git a/tools/nnapi_quickcheck/tests/sub_5.cpp b/tools/nnapi_quickcheck/tests/sub_5.cpp deleted file mode 100644 index 19f95b6..0000000 --- a/tools/nnapi_quickcheck/tests/sub_5.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_5, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_5.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_VALUE(LEFT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_VALUE(RIGHT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); - PRINT_VALUE(OFM_C); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure left data - const uint32_t left_size = LEFT_N * LEFT_C * LEFT_H * LEFT_W; - const uint32_t right_size = RIGHT_N * RIGHT_C * RIGHT_H * RIGHT_W; - float left_data[left_size] = { - 0.0f, - }; - float right_data[right_size] = { - 0.0f, - }; - - // Fill left data with random data - { - std::normal_distribution left_dist(-1.0f, +1.0f); - float value = 10.0f; - for (uint32_t off = 0; off < left_size; ++off) - { - left_data[off] = value; - } - value = 1.0f; - for (uint32_t off = 0; off < right_size; ++off) - { - right_data[off] = value++; - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadOnly(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_N, LEFT_H, LEFT_W, LEFT_C} /* dims */, quantization, - reinterpret_cast(left_data), - left_size * sizeof(float)); - - // Configure input(s) - interp.SetTensorParametersReadOnly(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_W, RIGHT_C} /* dims: test with other shapes */, - quantization, reinterpret_cast(right_data), - right_size * sizeof(float)); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_5.lst b/tools/nnapi_quickcheck/tests/sub_5.lst deleted file mode 100644 index 0327e6b..0000000 --- a/tools/nnapi_quickcheck/tests/sub_5.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_H, 2) -INT_VALUE(LEFT_W, 3) -INT_VALUE(LEFT_C, 4) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_H, 1) -INT_VALUE(RIGHT_W, 3) -INT_VALUE(RIGHT_C, 4) diff --git a/tools/nnapi_quickcheck/tests/sub_6.cpp b/tools/nnapi_quickcheck/tests/sub_6.cpp deleted file mode 100644 index 66b167e..0000000 --- a/tools/nnapi_quickcheck/tests/sub_6.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_sub_6, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "sub_6.lst" -#undef INT_VALUE - - const int32_t LEFT_N = LEFT_N_Value(); - const int32_t LEFT_H = LEFT_H_Value(); - const int32_t LEFT_W = LEFT_W_Value(); - const int32_t LEFT_C = LEFT_C_Value(); - - const int32_t RIGHT_N = RIGHT_N_Value(); - const int32_t RIGHT_H = RIGHT_H_Value(); - const int32_t RIGHT_W = RIGHT_W_Value(); - const int32_t RIGHT_C = RIGHT_C_Value(); - - const int32_t OFM_N = std::max(LEFT_N, RIGHT_N); - const int32_t OFM_H = std::max(LEFT_H, RIGHT_H); - const int32_t OFM_W = std::max(LEFT_W, RIGHT_W); - const int32_t OFM_C = std::max(LEFT_C, RIGHT_C); - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(LEFT_N); - PRINT_VALUE(LEFT_H); - PRINT_VALUE(LEFT_W); - PRINT_VALUE(LEFT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(RIGHT_N); - PRINT_VALUE(RIGHT_H); - PRINT_VALUE(RIGHT_W); - PRINT_VALUE(RIGHT_C); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); - PRINT_VALUE(OFM_C); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Configure left data - const uint32_t left_size = LEFT_N * LEFT_C * LEFT_H * LEFT_W; - const uint32_t right_size = RIGHT_N * RIGHT_C * RIGHT_H * RIGHT_W; - float left_data[left_size] = { - 0.0f, - }; - float right_data[right_size] = { - 0.0f, - }; - - // Fill left data with random data - { - std::normal_distribution left_dist(-1.0f, +1.0f); - float value = 10.0f; - for (uint32_t off = 0; off < left_size; ++off) - { - left_data[off] = value; - } - value = 1.0f; - for (uint32_t off = 0; off < right_size; ++off) - { - right_data[off] = value++; - } - } - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? 
- TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(3); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input(s) - interp.SetTensorParametersReadOnly(1, kTfLiteFloat32 /* type */, "left" /* name */, - {LEFT_W, LEFT_C} /* dims: test with other shapes */, - quantization, reinterpret_cast(left_data), - left_size * sizeof(float)); - - // Configure input(s) - interp.SetTensorParametersReadOnly(2, kTfLiteFloat32 /* type */, "right" /* name */, - {RIGHT_N, RIGHT_H, RIGHT_W, RIGHT_C} /* dims */, - quantization, reinterpret_cast(right_data), - right_size * sizeof(float)); - - // Add Subtraction Node - // - // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free - // So, param should be allocated with malloc - auto param = make_alloc(); - - param->activation = kTfLiteActNone; - - // Run Sub and store the result into Tensor #0 - // - Read Left from Tensor #1 - // - Read Right from Tensor #2, - interp.AddNodeWithParameters({1, 2}, {0}, nullptr, 0, reinterpret_cast(param), - BuiltinOpResolver().FindOp(BuiltinOperator_SUB, 1)); - - interp.SetInputs({}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/sub_6.lst b/tools/nnapi_quickcheck/tests/sub_6.lst deleted file mode 100644 index 52a1f1a..0000000 --- a/tools/nnapi_quickcheck/tests/sub_6.lst +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(LEFT_N, 1) -INT_VALUE(LEFT_H, 1) -INT_VALUE(LEFT_W, 3) -INT_VALUE(LEFT_C, 4) - -INT_VALUE(RIGHT_N, 1) -INT_VALUE(RIGHT_H, 2) -INT_VALUE(RIGHT_W, 3) -INT_VALUE(RIGHT_C, 4) diff --git a/tools/nnapi_quickcheck/tests/tanh_1.cpp b/tools/nnapi_quickcheck/tests/tanh_1.cpp deleted file mode 100644 index 7dd9261..0000000 --- a/tools/nnapi_quickcheck/tests/tanh_1.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_tanh_1, simple_test) -{ - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "tanh_1.lst" -#undef INT_VALUE - - const int32_t IFM_N = IFM_N_Value(); - const int32_t IFM_C = IFM_C_Value(); - const int32_t IFM_H = IFM_H_Value(); - const int32_t IFM_W = IFM_W_Value(); - - const int32_t OFM_N = IFM_N; - const int32_t OFM_C = IFM_C; - const int32_t OFM_H = IFM_H; - const int32_t OFM_W = IFM_W; - - // Initialize random number generator - std::minstd_rand random(SEED); - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(IFM_N); - PRINT_VALUE(IFM_C); - PRINT_VALUE(IFM_H); - PRINT_VALUE(IFM_W); - PRINT_NEWLINE(); - - PRINT_VALUE(OFM_N); - PRINT_VALUE(OFM_C); - PRINT_VALUE(OFM_H); - PRINT_VALUE(OFM_W); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. 
Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - TfLiteQuantizationParams quantization = make_default_quantization(); - - // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N) - interp.AddTensors(2); - - // Configure output - interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */, - {OFM_N, OFM_H, OFM_W, OFM_C} /* dims */, quantization); - - // Configure input - interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */, - {IFM_N, IFM_H, IFM_W, IFM_C} /* dims */, quantization); - - // Add Tanh Node - // Run Tanh and store the result into Tensor #0 - // - Read input from Tensor #1 - interp.AddNodeWithParameters({1}, {0}, nullptr, 0, nullptr, - BuiltinOpResolver().FindOp(BuiltinOperator_TANH, 1)); - - interp.SetInputs({1}); - interp.SetOutputs({0}); - }; - - const nnfw::tflite::FunctionBuilder builder(setup); - - RandomTestParam param; - - param.verbose = verbose; - param.tolerance = tolerance; - - int res = RandomTestRunner{SEED, param}.run(builder); - - EXPECT_EQ(res, 0); -} diff --git a/tools/nnapi_quickcheck/tests/tanh_1.lst b/tools/nnapi_quickcheck/tests/tanh_1.lst deleted file mode 100644 index a0077cb..0000000 --- a/tools/nnapi_quickcheck/tests/tanh_1.lst +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INT_VALUE -#error "INT_VALUE should be defined" -#endif // INT_VALUE - -INT_VALUE(IFM_N, 1) -INT_VALUE(IFM_C, 3) -INT_VALUE(IFM_H, 320) -INT_VALUE(IFM_W, 320) diff --git a/tools/nnapi_quickcheck/tests/topk_v2_1.cpp b/tools/nnapi_quickcheck/tests/topk_v2_1.cpp deleted file mode 100644 index c47af57..0000000 --- a/tools/nnapi_quickcheck/tests/topk_v2_1.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "tflite/ext/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/builtin_op_data.h" - -#include "env.h" -#include "memory.h" -#include "misc/environment.h" - -#include "tflite/Diff.h" -#include "tflite/Quantization.h" -#include "tflite/interp/FunctionBuilder.h" - -#include -#include - -using namespace tflite; -using namespace nnfw::tflite; - -TEST(NNAPI_Quickcheck_topk_v2_1, simple_test) -{ - // Set random seed - int SEED = std::chrono::system_clock::now().time_since_epoch().count(); - - nnfw::misc::env::IntAccessor("SEED").access(SEED); - - // Set random test parameters - int verbose = 0; - int tolerance = 1; - - nnfw::misc::env::IntAccessor("VERBOSE").access(verbose); - nnfw::misc::env::IntAccessor("TOLERANCE").access(tolerance); - -#define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE); -#include "topk_v2_1.lst" -#undef INT_VALUE - - const int32_t INPUT_DATA = INPUT_DATA_Value(); - const int32_t K = K_Value(); - - const int32_t OUTPUT_VALUES = K; - const int32_t OUTPUT_INDICES = K; - - std::cout << "Configurations:" << std::endl; -#define PRINT_NEWLINE() \ - { \ - std::cout << std::endl; \ - } -#define PRINT_VALUE(value) \ - { \ - std::cout << " " << #value << ": " << (value) << std::endl; \ - } - PRINT_VALUE(SEED); - PRINT_NEWLINE(); - - PRINT_VALUE(INPUT_DATA); - PRINT_VALUE(K); - PRINT_NEWLINE(); - - PRINT_VALUE(OUTPUT_VALUES); - PRINT_VALUE(OUTPUT_INDICES); -#undef PRINT_VALUE -#undef PRINT_NEWLINE - - // Fill the K data - int32_t k_data[1] = {K}; - - auto setup = [&](Interpreter &interp) { - // Comment from 'context.h' - // - // Parameters for asymmetric quantization. Quantized values can be converted - // back to float using: - // real_value = scale * (quantized_value - zero_point); - // - // Q: Is this necessary? - // A: This may be necessary, because quantization values(scale, zero_point) of TENSOR_INT32 and - // TENSOR_QUANT8_ASYMM are passed on to the runtime. 
-    TfLiteQuantizationParams quantization = make_default_quantization();
-
-    // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N)
-    interp.AddTensors(4);
-
-    // Configure INPUT_DATA
-    interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "input" /* name */,
-                                        {INPUT_DATA} /* dims */, quantization);
-
-    // Configure K
-    interp.SetTensorParametersReadOnly(1, kTfLiteInt32 /* type */, "k" /* name */, {1} /* dims */,
-                                       quantization, reinterpret_cast(k_data),
-                                       sizeof(k_data));
-
-    // Configure OUTPUT_VALUES
-    interp.SetTensorParametersReadWrite(2, kTfLiteFloat32 /* type */, "output_values" /* name */,
-                                        {OUTPUT_VALUES} /* dims */, quantization);
-
-    // Configure OUTPUT_INDICES
-    interp.SetTensorParametersReadWrite(3, kTfLiteInt32 /* type */, "output_indices" /* name */,
-                                        {OUTPUT_INDICES} /* dims */, quantization);
-
-    // Add TopK_V2 Node
-    // Run TopK_V2 and store its result into Tensor #2 and #3
-    //  - Read input data and K from Tensor #0 and #1, respectively
-    interp.AddNodeWithParameters({0, 1}, {2, 3}, nullptr, 0, nullptr,
-                                 BuiltinOpResolver().FindOp(BuiltinOperator_TOPK_V2, 1));
-
-    // Set Tensor #0 as Input, and Tensor #2 and #3 as Output
-    interp.SetInputs({0});
-    interp.SetOutputs({2, 3});
-  };
-
-  const nnfw::tflite::FunctionBuilder builder(setup);
-
-  RandomTestParam param;
-
-  param.verbose = verbose;
-  param.tolerance = tolerance;
-
-  int res = RandomTestRunner{SEED, param}.run(builder);
-
-  EXPECT_EQ(res, 0);
-}
diff --git a/tools/nnapi_quickcheck/tests/topk_v2_1.lst b/tools/nnapi_quickcheck/tests/topk_v2_1.lst
deleted file mode 100644
index a40ee3c..0000000
--- a/tools/nnapi_quickcheck/tests/topk_v2_1.lst
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef INT_VALUE
-#error "INT_VALUE should be defined"
-#endif // INT_VALUE
-
-INT_VALUE(INPUT_DATA, 8192)
-INT_VALUE(K, 16)
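The deleted .lst files are consumed through an X-macro: each quickcheck test defines INT_VALUE(NAME, VALUE) to declare an IntVar named NAME_Value and then #includes the .lst file, so a single list drives both the declarations and the PRINT_VALUE output. Below is a rough, self-contained sketch of that pattern with the two topk_v2_1.lst entries expanded by hand; the simplified IntVar class is an assumption (the real helper in nnapi_quickcheck also supports overriding the default, e.g. from the environment).

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Simplified stand-in for the nnapi_quickcheck IntVar helper (assumed shape).
    class IntVar
    {
    public:
      IntVar(const std::string &name, int32_t value) : _name(name), _value(value) {}
      int32_t operator()(void) const { return _value; }

    private:
      std::string _name;
      int32_t _value;
    };

    // In the tests this block is '#include "topk_v2_1.lst"'; the two entries of
    // that deleted file are expanded by hand here.
    #define INT_VALUE(NAME, VALUE) IntVar NAME##_Value(#NAME, VALUE);
    INT_VALUE(INPUT_DATA, 8192)
    INT_VALUE(K, 16)
    #undef INT_VALUE

    int main()
    {
      std::cout << "INPUT_DATA: " << INPUT_DATA_Value() << std::endl; // 8192
      std::cout << "K: " << K_Value() << std::endl;                   // 16
      return 0;
    }
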
diff --git a/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh b/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh
index 26f6c70..f4e223a 100755
--- a/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh
+++ b/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh
@@ -5,6 +5,8 @@ set -eu
 progname=$(basename "${BASH_SOURCE[0]}")
 outdir="."
 name=""
+config=""
+config_src=""
 
 usage() {
   echo "Usage: $progname [options] modelfile"
@@ -14,11 +16,13 @@ usage() {
   echo "    -h   show this help"
   echo "    -o   set nnpackage output directory (default=$outdir)"
   echo "    -p   set nnpackage output name (default=[modelfile name])"
+  echo "    -c   provide configuration file"
   echo ""
   echo "Examples:"
   echo "    $progname add.tflite                  => create nnpackage 'add' in $outdir/"
   echo "    $progname -o out add.tflite           => create nnpackage 'add' in out/"
   echo "    $progname -o out -p addpkg add.tflite => create nnpackage 'addpkg' in out/"
+  echo "    $progname -c add.cfg add.tflite       => create nnpackage 'add' with add.cfg"
   exit 1
 }
 
@@ -27,11 +31,12 @@ if [ $# -eq 0 ]; then
   exit 1
 fi
 
-while getopts "ho:p:" OPTION; do
+while getopts "ho:p:c:" OPTION; do
   case "${OPTION}" in
     h) usage;;
     o) outdir=$OPTARG;;
     p) name=$OPTARG;;
+    c) config_src=$OPTARG;;
     ?) exit 1;;
   esac
 done
@@ -64,11 +69,18 @@ extension=${modelfile##*.}
 
 echo "Generating nnpackage "$name" in "$outdir""
 mkdir -p "$outdir"/"$name"/metadata
+
+if [ -s "$config_src" ]; then
+  config=$(basename "$config_src")
+  cp "$config_src" "$outdir/$name/metadata/$config"
+fi
+
 cat > "$outdir"/"$name"/metadata/MANIFEST <<-EOF
 {
   "major-version" : "1",
-  "minor-version" : "0",
+  "minor-version" : "1",
   "patch-version" : "0",
+  "configs" : [ "$config" ],
   "models" : [ "$modelfile" ],
   "model-types" : [ "$extension" ]
 }
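With the new -c option, model2nnpkg.sh copies the given configuration file into the nnpackage's metadata directory and lists it under "configs" in the MANIFEST, whose minor-version is bumped to 1. As a rough illustration only (not captured output; the exact "models" entry depends on how the script derives $modelfile), a call such as

    $ model2nnpkg.sh -c add.cfg add.tflite

would be expected to leave add/metadata/add.cfg in place and generate a MANIFEST along these lines:

    {
      "major-version" : "1",
      "minor-version" : "1",
      "patch-version" : "0",
      "configs" : [ "add.cfg" ],
      "models" : [ "add.tflite" ],
      "model-types" : [ "tflite" ]
    }
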
diff --git a/tools/tflite_accuracy/src/tflite_accuracy.cc b/tools/tflite_accuracy/src/tflite_accuracy.cc
index a532890..66c19a8 100644
--- a/tools/tflite_accuracy/src/tflite_accuracy.cc
+++ b/tools/tflite_accuracy/src/tflite_accuracy.cc
@@ -60,7 +60,7 @@ template void Print(const char *fmt, Args... args)
 template struct BaseLabelData
 {
   explicit BaseLabelData(int label = -1, DataType confidence = 0)
-      : label(label), confidence(confidence)
+    : label(label), confidence(confidence)
   {
   }
@@ -116,8 +116,8 @@ public:
   Runner(std::unique_ptr interpreter, std::unique_ptr model,
          std::unique_ptr<::nnfw::tflite::NNAPIDelegate> delegate, unsigned img_size)
-      : interpreter(std::move(interpreter)), model(std::move(model)), delegate(std::move(delegate)),
-        interrupted(false), kInputSize(1 * img_size * img_size * 3 * sizeof(DataType))
+    : interpreter(std::move(interpreter)), model(std::move(model)), delegate(std::move(delegate)),
+      interrupted(false), kInputSize(1 * img_size * img_size * 3 * sizeof(DataType))
   {
     inference_times.reserve(500);
     top1.reserve(500);
@@ -308,7 +308,7 @@ public:
   FloatRunner(std::unique_ptr interpreter, std::unique_ptr model,
               std::unique_ptr<::nnfw::tflite::NNAPIDelegate> delegate, unsigned img_size)
-      : Runner(std::move(interpreter), std::move(model), std::move(delegate), img_size)
+    : Runner(std::move(interpreter), std::move(model), std::move(delegate), img_size)
   {
   }
@@ -333,7 +333,7 @@ public:
   QuantizedRunner(std::unique_ptr interpreter, std::unique_ptr model,
                   std::unique_ptr<::nnfw::tflite::NNAPIDelegate> delegate, unsigned img_size)
-      : Runner(std::move(interpreter), std::move(model), std::move(delegate), img_size)
+    : Runner(std::move(interpreter), std::move(model), std::move(delegate), img_size)
   {
   }
@@ -411,12 +411,12 @@ std::unique_ptr MakeRunner(const std::string &model_path, unsigned i
   if (interpreter->tensor(input_index)->type == kTfLiteFloat32)
   {
     return std::unique_ptr(
-        new FloatRunner(std::move(interpreter), std::move(model), std::move(delegate), img_size));
+      new FloatRunner(std::move(interpreter), std::move(model), std::move(delegate), img_size));
   }
   else if (interpreter->tensor(input_index)->type == kTfLiteUInt8)
   {
-    return std::unique_ptr(new QuantizedRunner(
-        std::move(interpreter), std::move(model), std::move(delegate), img_size));
+    return std::unique_ptr(
+      new QuantizedRunner(std::move(interpreter), std::move(model), std::move(delegate), img_size));
   }
   throw std::invalid_argument("data type of model's input tensor is not supported.");
 }
@@ -424,13 +424,13 @@ std::unique_ptr MakeRunner(const std::string &model_path, unsigned i
 Target GetTarget(const std::string &str)
 {
   static const std::map target_names{
-      {"tflite-cpu", Target::TfLiteCpu},
-      {"tflite-delegate", Target::TfLiteDelegate},
-      {"nnfw-delegate", Target::NnfwDelegate}};
+    {"tflite-cpu", Target::TfLiteCpu},
+    {"tflite-delegate", Target::TfLiteDelegate},
+    {"nnfw-delegate", Target::NnfwDelegate}};
   if (target_names.find(str) == target_names.end())
   {
     throw std::invalid_argument(
-        str + ": invalid target. Run with --help for a list of available targets.");
+      str + ": invalid target. Run with --help for a list of available targets.");
   }
   return target_names.at(str);
 }
@@ -451,19 +451,22 @@ void HandleSigInt(int)
   }
 }
 
-int main(int argc, char *argv[]) try
+int main(int argc, char *argv[])
+try
 {
   namespace po = boost::program_options;
   po::options_description desc("Run a model on multiple binary images and print"
                                " statistics");
-  desc.add_options()("help", "print this message and quit")(
-      "model", po::value()->default_value(kDefaultModelFile), "tflite file")(
-      "input", po::value()->default_value(kDefaultImagesDir),
-      "directory with input images")("offset", po::value()->default_value(1), "labels offset")(
-      "target", po::value()->default_value("nnfw-delegate"),
-      "how the model will be run (available targets: tflite-cpu, "
-      "tflite-delegate, nnfw-delegate)")("imgsize", po::value()->default_value(224),
-                                         "the width and height of the image");
+  // clang-format off
+  desc.add_options()
+    ("help", "print this message and quit")
+    ("model", po::value()->default_value(kDefaultModelFile), "tflite file")
+    ("input", po::value()->default_value(kDefaultImagesDir), "directory with input images")
+    ("offset", po::value()->default_value(1), "labels offset")
+    ("target", po::value()->default_value("nnfw-delegate"),
+     "how the model will be run (available targets: tflite-cpu, tflite-delegate, nnfw-delegate)")
+    ("imgsize", po::value()->default_value(224), "the width and height of the image");
+  // clang-format on
   po::variables_map vm;
   po::store(po::parse_command_line(argc, argv, desc), vm);
   if (vm.count("help"))
diff --git a/tools/tflitefile_tool/model_parser.py b/tools/tflitefile_tool/model_parser.py
index cd66bf5..ed534c1 100755
--- a/tools/tflitefile_tool/model_parser.py
+++ b/tools/tflitefile_tool/model_parser.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 #