From 71e6642aa42bce4170f491e09a59ddcd35cd35e0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=D0=9F=D0=B0=D0=B2=D0=B5=D0=BB=20=D0=98=D0=BB=D1=8C=D1=8E?=
 =?utf8?q?=D1=82=D1=87=D0=B5=D0=BD=D0=BA=D0=BE/AI=20Tools=20Lab=20/SRR/Eng?=
 =?utf8?q?ineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
 <p.iliutchenk@samsung.com>
Date: Fri, 20 Sep 2019 04:51:15 +0300
Subject: [PATCH] [locomotiv] Implement MatrixEncode, MatrixDecode and MatMul
 (#7604)

* Implemented these operations in the loco interpreter (locomotiv)
* Added tests for these operations

Signed-off-by: Pavel Iliutchenko <p.iliutchenk@samsung.com>
---
 compiler/locomotiv/src/Node.lst                  |   3 +
 compiler/locomotiv/src/Node/MatMul.cpp           | 133 +++++++++++++++
 compiler/locomotiv/src/Node/MatMul.test.cpp      | 188 ++++++++++++++++++++
 compiler/locomotiv/src/Node/MatrixCodec.test.cpp | 207 +++++++++++++++++++++++
 compiler/locomotiv/src/Node/MatrixDecode.cpp     | 109 ++++++++++++
 compiler/locomotiv/src/Node/MatrixEncode.cpp     | 112 ++++++++++++
 6 files changed, 752 insertions(+)
 create mode 100644 compiler/locomotiv/src/Node/MatMul.cpp
 create mode 100644 compiler/locomotiv/src/Node/MatMul.test.cpp
 create mode 100644 compiler/locomotiv/src/Node/MatrixCodec.test.cpp
 create mode 100644 compiler/locomotiv/src/Node/MatrixDecode.cpp
 create mode 100644 compiler/locomotiv/src/Node/MatrixEncode.cpp
diff --git a/compiler/locomotiv/src/Node.lst b/compiler/locomotiv/src/Node.lst
index 7615c79..3427a70 100644
--- a/compiler/locomotiv/src/Node.lst
+++ b/compiler/locomotiv/src/Node.lst
@@ -21,6 +21,9 @@ NODE(FeatureDecode)
 NODE(FeatureEncode)
 NODE(FilterEncode)
 NODE(Forward)
+NODE(MatrixDecode)
+NODE(MatrixEncode)
+NODE(MatMul)
 NODE(MaxPool2D)
 NODE(Pull)
 NODE(Push)
diff --git a/compiler/locomotiv/src/Node/MatMul.cpp b/compiler/locomotiv/src/Node/MatMul.cpp
new file mode 100644
index 0000000..77b7315
--- /dev/null
+++ b/compiler/locomotiv/src/Node/MatMul.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeExecution.h"
+
+#include "NodeDataImpl.h"
+#include "NodeDomain.h"
+#include "Validation.h"
+
+#include <nncc/core/ADT/tensor/Shape.h>
+#include <nncc/core/ADT/tensor/Buffer.h>
+#include <nncc/core/ADT/tensor/Index.h>
+#include <nncc/core/ADT/tensor/IndexEnumerator.h>
+#include <nncc/core/ADT/tensor/LexicalLayout.h>
+
+#include <cassert>
+#include <stdexcept>
+
+namespace
+{
+using nncc::core::ADT::tensor::Buffer;
+using nncc::core::ADT::tensor::Shape;
+using nncc::core::ADT::tensor::Index;
+using nncc::core::ADT::tensor::LexicalLayout;
+using nncc::core::ADT::tensor::make_buffer;
+
+/**
+ * @brief Multiply two rank-2 buffers: out(y, x) is the sum over k of lhs(y, k) * rhs(k, x)
+ *
+ * @param lhs_buf Left operand; its dim(0) becomes the output height
+ * @param rhs_buf Right operand; its dim(1) becomes the output width
+ * @return Newly created LexicalLayout buffer of shape {lhs height, rhs width}
+ *
+ * @note Rank and inner-dimension requirements are checked with assert only
+ */
+template <typename T> Buffer<T> calc_mat_mul(const Buffer<T> *lhs_buf, const Buffer<T> *rhs_buf)
+{
+  const auto lhs_shape = lhs_buf->shape();
+  const auto rhs_shape = rhs_buf->shape();
+
+  assert(lhs_shape.rank() == 2 && "lhs rank must be 2");
+  assert(rhs_shape.rank() == 2 && "rhs rank must be 2");
+  // The inner dimension is shared: lhs width must equal rhs height
+  assert(lhs_shape.dim(1) == rhs_shape.dim(0) && "height/width mismatch");
+
+  const uint32_t rows = lhs_shape.dim(0);  // output height
+  const uint32_t inner = lhs_shape.dim(1); // length of every dot product
+  const uint32_t cols = rhs_shape.dim(1);  // output width
+
+  auto result = make_buffer<T, LexicalLayout>(Shape{rows, cols});
+
+  for (uint32_t row = 0; row < rows; ++row)
+  {
+    for (uint32_t col = 0; col < cols; ++col)
+    {
+      // Dot product of lhs row 'row' with rhs column 'col'
+      T sum = static_cast<T>(0);
+      for (uint32_t k = 0; k < inner; ++k)
+      {
+        sum += lhs_buf->at(Index({row, k})) * rhs_buf->at(Index({k, col}));
+      }
+      result.at(Index({row, col})) = sum;
+    }
+  }
+
+  return result;
+}
+
+} // namespace
+
+namespace locomotiv
+{
+
+/**
+ * @brief Execute MatMul: multiply the annotated lhs and rhs matrices and annotate the result
+ * @throw std::runtime_error when the operand dtypes are not both FLOAT32 or both S32
+ */
+void NodeExecution::execute(loco::MatMul *mat_mul)
+{
+  auto lhs_data = annot_data(mat_mul->lhs());
+  auto rhs_data = annot_data(mat_mul->rhs());
+
+  validate(lhs_data, "Can't find left matrix data of MatMul");
+  validate(lhs_data->shape()->rank() == 2, "lhs rank must be 2");
+
+  validate(rhs_data, "Can't find right matrix data of MatMul");
+  validate(rhs_data->shape()->rank() == 2, "rhs rank must be 2");
+
+  validate(annot_domain(mat_mul->lhs()) == loco::Domain::Matrix,
+           "Left matrix of MatMul is not a Matrix");
+  validate(annot_domain(mat_mul->rhs()) == loco::Domain::Matrix,
+           "Right matrix of MatMul is not a Matrix");
+
+  // calc_mat_mul guards the inner dimension with assert only, which is compiled out under
+  // NDEBUG; check it through validate as well, like the rank checks above
+  validate(lhs_data->shape()->dim(1) == rhs_data->shape()->dim(0), "height/width mismatch");
+
+  std::unique_ptr<NodeData> mat_mul_result = nullptr;
+
+  if (lhs_data->dtype() == loco::DataType::FLOAT32 && rhs_data->dtype() == loco::DataType::FLOAT32)
+  {
+    auto mat_mul_buf = calc_mat_mul<float>(lhs_data->as_f32_bufptr(), rhs_data->as_f32_bufptr());
+    mat_mul_result = make_data(mat_mul_buf);
+  }
+  else if (lhs_data->dtype() == loco::DataType::S32 && rhs_data->dtype() == loco::DataType::S32)
+  {
+    auto mat_mul_buf = calc_mat_mul<int32_t>(lhs_data->as_s32_bufptr(), rhs_data->as_s32_bufptr());
+    mat_mul_result = make_data(mat_mul_buf);
+  }
+  else
+    throw std::runtime_error("NYI for these DataTypes");
+
+  assert(mat_mul_result != nullptr);
+
+  annot_data(mat_mul, std::move(mat_mul_result));
+  annot_domain(mat_mul, loco::Domain::Matrix);
+}
+
+} // namespace locomotiv
diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp
new file mode 100644
index 0000000..bd480f7
--- /dev/null
+++ b/compiler/locomotiv/src/Node/MatMul.test.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeExecution.h"
+
+#include "locomotiv/NodeData.h"
+#include "NodeDataImpl.h"
+#include "NodeDomain.h"
+
+#include <nncc/core/ADT/tensor/Shape.h>
+#include <nncc/core/ADT/tensor/Buffer.h>
+#include <nncc/core/ADT/tensor/Overlay.h>
+#include <nncc/core/ADT/tensor/LexicalLayout.h>
+#include "nncc/core/ADT/tensor/IndexEnumerator.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+using nncc::core::ADT::tensor::Shape;
+using nncc::core::ADT::tensor::LexicalLayout;
+using nncc::core::ADT::tensor::make_buffer;
+using nncc::core::ADT::tensor::make_overlay;
+
+/**
+ * @brief Run MatMul on two pre-annotated Matrix inputs and compare the result element by element
+ *
+ * @param lhs               Elements of the left operand, read through a LexicalLayout overlay
+ * @param rhs               Elements of the right operand, read the same way
+ * @param expected_output   Elements the MatMul result is compared against
+ * @param lhs_shape         Shape of lhs
+ * @param rhs_shape         Shape of rhs
+ * @param out_shape         Shape the result must have
+ * @param expected_datatype Data type the result must have (FLOAT32 or S32)
+ */
+template <typename T>
+void run_test(const T *lhs, const T *rhs, const T *expected_output, const Shape &lhs_shape,
+              const Shape &rhs_shape, const Shape &out_shape, loco::DataType expected_datatype)
+{
+  using nncc::core::ADT::tensor::IndexEnumerator;
+
+  // Annotates 'node' with a copy of 'values' and with the Matrix domain. run_test only runs the
+  // MatMul node, so the MatrixEncode nodes merely carry ready-made input data.
+  auto annotate_input = [](loco::MatrixEncode *node, const T *values, const Shape &shape) {
+    auto buf = make_buffer<T, LexicalLayout>(shape);
+    auto overlay = make_overlay<T, LexicalLayout>(shape, const_cast<T *>(values));
+    for (IndexEnumerator e{shape}; e.valid(); e.advance())
+    {
+      buf.at(e.current()) = overlay.at(e.current());
+    }
+
+    locomotiv::annot_data(node, locomotiv::make_data(buf));
+    locomotiv::annot_domain(node, loco::Domain::Matrix);
+  };
+
+  auto g = loco::make_graph();
+
+  auto lhs_enc = g->nodes()->create<loco::MatrixEncode>();
+  annotate_input(lhs_enc, lhs, lhs_shape);
+
+  auto rhs_enc = g->nodes()->create<loco::MatrixEncode>();
+  annotate_input(rhs_enc, rhs, rhs_shape);
+
+  // Wire both inputs into MatMul and let the interpreter evaluate it
+  auto mat_mul = g->nodes()->create<loco::MatMul>();
+  mat_mul->lhs(lhs_enc);
+  mat_mul->rhs(rhs_enc);
+  locomotiv::NodeExecution::get().run(mat_mul);
+
+  auto mat_mul_result = locomotiv::annot_data(mat_mul);
+
+  // Data type and shape first, then every element
+  ASSERT_NE(mat_mul_result, nullptr);
+  ASSERT_TRUE(mat_mul_result->dtype() == expected_datatype);
+  ASSERT_TRUE(*(mat_mul_result->shape()) == out_shape);
+
+  auto expected = make_overlay<T, LexicalLayout>(out_shape, const_cast<T *>(expected_output));
+  for (IndexEnumerator e{out_shape}; e.valid(); e.advance())
+  {
+    const auto &ind = e.current();
+    if (expected_datatype == loco::DataType::FLOAT32)
+      ASSERT_FLOAT_EQ(mat_mul_result->as_f32_bufptr()->at(ind), expected.at(ind));
+    else if (expected_datatype == loco::DataType::S32)
+      ASSERT_EQ(mat_mul_result->as_s32_bufptr()->at(ind), expected.at(ind));
+    else
+      throw std::runtime_error("NYI for these DataTypes");
+  }
+
+  ASSERT_EQ(locomotiv::annot_domain(mat_mul), loco::Domain::Matrix);
+}
+
+} // namespace
+
+// clang-format off
+/* from the code below:
+
+import numpy as np
+
+a = [[-0.48850584,  1.4292705,  -1.3424522],
+     [1.7021934,  -0.39246717,  0.6248314]]
+
+b = [[-0.0830195,  0.21088193, -0.11781317],
+     [0.07755677, 1.6337638,   1.0792778],
+     [-1.6922939, -1.5437212,   0.96667504]]
+
+print(np.array(a) @ np.array(b))
+*/
+TEST(NodeExecution_MatMul, f32_2x3_3x3)
+{
+  // (2x3) * (3x3) -> (2x3); each array lists its matrix row by row, as in the numpy snippet above
+  const float lhs_values[] =
+  {
+    -0.48850584,  1.4292705,  -1.3424522,
+     1.7021934,  -0.39246717,  0.6248314
+  };
+
+  const float rhs_values[] =
+  {
+    -0.0830195,  0.21088193, -0.11781317,
+     0.07755677, 1.6337638,   1.0792778,
+    -1.6922939, -1.5437212,   0.96667504
+  };
+
+  const float expected[] =
+  {
+    2.42322878,  4.30444527,  0.30241731,
+    -1.2291521,  -1.2468023,  -0.02011299
+  };
+
+  run_test<float>(lhs_values, rhs_values, expected, Shape{2, 3}, Shape{3, 3}, Shape{2, 3},
+                  loco::DataType::FLOAT32);
+}
+
+/* from the code below:
+
+import numpy as np
+
+a = np.random.randint(10000, size=(4, 2))
+
+b = np.random.randint(10000, size=(2, 6))
+
+print(a)
+print(b)
+print(np.array(a) @ np.array(b))
+*/
+TEST(NodeExecution_MatMul, s32_4x2_2x6)
+{
+  // (4x2) * (2x6) -> (4x6); each array lists its matrix row by row, as in the numpy snippet above
+  const int32_t lhs_values[] =
+  {
+    6392, 4993,
+      54, 9037,
+    3947, 5820,
+    5800, 4181
+  };
+
+  const int32_t rhs_values[] =
+  {
+    2694, 8376, 8090, 1285, 7492, 1652,
+    5427, 8798, 7634, 2229, 5439, 6999
+  };
+
+  const int32_t expected[] =
+  {
+    44317059, 97467806, 89827842, 19343117, 75045791, 45505591,
+    49189275, 79959830, 69425318, 20212863, 49556811, 63339171,
+    42218358, 84264432, 76361110, 18044675, 61225904, 47254624,
+    38315487, 85365238, 78839754, 16772449, 66194059, 38844419
+  };
+
+  run_test<int32_t>(lhs_values, rhs_values, expected, Shape{4, 2}, Shape{2, 6}, Shape{4, 6},
+                    loco::DataType::S32);
+}
+
+// clang-format on
diff --git a/compiler/locomotiv/src/Node/MatrixCodec.test.cpp b/compiler/locomotiv/src/Node/MatrixCodec.test.cpp
new file mode 100644
index 0000000..8fc5d59
--- /dev/null
+++ b/compiler/locomotiv/src/Node/MatrixCodec.test.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeExecution.h"
+
+#include "locomotiv/NodeData.h"
+#include "NodeDataImpl.h"
+#include "NodeDomain.h"
+
+#include <loco/IR/PermutingCodec.h>
+
+#include <nncc/core/ADT/tensor/Shape.h>
+#include <nncc/core/ADT/tensor/Buffer.h>
+#include <nncc/core/ADT/tensor/LexicalLayout.h>
+#include <nncc/core/ADT/tensor/IndexEnumerator.h>
+
+#include <gtest/gtest.h>
+
+using nncc::core::ADT::tensor::Index;
+using nncc::core::ADT::tensor::Shape;
+using nncc::core::ADT::tensor::LexicalLayout;
+using nncc::core::ADT::tensor::make_buffer;
+using nncc::core::ADT::tensor::IndexEnumerator;
+using nncc::core::ADT::tensor::Buffer;
+
+// This file is intended to test MatrixEncode and MatrixDecode at once
+namespace
+{
+
+/// @brief Fixture that owns one graph and builds Pull, MatrixEncode and MatrixDecode nodes in it
+class NodeExecution_MatrixCodec : public ::testing::Test
+{
+private:
+  loco::Graph g;
+
+protected:
+  /// @brief Create a Pull node of 'dtype' annotated with the data of 'pull_buf' (Tensor domain)
+  template <typename DT> loco::Pull *pull_layer(Buffer<DT> &pull_buf, loco::DataType dtype)
+  {
+    auto node = g.nodes()->create<loco::Pull>();
+    node->dtype(dtype);
+
+    locomotiv::annot_data(node, locomotiv::make_data(pull_buf));
+    locomotiv::annot_domain(node, loco::Domain::Tensor);
+
+    return node;
+  }
+
+  /// @brief Create a MatrixEncode node reading 'input' through a permuting encoder set to 'perm'
+  loco::MatrixEncode *matrix_encode_layer(loco::Node *input,
+                                          const loco::Permutation<loco::Domain::Matrix> &perm)
+  {
+    using Encoder = loco::PermutingEncoder<loco::Domain::Matrix>;
+
+    std::unique_ptr<Encoder> codec{new Encoder};
+    codec->perm(perm);
+
+    auto node = g.nodes()->create<loco::MatrixEncode>();
+    node->input(input);
+    node->encoder(std::move(codec));
+
+    return node;
+  }
+
+  /// @brief Create a MatrixDecode node reading 'input' through a permuting decoder set to 'perm'
+  loco::MatrixDecode *matrix_decode_layer(loco::Node *input,
+                                          const loco::Permutation<loco::Domain::Matrix> &perm)
+  {
+    using Decoder = loco::PermutingDecoder<loco::Domain::Matrix>;
+
+    std::unique_ptr<Decoder> codec{new Decoder};
+    codec->perm(perm);
+
+    auto node = g.nodes()->create<loco::MatrixDecode>();
+    node->input(input);
+    node->decoder(std::move(codec));
+
+    return node;
+  }
+};
+
+} // namespace
+
+TEST_F(NodeExecution_MatrixCodec, HW_s32)
+{
+  const uint32_t H = 3;
+  const uint32_t W = 4;
+  const Shape hw_shape{H, W};
+
+  // Pull data laid out as HW; the exact values do not matter
+  auto pull_buf = make_buffer<int32_t, LexicalLayout>(hw_shape);
+  int32_t next_value = 0;
+  for (IndexEnumerator e{pull_buf.shape()}; e.valid(); e.advance())
+  {
+    pull_buf.at(e.current()) = next_value++;
+  }
+
+  // Permutation shared by encoder and decoder: Height is axis 0, Width is axis 1
+  loco::Permutation<loco::Domain::Matrix> perm;
+  perm.axis(loco::MatrixAxis::Height) = 0;
+  perm.axis(loco::MatrixAxis::Width) = 1;
+
+  // Pull -> MatrixEncode
+  auto pull = pull_layer(pull_buf, loco::DataType::S32);
+  auto enc = matrix_encode_layer(pull, perm);
+  locomotiv::NodeExecution::get().run(enc);
+
+  // Encoded data must be an element-wise copy of the pull buffer: with this permutation the
+  // tensor is already in the HW order that locomotiv uses for matrices
+  auto enc_data = locomotiv::annot_data(enc);
+  ASSERT_NE(enc_data, nullptr);
+  ASSERT_EQ(enc_data->dtype(), loco::DataType::S32);
+  ASSERT_EQ(*(enc_data->shape()), hw_shape); // locomotiv matrix is HW
+  auto enc_buf = enc_data->as_s32_bufptr();
+  for (IndexEnumerator e{hw_shape}; e.valid(); e.advance())
+  {
+    ASSERT_EQ(pull_buf.at(e.current()), enc_buf->at(e.current()));
+  }
+
+  ASSERT_EQ(locomotiv::annot_domain(enc), loco::Domain::Matrix);
+
+  // MatrixEncode -> MatrixDecode with the same permutation
+  auto dec = matrix_decode_layer(enc, perm);
+  locomotiv::NodeExecution::get().run(dec);
+
+  // Encode followed by decode must reproduce the pulled tensor
+  auto dec_data = locomotiv::annot_data(dec);
+  ASSERT_NE(dec_data, nullptr);
+  ASSERT_EQ(dec_data->dtype(), loco::DataType::S32);
+  ASSERT_EQ(*(dec_data->shape()), hw_shape);
+  auto dec_buf = dec_data->as_s32_bufptr();
+  for (IndexEnumerator e{hw_shape}; e.valid(); e.advance())
+  {
+    ASSERT_EQ(pull_buf.at(e.current()), dec_buf->at(e.current()));
+  }
+
+  ASSERT_EQ(locomotiv::annot_domain(dec), loco::Domain::Tensor);
+}
+
+TEST_F(NodeExecution_MatrixCodec, WH_f32)
+{
+  const uint32_t W = 6;
+  const uint32_t H = 5;
+  const Shape wh_shape{W, H};
+
+  // Pull data laid out as WH, i.e. transposed relative to locomotiv's HW matrices
+  auto pull_buf = make_buffer<float, LexicalLayout>(wh_shape);
+  float next_value = 0.0f;
+  for (IndexEnumerator e{pull_buf.shape()}; e.valid(); e.advance())
+  {
+    pull_buf.at(e.current()) = next_value;
+    next_value += 0.1f; // arbitrary step; the exact values do not matter
+  }
+
+  // Permutation shared by encoder and decoder: Width is axis 0, Height is axis 1
+  loco::Permutation<loco::Domain::Matrix> perm;
+  perm.axis(loco::MatrixAxis::Width) = 0;
+  perm.axis(loco::MatrixAxis::Height) = 1;
+
+  // Pull -> MatrixEncode
+  auto pull = pull_layer(pull_buf, loco::DataType::FLOAT32);
+  auto enc = matrix_encode_layer(pull, perm);
+  locomotiv::NodeExecution::get().run(enc);
+
+  // Encoding must transpose: matrix element (h, w) comes from tensor element (w, h)
+  auto enc_data = locomotiv::annot_data(enc);
+  ASSERT_NE(enc_data, nullptr);
+  ASSERT_EQ(enc_data->dtype(), loco::DataType::FLOAT32);
+  ASSERT_EQ(*(enc_data->shape()), (Shape{H, W})); // locomotiv matrix is HW
+  auto enc_buf = enc_data->as_f32_bufptr();
+  for (uint32_t h = 0; h < H; ++h)
+  {
+    for (uint32_t w = 0; w < W; ++w)
+      ASSERT_FLOAT_EQ(pull_buf.at(Index{w, h}), enc_buf->at(Index{h, w}));
+  }
+
+  ASSERT_EQ(locomotiv::annot_domain(enc), loco::Domain::Matrix);
+
+  // MatrixDecode with the same permutation must reproduce the pulled WH tensor
+  auto dec = matrix_decode_layer(enc, perm);
+  locomotiv::NodeExecution::get().run(dec);
+
+  auto dec_data = locomotiv::annot_data(dec);
+  ASSERT_NE(dec_data, nullptr);
+  ASSERT_EQ(dec_data->dtype(), loco::DataType::FLOAT32);
+  ASSERT_EQ(*(dec_data->shape()), wh_shape);
+  auto dec_buf = dec_data->as_f32_bufptr();
+  for (IndexEnumerator e{wh_shape}; e.valid(); e.advance())
+  {
+    ASSERT_FLOAT_EQ(pull_buf.at(e.current()), dec_buf->at(e.current()));
+  }
+
+  ASSERT_EQ(locomotiv::annot_domain(dec), loco::Domain::Tensor);
+}
diff --git a/compiler/locomotiv/src/Node/MatrixDecode.cpp b/compiler/locomotiv/src/Node/MatrixDecode.cpp
new file mode 100644
index 0000000..c591676
--- /dev/null
+++ b/compiler/locomotiv/src/Node/MatrixDecode.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeExecution.h"
+
+#include "NodeDataImpl.h"
+#include "NodeDomain.h"
+#include "Validation.h"
+
+#include <nncc/core/ADT/tensor/LexicalLayout.h>
+#include <nncc/core/ADT/tensor/IndexEnumerator.h>
+
+#include <stdexcept>
+#include <cassert>
+
+namespace
+{
+
+using nncc::core::ADT::tensor::Buffer;
+using nncc::core::ADT::tensor::make_buffer;
+using nncc::core::ADT::tensor::LexicalLayout;
+using nncc::core::ADT::tensor::Shape;
+using nncc::core::ADT::tensor::IndexEnumerator;
+using nncc::core::ADT::tensor::Index;
+
+/// @brief Decode an HW matrix buffer into a rank-2 tensor buffer as arranged by the node's decoder
+template <typename T>
+std::unique_ptr<locomotiv::NodeData> matrix_decode(const loco::MatrixDecode *node,
+                                                   const Buffer<T> *input_buf)
+{
+  const auto &matrix_dims = input_buf->shape();
+  assert(matrix_dims.rank() == 2);
+
+  // locomotiv represents a matrix as HW: dim 0 is height, dim 1 is width
+  loco::MatrixShape matrix_shape;
+  matrix_shape.height() = matrix_dims.dim(0);
+  matrix_shape.width() = matrix_dims.dim(1);
+
+  auto decoder = node->decoder();
+  loco::TensorShape tensor_shape = decoder->shape(matrix_shape);
+
+  // Output tensor buffer, sized by the shape the decoder reports
+  const Shape out_shape{tensor_shape.dim(0).value(), tensor_shape.dim(1).value()};
+  Buffer<T> out_buf = make_buffer<T, LexicalLayout>(out_shape);
+
+  // For every output element, ask the decoder which matrix element it is copied from
+  for (IndexEnumerator e{out_buf.shape()}; e.valid(); e.advance())
+  {
+    loco::MatrixIndex src = decoder->value(e.current());
+    out_buf.at(e.current()) = input_buf->at(Index({src.row(), src.column()}));
+  }
+
+  return locomotiv::make_data(out_buf);
+}
+
+} // namespace
+
+namespace locomotiv
+{
+
+/**
+ * @brief Execute MatrixDecode: turn the Matrix-domain input into a Tensor-domain annotation
+ * @throw std::runtime_error when the input dtype is neither S32 nor FLOAT32
+ */
+void NodeExecution::execute(loco::MatrixDecode *matrix_dec)
+{
+  auto input_data = annot_data(matrix_dec->input());
+
+  validate(input_data, "Input not ready");
+  validate(annot_domain(matrix_dec->input()) == loco::Domain::Matrix,
+           "Input domain should be Matrix");
+  validate(input_data->shape()->rank() == 2, "Input data rank must be 2");
+  // matrix_decode dereferences the decoder unconditionally, so an unset one is rejected here
+  validate(matrix_dec->decoder() != nullptr, "Decoder is not set");
+
+  std::unique_ptr<NodeData> matrix_dec_data = nullptr;
+
+  switch (input_data->dtype())
+  {
+    case loco::DataType::S32:
+      matrix_dec_data = matrix_decode<int32_t>(matrix_dec, input_data->as_s32_bufptr());
+      break;
+    case loco::DataType::FLOAT32:
+      matrix_dec_data = matrix_decode<float>(matrix_dec, input_data->as_f32_bufptr());
+      break;
+    default:
+      throw std::runtime_error("NYI for this DataType");
+  }
+
+  assert(matrix_dec_data != nullptr);
+
+  annot_data(matrix_dec, std::move(matrix_dec_data));
+  annot_domain(matrix_dec, loco::Domain::Tensor);
+}
+
+} // namespace locomotiv
diff --git a/compiler/locomotiv/src/Node/MatrixEncode.cpp b/compiler/locomotiv/src/Node/MatrixEncode.cpp
new file mode 100644
index 0000000..e3554e1
--- /dev/null
+++ b/compiler/locomotiv/src/Node/MatrixEncode.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeExecution.h"
+
+#include "NodeDataImpl.h"
+#include "NodeDomain.h"
+#include "Validation.h"
+
+#include <nncc/core/ADT/tensor/LexicalLayout.h>
+#include <nncc/core/ADT/tensor/IndexEnumerator.h>
+
+#include <stdexcept>
+#include <cassert>
+
+namespace
+{
+
+using nncc::core::ADT::tensor::Buffer;
+using nncc::core::ADT::tensor::make_buffer;
+using nncc::core::ADT::tensor::LexicalLayout;
+using nncc::core::ADT::tensor::Shape;
+using nncc::core::ADT::tensor::IndexEnumerator;
+
+/// @brief Encode a rank-2 tensor buffer into an HW matrix buffer as arranged by the node's encoder
+template <typename T>
+std::unique_ptr<locomotiv::NodeData> matrix_encode(const loco::MatrixEncode *node,
+                                                   const Buffer<T> *input_buf)
+{
+  const auto &tensor_dims = input_buf->shape();
+  // Mirror the input buffer shape into a loco::TensorShape for the encoder
+  loco::TensorShape tensor_shape;
+  tensor_shape.rank(tensor_dims.rank());
+  assert(tensor_shape.rank() == 2);
+  for (uint32_t axis = 0; axis < tensor_shape.rank(); ++axis)
+    tensor_shape.dim(axis) = tensor_dims.dim(axis);
+
+  auto encoder = node->encoder();
+  loco::MatrixShape matrix_shape = encoder->shape(tensor_shape);
+
+  // Output buffer is HW, sized by the shape the encoder reports
+  const Shape out_shape{matrix_shape.height().value(), matrix_shape.width().value()};
+  Buffer<T> out_buf = make_buffer<T, LexicalLayout>(out_shape);
+
+  // For every matrix element, ask the encoder which tensor element it is copied from
+  for (IndexEnumerator e{out_buf.shape()}; e.valid(); e.advance())
+  {
+    const auto &pos = e.current();
+    loco::MatrixIndex matrix_index;
+    matrix_index.row() = pos.at(0);
+    matrix_index.column() = pos.at(1);
+
+    out_buf.at(pos) = input_buf->at(encoder->value(matrix_index));
+  }
+
+  return locomotiv::make_data(out_buf);
+}
+
+} // namespace
+
+namespace locomotiv
+{
+
+/**
+ * @brief Execute MatrixEncode: turn the Tensor-domain input into a Matrix-domain annotation
+ * @throw std::runtime_error when the input dtype is neither S32 nor FLOAT32
+ */
+void NodeExecution::execute(loco::MatrixEncode *matrix_enc)
+{
+  auto input_data = annot_data(matrix_enc->input());
+
+  validate(input_data, "Input not ready");
+  validate(annot_domain(matrix_enc->input()) == loco::Domain::Tensor,
+           "Input domain should be Tensor");
+  validate(input_data->shape()->rank() == 2, "Input data rank must be 2");
+  // matrix_encode dereferences the encoder unconditionally, so an unset one is rejected here
+  validate(matrix_enc->encoder() != nullptr, "Encoder is not set");
+
+  std::unique_ptr<NodeData> matrix_enc_data = nullptr;
+
+  switch (input_data->dtype())
+  {
+    case loco::DataType::S32:
+      matrix_enc_data = matrix_encode<int32_t>(matrix_enc, input_data->as_s32_bufptr());
+      break;
+    case loco::DataType::FLOAT32:
+      matrix_enc_data = matrix_encode<float>(matrix_enc, input_data->as_f32_bufptr());
+      break;
+    default:
+      throw std::runtime_error("NYI for this DataType");
+  }
+
+  assert(matrix_enc_data != nullptr);
+
+  annot_data(matrix_enc, std::move(matrix_enc_data));
+  annot_domain(matrix_enc, loco::Domain::Matrix);
+}
+
+} // namespace locomotiv
-- 
2.7.4