[ Tensor ] Use SIMD accelerated transpose if possible
author skykongkong8 <ss.kong@samsung.com>
Thu, 9 May 2024 23:52:35 +0000 (08:52 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Tue, 4 Jun 2024 09:55:20 +0000 (18:55 +0900)
- For a height-width transpose, we can use SIMD-accelerated code.
- Use SIMD version if possible, otherwise fallback.
- Through this commit, the following are expected to be accelerated, or can be accelerated with ease in the near future:
  - "0:2:1" transpose
  - BiQHGEMM
  - HGEMM

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
nntrainer/tensor/blas_interface.cpp
nntrainer/tensor/meson.build
nntrainer/tensor/tensor.cpp

index 668c3bd4a5f3df6bd278e9bda7d0095d9f21c7cf..8c6adffed04bde8bb28bb667ff440f2ef4359467 100644 (file)
@@ -535,6 +535,7 @@ void transpose_matrix(const unsigned int M, const unsigned int N,
                       const _FP16 *src, unsigned int ld_src, _FP16 *dst,
                       unsigned int ld_dst) {
 #ifdef USE_NEON
+/// @note Final form of transpose_neon is NOT having fallback. Debugging WIP.
   if ((M & 0x3) == 0) {
     transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst);
   } else {
index 0884dbd3b44ad169d2f734015b902f273a6b6ec7..fe4204cf8523d5fd4be460b47ffb2a81ac58f6fa 100644 (file)
@@ -54,6 +54,10 @@ if get_option('enable-fp16')
       subdir('hgemm')
       nntrainer_inc += include_directories('hgemm')
       nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
+
+      subdir('matrix_transpose_neon')
+      nntrainer_inc += include_directories('matrix_transpose_neon')
+      nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
     endif
   elif get_option('enable-avx')
     tensor_sources += 'blas_avx.cpp'
index 4f1e8e072137f6a5b015c2b299a7f428a9e616f4..b379857411754b2160215c68827ed6c77b66eef9 100644 (file)
@@ -2330,7 +2330,6 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
   unsigned int SL, SI, SJ, SK;
 
   out.reshape(dim.transpose(direction));
-
   int indexI = direction[0] - '0';
   int indexJ = direction[2] - '0';
 
@@ -2402,7 +2401,14 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
         }
       } else {
         if (is_format_nchw) {
-          transposeloop(l, i, k, j, SL, SI, SK, SJ);
+          for (unsigned int b = 0; b < batch(); ++b) {
+            for (unsigned int c = 0; c < channel(); ++c) {
+              transpose_matrix(
+                height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
+                width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
+                out.width());
+            }
+          }
         } else {
           transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
         }