[ Tensor ] Use SIMD accelerated transpose if possible
author skykongkong8 <ss.kong@samsung.com>
Thu, 9 May 2024 23:52:35 +0000 (08:52 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Tue, 4 Jun 2024 09:55:20 +0000 (18:55 +0900)
- For a height-width transpose, we can use SIMD-accelerated code.
- Use SIMD version if possible, otherwise fallback.
- Through this commit, the following are expected to be accelerated, or can be accelerated with ease in the near future:
  - "0:2:1" transpose
  - BiQHGEMM
  - HGEMM

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
nntrainer/tensor/blas_interface.cpp
nntrainer/tensor/meson.build
nntrainer/tensor/tensor.cpp

index 668c3bd4a5f3df6bd278e9bda7d0095d9f21c7cf..8c6adffed04bde8bb28bb667ff440f2ef4359467 100644 (file)
@@ -535,6 +535,7 @@ void transpose_matrix(const unsigned int M, const unsigned int N,
                       const _FP16 *src, unsigned int ld_src, _FP16 *dst,
                       unsigned int ld_dst) {
 #ifdef USE_NEON
+/// @note Final form of transpose_neon is NOT having fallback. Debugging WIP.
   if ((M & 0x3) == 0) {
     transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst);
   } else {
index 0884dbd3b44ad169d2f734015b902f273a6b6ec7..fe4204cf8523d5fd4be460b47ffb2a81ac58f6fa 100644 (file)
@@ -54,6 +54,10 @@ if get_option('enable-fp16')
       subdir('hgemm')
       nntrainer_inc += include_directories('hgemm')
       nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
+
+      subdir('matrix_transpose_neon')
+      nntrainer_inc += include_directories('matrix_transpose_neon')
+      nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
     endif
   elif get_option('enable-avx')
     tensor_sources += 'blas_avx.cpp'
index 4f1e8e072137f6a5b015c2b299a7f428a9e616f4..b379857411754b2160215c68827ed6c77b66eef9 100644 (file)
@@ -2330,7 +2330,6 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
   unsigned int SL, SI, SJ, SK;
 
   out.reshape(dim.transpose(direction));
-
   int indexI = direction[0] - '0';
   int indexJ = direction[2] - '0';
 
@@ -2402,7 +2401,14 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
         }
       } else {
         if (is_format_nchw) {
-          transposeloop(l, i, k, j, SL, SI, SK, SJ);
+          for (unsigned int b = 0; b < batch(); ++b) {
+            for (unsigned int c = 0; c < channel(); ++c) {
+              transpose_matrix(
+                height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
+                width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
+                out.width());
+            }
+          }
         } else {
           transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
         }