[ trivial ] Add doxygen tags in matrix transpose functions
author skykongkong8 <ss.kong@samsung.com>
Thu, 23 May 2024 04:34:23 +0000 (13:34 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Tue, 4 Jun 2024 09:55:20 +0000 (18:55 +0900)
- add doxygen tags to avoid CI failure
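- trivial formatting (also wraps TRANSPOSE_FP16_4x4 in do { } while (0) and swaps the loop nesting order in transpose_fallback)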

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
nntrainer/tensor/blas_interface.cpp
nntrainer/tensor/matrix_transpose_neon/matrix_transpose_kernels_neon.h
nntrainer/tensor/tensor.cpp

index ac2085eaaf02e015a3e7cf550bb91031ac55f4ed..e04c1ce499d6ec78f1c935396342924ba65ee44f 100644 (file)
 namespace nntrainer {
 
 template <typename T>
-static inline void transpose_fallback(
-    unsigned int M,
-    unsigned int N,
-    const T* src,
-    unsigned int ld_src,
-    T* dst,
-    unsigned int ld_dst) {
-  for (unsigned int j = 0; j < N; j++) {
-    for (unsigned int i = 0; i < M; i++) {
+static inline void transpose_fallback(unsigned int M, unsigned int N,
+                                      const T *src, unsigned int ld_src, T *dst,
+                                      unsigned int ld_dst) {
+  for (unsigned int i = 0; i < M; i++) {
+    for (unsigned int j = 0; j < N; j++) {
       dst[i + j * ld_dst] = src[i * ld_src + j];
     }
   }
index b8f7f2d99ac32720aa27231461616cf11e4326e3..fd8290eca39399be6bad2cea58fb0f898461be75 100644 (file)
 #include <cstdint>
 #include <mask_neon.h>
 
-#define TRANSPOSE_FP16_4x4(row0, row1, row2, row3)                             \
-  float16x4x2_t row01 = vtrn_f16(row0, row1);                                  \
-  float16x4x2_t row23 = vtrn_f16(row2, row3);                                  \
-  row0 = vcvt_f16_f32(vcombine_f32(vget_low_f32(vcvt_f32_f16(row01.val[0])),   \
-                                   vget_low_f32(vcvt_f32_f16(row23.val[0])))); \
-  row1 = vcvt_f16_f32(vcombine_f32(vget_low_f32(vcvt_f32_f16(row01.val[1])),   \
-                                   vget_low_f32(vcvt_f32_f16(row23.val[1])))); \
-  row2 =                                                                       \
-    vcvt_f16_f32(vcombine_f32(vget_high_f32(vcvt_f32_f16(row01.val[0])),       \
-                              vget_high_f32(vcvt_f32_f16(row23.val[0]))));     \
-  row3 =                                                                       \
-    vcvt_f16_f32(vcombine_f32(vget_high_f32(vcvt_f32_f16(row01.val[1])),       \
-                              vget_high_f32(vcvt_f32_f16(row23.val[1]))));
-
+#define TRANSPOSE_FP16_4x4(row0, row1, row2, row3)                           \
+  do {                                                                       \
+    float16x4x2_t row01 = vtrn_f16(row0, row1);                              \
+    float16x4x2_t row23 = vtrn_f16(row2, row3);                              \
+    row0 =                                                                   \
+      vcvt_f16_f32(vcombine_f32(vget_low_f32(vcvt_f32_f16(row01.val[0])),    \
+                                vget_low_f32(vcvt_f32_f16(row23.val[0]))));  \
+    row1 =                                                                   \
+      vcvt_f16_f32(vcombine_f32(vget_low_f32(vcvt_f32_f16(row01.val[1])),    \
+                                vget_low_f32(vcvt_f32_f16(row23.val[1]))));  \
+    row2 =                                                                   \
+      vcvt_f16_f32(vcombine_f32(vget_high_f32(vcvt_f32_f16(row01.val[0])),   \
+                                vget_high_f32(vcvt_f32_f16(row23.val[0])))); \
+    row3 =                                                                   \
+      vcvt_f16_f32(vcombine_f32(vget_high_f32(vcvt_f32_f16(row01.val[1])),   \
+                                vget_high_f32(vcvt_f32_f16(row23.val[1])))); \
+  } while (0)
+/**
+ * @brief 4x4 sized kernel for matrix transpose in NEON
+ *
+ * @param src __fp16* source data
+ * @param ld_src leading dimension of src (stride between rows, in elements)
+ * @param dst __fp16* destination data
+ * @param ld_dst leading dimension of dst (stride between rows, in elements)
+ */
 static inline void transpose_kernel_4x4_neon(const __fp16 *src,
                                              unsigned int ld_src, __fp16 *dst,
                                              unsigned int ld_dst) {
@@ -46,6 +57,17 @@ static inline void transpose_kernel_4x4_neon(const __fp16 *src,
   vst1_f16(&dst[3 * ld_dst], d);
 }
 
+/**
+ * @brief general case mxn sized matrix transpose kernel with 128 bit SIMD
+ * register
+ *
+ * @tparam M leftover size for row direction
+ * @param N leftover size for col direction
+ * @param src __fp16* source data
+ * @param ld_src leading dimension of src (stride between rows, in elements)
+ * @param dst __fp16* destination data
+ * @param ld_dst leading dimension of dst (stride between rows, in elements)
+ */
 template <unsigned int M>
 static void transpose_kernel_mxn_neon_128(unsigned int N, const __fp16 *src,
                                           unsigned int ld_src, __fp16 *dst,
@@ -88,7 +110,14 @@ static void transpose_kernel_mxn_neon_128(unsigned int N, const __fp16 *src,
              vbsl_f16(bitmask_v8, input[i], vld1_f16(&dst[i * ld_dst])));
   }
 }
-
+/**
+ * @brief 8x8 sized kernel for matrix transpose in NEON
+ *
+ * @param src __fp16* source data
+ * @param ld_src leading dimension of src (stride between rows, in elements)
+ * @param dst __fp16* destination data
+ * @param ld_dst leading dimension of dst (stride between rows, in elements)
+ */
 static inline void transpose_kernel_8x8_neon(const __fp16 *src,
                                              unsigned int ld_src, __fp16 *dst,
                                              unsigned int ld_dst) {
@@ -153,7 +182,17 @@ static inline void transpose_kernel_8x8_neon(const __fp16 *src,
   vst1q_f16(&dst[6 * ld_dst], g);
   vst1q_f16(&dst[7 * ld_dst], h);
 }
-
+/**
+ * @brief general case mxn sized matrix transpose kernel with 256 bit SIMD
+ * register
+ *
+ * @tparam M leftover size for row direction
+ * @param N leftover size for col direction
+ * @param src __fp16* source data
+ * @param ld_src leading dimension of src (stride between rows, in elements)
+ * @param dst __fp16* destination data
+ * @param ld_dst leading dimension of dst (stride between rows, in elements)
+ */
 template <unsigned int M>
 static void transpose_kernel_mxn_neon_256(unsigned int N, const __fp16 *src,
                                           unsigned int ld_src, __fp16 *dst,
index b379857411754b2160215c68827ed6c77b66eef9..5dc3c93f01fe5abaa811712ff1bd4f7acfa63b3f 100644 (file)
@@ -2403,10 +2403,10 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
         if (is_format_nchw) {
           for (unsigned int b = 0; b < batch(); ++b) {
             for (unsigned int c = 0; c < channel(); ++c) {
-              transpose_matrix(
-                height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
-                width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
-                out.width());
+              transpose_matrix(height(), width(),
+                               getData<_FP16>() + getIndex(b, c, 0, 0), width(),
+                               out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
+                               out.width());
             }
           }
         } else {