const _FP16 *src, unsigned int ld_src, _FP16 *dst,
unsigned int ld_dst) {
#ifdef USE_NEON
+/// @note The final form of transpose_neon will NOT have a fallback path. Debugging is still a work in progress.
if ((M & 0x3) == 0) {
transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst);
} else {
subdir('hgemm')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
+
+ subdir('matrix_transpose_neon')
+ nntrainer_inc += include_directories('matrix_transpose_neon')
+ nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
unsigned int SL, SI, SJ, SK;
out.reshape(dim.transpose(direction));
-
int indexI = direction[0] - '0';
int indexJ = direction[2] - '0';
}
} else {
if (is_format_nchw) {
- transposeloop(l, i, k, j, SL, SI, SK, SJ);
+ for (unsigned int b = 0; b < batch(); ++b) {
+ for (unsigned int c = 0; c < channel(); ++c) {
+ transpose_matrix(
+ height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
+ width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
+ out.width());
+ }
+ }
} else {
transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
}