From fb60165ac3ad3df5f3b6834e87dbc31a9b356afd Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=98=A4=ED=98=95=EC=84=9D/On-Device=20Lab=28SR=29/Staff?= =?utf8?q?=20Engineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Tue, 29 Oct 2019 12:51:21 +0900 Subject: [PATCH] [nnfw-ncnn] Restructure directory and introduce mat class (#8532) Restructure ncnn directory - Introduce internal srcn directory to divide with original ncnn directory - Introduce mat.cpp and mat.h - Update copyright Signed-off-by: Hyeongseok Oh --- compute/ncnn/README.md | 9 + compute/ncnn/include/ncnn/mat.h | 738 ++++++++++++++++ compute/ncnn/include/{ => ncnn}/srcn/conv_type.h | 0 compute/ncnn/include/{ => ncnn}/srcn/srcn_conv.h | 0 compute/ncnn/src/mat.cpp | 938 +++++++++++++++++++++ compute/ncnn/src/{ => srcn}/common.h | 2 +- .../ncnn/src/{ => srcn}/conv_sgemm_multithreads.cc | 2 +- .../ncnn/src/{ => srcn}/conv_sgemm_multithreads.h | 2 +- .../ncnn/src/{ => srcn}/conv_sgemm_singlethread.cc | 0 .../ncnn/src/{ => srcn}/conv_sgemm_singlethread.h | 2 +- compute/ncnn/src/{ => srcn}/conv_sparse.cc | 0 compute/ncnn/src/{ => srcn}/conv_sparse.h | 2 +- compute/ncnn/src/{ => srcn}/conv_winograd.cc | 0 compute/ncnn/src/{ => srcn}/conv_winograd.h | 2 +- compute/ncnn/src/{ => srcn}/conv_winograd_batch.cc | 0 compute/ncnn/src/{ => srcn}/conv_winograd_batch.h | 2 +- .../src/{ => srcn}/deconv_sgemm_multithreads.cc | 0 .../src/{ => srcn}/deconv_sgemm_multithreads.h | 2 +- compute/ncnn/src/{ => srcn}/depthwise_conv.cc | 2 +- .../ncnn/src/{ => srcn}/direct_conv_colmajor.cc | 2 +- compute/ncnn/src/{ => srcn}/direct_conv_colmajor.h | 2 +- compute/ncnn/src/{ => srcn}/sgemm_kernel.cc | 0 compute/ncnn/src/{ => srcn}/sgemm_kernel.h | 2 +- compute/ncnn/src/{ => srcn}/sgemm_pack.cc | 2 +- compute/ncnn/src/{ => srcn}/sgemm_pack.h | 2 +- compute/ncnn/src/{ => srcn}/sgemm_singlethread.cc | 0 compute/ncnn/src/{ => srcn}/sgemm_singlethread.h | 0 compute/ncnn/src/{ => srcn}/sgemm_test.cc | 12 +- compute/ncnn/src/{ => srcn}/srcn_conv.cc | 2 +- compute/ncnn/src/{ => srcn}/winograd.h | 0 .../backend/srcn/kernel/TransposeConvLayer.cc | 2 +- 31 files changed, 1707 insertions(+), 22 deletions(-) create mode 100644 compute/ncnn/README.md create mode 100644 compute/ncnn/include/ncnn/mat.h rename compute/ncnn/include/{ => ncnn}/srcn/conv_type.h (100%) rename compute/ncnn/include/{ => ncnn}/srcn/srcn_conv.h (100%) create mode 100644 compute/ncnn/src/mat.cpp rename compute/ncnn/src/{ => srcn}/common.h (99%) rename compute/ncnn/src/{ => srcn}/conv_sgemm_multithreads.cc (99%) rename compute/ncnn/src/{ => srcn}/conv_sgemm_multithreads.h (98%) rename compute/ncnn/src/{ => srcn}/conv_sgemm_singlethread.cc (100%) rename compute/ncnn/src/{ => srcn}/conv_sgemm_singlethread.h (97%) rename compute/ncnn/src/{ => srcn}/conv_sparse.cc (100%) rename compute/ncnn/src/{ => srcn}/conv_sparse.h (97%) rename compute/ncnn/src/{ => srcn}/conv_winograd.cc (100%) rename compute/ncnn/src/{ => srcn}/conv_winograd.h (98%) rename compute/ncnn/src/{ => srcn}/conv_winograd_batch.cc (100%) rename compute/ncnn/src/{ => srcn}/conv_winograd_batch.h (98%) rename compute/ncnn/src/{ => srcn}/deconv_sgemm_multithreads.cc (100%) rename compute/ncnn/src/{ => srcn}/deconv_sgemm_multithreads.h (98%) rename compute/ncnn/src/{ => srcn}/depthwise_conv.cc (99%) rename compute/ncnn/src/{ => srcn}/direct_conv_colmajor.cc (99%) rename compute/ncnn/src/{ => srcn}/direct_conv_colmajor.h (96%) rename compute/ncnn/src/{ => srcn}/sgemm_kernel.cc (100%) rename compute/ncnn/src/{ => srcn}/sgemm_kernel.h (98%) rename compute/ncnn/src/{ => srcn}/sgemm_pack.cc (99%) rename compute/ncnn/src/{ => srcn}/sgemm_pack.h (99%) rename compute/ncnn/src/{ => srcn}/sgemm_singlethread.cc (100%) rename compute/ncnn/src/{ => srcn}/sgemm_singlethread.h (100%) rename compute/ncnn/src/{ => srcn}/sgemm_test.cc (99%) rename compute/ncnn/src/{ => srcn}/srcn_conv.cc (99%) rename compute/ncnn/src/{ => srcn}/winograd.h (100%) diff --git a/compute/ncnn/README.md b/compute/ncnn/README.md new file mode 100644 index 0000000..5c39d24 --- /dev/null +++ b/compute/ncnn/README.md @@ -0,0 +1,9 @@ +### NCNN compute library + +This compute library is based on NCNN project (https://github.com/Tencent/ncnn) with custom optimization + +Current base commit: https://github.com/Tencent/ncnn/commit/0219f507b71bdb945d776c8586c162f2c22bba54 + +Added files for custom optimization is placed on +- Headers: include/ncnn/srcn +- Soruces: src/srcn diff --git a/compute/ncnn/include/ncnn/mat.h b/compute/ncnn/include/ncnn/mat.h new file mode 100644 index 0000000..2a57793 --- /dev/null +++ b/compute/ncnn/include/ncnn/mat.h @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_NCNN_MAT_H__ +#define __NNFW_NCNN_MAT_H__ + +#include +#include +#if __ARM_NEON +#include +#endif + +namespace nnfw +{ +namespace ncnn +{ + +// the three dimension matrix +class Mat +{ +public: + // empty + Mat(); + // vec + Mat(int w, size_t elemsize = 4); + // image + Mat(int w, int h, size_t elemsize = 4); + // dim + Mat(int w, int h, int c, size_t elemsize = 4); + // copy + Mat(const Mat &m); + // external vec + Mat(int w, void *data, size_t elemsize = 4); + // external image + Mat(int w, int h, void *data, size_t elemsize = 4); + // external dim + Mat(int w, int h, int c, void *data, size_t elemsize = 4); + // release + ~Mat(); + // assign + Mat &operator=(const Mat &m); + // set all + void fill(float v); + template void fill(T v); + // deep copy + Mat clone() const; + // reshape vec + Mat reshape(int w) const; + // reshape image + Mat reshape(int w, int h) const; + // reshape dim + Mat reshape(int w, int h, int c) const; + // allocate vec + void create(int w, size_t elemsize = 4); + // allocate image + void create(int w, int h, size_t elemsize = 4); +// allocate dim +#ifdef _MEMORY_TO_TIME_ + void create(int w, int h, int c, size_t elemsize = 4, bool isNew = false); +#else + void create(int w, int h, int c, size_t elemsize = 4); +#endif +#ifdef USE_OPENCL_INSIDE + void create_empity_mat(int _w, int _h, int _c, size_t _elemsize); +#endif + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // data reference + Mat channel(int c); + const Mat channel(int c) const; + float *row(int y); + const float *row(int y) const; + template T *row(int y); + template const T *row(int y) const; + + // access raw data + template operator T *(); + template operator const T *() const; + + // convenient access float vec element + float &operator[](int i); + const float &operator[](int i) const; + + enum + { + PIXEL_CONVERT_SHIFT = 16, + PIXEL_FORMAT_MASK = 0x0000ffff, + PIXEL_CONVERT_MASK = 0xffff0000, + + PIXEL_RGB = 1, + PIXEL_BGR = (1 << 1), + PIXEL_GRAY = (1 << 2), + PIXEL_RGBA = (1 << 3), + + PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + + PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + + PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + + PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + }; + +#ifdef _MEMORY_TO_TIME_ + static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h); + static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h, int top, + int bottom, int left, int right); +#endif // _MEMORY_TO_TIME_ + + // convenient construct from pixel data + static Mat from_pixels(const unsigned char *pixels, int type, int w, int h); + // convenient construct from pixel data and add the padding && only supports same PIXEL_RGB2BGR + // and PIXEL_BGR2RGB now + static Mat from_pixels(const unsigned char *pixels, int type, int w, int h, int top, int bottom, + int left, int right); + // convenient construct from pixel data and resize to specific size + static Mat from_pixels_resize(const unsigned char *pixels, int type, int w, int h, + int target_width, int target_height); + + // convenient export to pixel data + void to_pixels(unsigned char *pixels, int type); + // convenient export to pixel data and cut the padding && only supports same PIXEL_RGB2BGR and + // PIXEL_BGR2RGB now + void to_pixels(unsigned char *pixels, int type, int top, int bottom, int left, int right); + // convenient export to pixel data and resize to specific size + void to_pixels_resize(unsigned char *pixels, int type, int target_width, int target_height); + + // substract channel-wise mean values, then multiply by normalize values, pass 0 to skip + void substract_mean_normalize(const float *mean_vals, const float *norm_vals); + + // convenient construct from half precisoin floating point data + static Mat from_float16(const unsigned short *data, int size); + + // pointer to the data + void *data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int *refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // the dimensionality + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + +// misc function +// image pixel bilinear resize +void resize_bilinear_c1(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); +void resize_bilinear_c3(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); +void resize_bilinear_c4(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); + +// mat process +enum +{ + BORDER_CONSTANT = 0, + BORDER_REPLICATE = 1, +}; +void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, + float v); +void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right); +void resize_bilinear(const Mat &src, Mat &dst, int w, int h); + +// the alignment of all the allocated buffers +#define MALLOC_ALIGN 16 + +// Aligns a pointer to the specified number of bytes +// ptr Aligned pointer +// n Alignment size that must be a power of two +template static inline _Tp *alignPtr(_Tp *ptr, int n = (int)sizeof(_Tp)) +{ + return (_Tp *)(((size_t)ptr + n - 1) & -n); +} + +// Aligns a buffer size to the specified number of bytes +// The function returns the minimum number that is greater or equal to sz and is divisible by n +// sz Buffer size to align +// n Alignment size that must be a power of two +static inline size_t alignSize(size_t sz, int n) { return (sz + n - 1) & -n; } + +static inline void *fastMalloc(size_t size) +{ + unsigned char *udata = (unsigned char *)malloc(size + sizeof(void *) + MALLOC_ALIGN); + if (!udata) + return 0; + unsigned char **adata = alignPtr((unsigned char **)udata + 1, MALLOC_ALIGN); + adata[-1] = udata; + return adata; +} + +static inline void fastFree(void *ptr) +{ + if (ptr) + { + unsigned char *udata = ((unsigned char **)ptr)[-1]; + free(udata); + } +} + +// exchange-add operation for atomic operations on reference counters +#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32) +// atomic increment on the linux version of the Intel(tm) compiler +#define NCNN_XADD(addr, delta) \ + (int)_InterlockedExchangeAdd(const_cast(reinterpret_cast(addr)), delta) +#elif defined __GNUC__ +#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && \ + !defined __EMSCRIPTEN__ && !defined(__CUDACC__) +#ifdef __ATOMIC_ACQ_REL +#define NCNN_XADD(addr, delta) \ + __c11_atomic_fetch_add((_Atomic(int) *)(addr), delta, __ATOMIC_ACQ_REL) +#else +#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int) *)(addr), delta, 4) +#endif +#else +#if defined __ATOMIC_ACQ_REL && !defined __clang__ +// version for gcc >= 4.7 +#define NCNN_XADD(addr, delta) \ + (int)__atomic_fetch_add((unsigned *)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL) +#else +#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned *)(addr), (unsigned)(delta)) +#endif +#endif +#elif defined _MSC_VER && !defined RC_INVOKED +#include +#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile *)addr, delta) +#else +static inline void NCNN_XADD(int *addr, int delta) +{ + int tmp = *addr; + *addr += delta; + return tmp; +} +#endif + +inline Mat::Mat() : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0) {} + +inline Mat::Mat(int _w, size_t _elemsize) : data(0), refcount(0), dims(0) { create(_w, _elemsize); } + +inline Mat::Mat(int _w, int _h, size_t _elemsize) : data(0), refcount(0), dims(0) +{ + create(_w, _h, _elemsize); +} + +inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize) : data(0), refcount(0), dims(0) +{ + create(_w, _h, _c, _elemsize); +} + +inline Mat::Mat(const Mat &m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims) +{ + if (refcount) + NCNN_XADD(refcount, 1); + + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; +} + +inline Mat::Mat(int _w, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(1) +{ + w = _w; + h = 1; + c = 1; + + cstep = w; +} + +inline Mat::Mat(int _w, int _h, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(2) +{ + w = _w; + h = _h; + c = 1; + + cstep = w * h; +} + +inline Mat::Mat(int _w, int _h, int _c, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(3) +{ + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Mat::~Mat() { release(); } + +inline Mat &Mat::operator=(const Mat &m) +{ + if (this == &m) + return *this; + + if (m.refcount) + NCNN_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline void Mat::fill(float _v) +{ + int size = total(); + float *ptr = (float *)data; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON + float32x4_t _c = vdupq_n_f32(_v); +#if __aarch64__ + if (nn > 0) + { + asm volatile("0: \n" + "subs %w0, %w0, #1 \n" + "st1 {%4.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "w"(_c) // %4 + : "cc", "memory"); + } +#else + if (nn > 0) + { + asm volatile("0: \n" + "subs %0, #1 \n" + "vst1.f32 {%e4-%f4}, [%1 :128]!\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "w"(_c) // %4 + : "cc", "memory"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr++ = _v; + } +} + +template inline void Mat::fill(T _v) +{ + int size = total(); + T *ptr = (T *)data; + for (int i = 0; i < size; i++) + { + ptr[i] = _v; + } +} + +inline Mat Mat::clone() const +{ + if (empty()) + return Mat(); + + Mat m; + if (dims == 1) + m.create(w, elemsize); + else if (dims == 2) + m.create(w, h, elemsize); + else if (dims == 3) + m.create(w, h, c, elemsize); + + if (total() > 0) + { + memcpy(m.data, data, total() * elemsize); + } + + return m; +} + +inline Mat Mat::reshape(int _w) const +{ + if (w * h * c != _w) + return Mat(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Mat m; + m.create(_w, elemsize); + + // flatten + for (int i = 0; i < c; i++) + { + const void *ptr = (unsigned char *)data + i * cstep * elemsize; + void *mptr = (unsigned char *)m.data + i * w * h * elemsize; + memcpy(mptr, ptr, w * h * elemsize); + } + + return m; + } + + Mat m = *this; + + m.dims = 1; + m.w = _w; + m.h = 1; + m.c = 1; + + m.cstep = _w; + + return m; +} + +inline Mat Mat::reshape(int _w, int _h) const +{ + if (w * h * c != _w * _h) + return Mat(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Mat m; + m.create(_w, _h, elemsize); + + // flatten + for (int i = 0; i < c; i++) + { + const void *ptr = (unsigned char *)data + i * cstep * elemsize; + void *mptr = (unsigned char *)m.data + i * w * h * elemsize; + memcpy(mptr, ptr, w * h * elemsize); + } + + return m; + } + + Mat m = *this; + + m.dims = 2; + m.w = _w; + m.h = _h; + m.c = 1; + + m.cstep = _w * _h; + + return m; +} + +inline Mat Mat::reshape(int _w, int _h, int _c) const +{ + if (w * h * c != _w * _h * _c) + return Mat(); + + if (dims < 3) + { + if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize) + { + Mat m; + m.create(_w, _h, _c, elemsize); + + // align channel + for (int i = 0; i < _c; i++) + { + const void *ptr = (unsigned char *)data + i * _w * _h * elemsize; + void *mptr = (unsigned char *)m.data + i * m.cstep * m.elemsize; + memcpy(mptr, ptr, _w * _h * elemsize); + } + + return m; + } + } + else if (c != _c) + { + // flatten and then align + Mat tmp = reshape(_w * _h * _c); + return tmp.reshape(_w, _h, _c); + } + + Mat m = *this; + + m.dims = 3; + m.w = _w; + m.h = _h; + m.c = _c; + + m.cstep = alignSize(_w * _h * elemsize, 16) / elemsize; + + return m; +} + +inline void Mat::create(int _w, size_t _elemsize) +{ + if (dims == 1 && w == _w && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + +inline void Mat::create(int _w, int _h, size_t _elemsize) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + +#ifdef _MEMORY_TO_TIME_ +inline void Mat::create(int _w, int _h, int _c, size_t _elemsize, bool isNew) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + if (!isNew && dims == 3) + { + elemsize = _elemsize; + + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + return; + } + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + +#else +inline void Mat::create(int _w, int _h, int _c, size_t _elemsize) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} +#endif //_MEMORY_TO_TIME_ + +#ifdef USE_OPENCL_INSIDE +inline void Mat::create_empity_mat(int _w, int _h, int _c, size_t _elemsize) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + data = NULL; +} +#endif // USE_OPENCL_INSIDE + +inline void Mat::addref() +{ + if (refcount) + NCNN_XADD(refcount, 1); +} + +inline void Mat::release() +{ + if (refcount && NCNN_XADD(refcount, -1) == 1) + fastFree(data); + + data = 0; + + elemsize = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + cstep = 0; + + refcount = 0; +} + +inline bool Mat::empty() const { return data == 0 || total() == 0; } + +inline size_t Mat::total() const { return cstep * c; } + +inline Mat Mat::channel(int c) +{ + return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize); +} + +inline const Mat Mat::channel(int c) const +{ + return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize); +} + +inline float *Mat::row(int y) { return (float *)data + w * y; } + +inline const float *Mat::row(int y) const { return (const float *)data + w * y; } + +template inline T *Mat::row(int y) { return (T *)data + w * y; } + +template inline const T *Mat::row(int y) const { return (const T *)data + w * y; } + +template inline Mat::operator T *() { return (T *)data; } + +template inline Mat::operator const T *() const { return (const T *)data; } + +inline float &Mat::operator[](int i) { return ((float *)data)[i]; } + +inline const float &Mat::operator[](int i) const { return ((const float *)data)[i]; } + +} // namespace ncnn +} // namespace nnfw + +#endif // __NNFW_NCNN_MAT_H__ diff --git a/compute/ncnn/include/srcn/conv_type.h b/compute/ncnn/include/ncnn/srcn/conv_type.h similarity index 100% rename from compute/ncnn/include/srcn/conv_type.h rename to compute/ncnn/include/ncnn/srcn/conv_type.h diff --git a/compute/ncnn/include/srcn/srcn_conv.h b/compute/ncnn/include/ncnn/srcn/srcn_conv.h similarity index 100% rename from compute/ncnn/include/srcn/srcn_conv.h rename to compute/ncnn/include/ncnn/srcn/srcn_conv.h diff --git a/compute/ncnn/src/mat.cpp b/compute/ncnn/src/mat.cpp new file mode 100644 index 0000000..071334c --- /dev/null +++ b/compute/ncnn/src/mat.cpp @@ -0,0 +1,938 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mat.h" + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +// Fix for nnfw: comment out cpu.h +//#include "cpu.h" + +namespace nnfw +{ +namespace ncnn +{ + +void Mat::substract_mean_normalize(const float *mean_vals, const float *norm_vals) +{ + int size = w * h; + + if (mean_vals && !norm_vals) + { +// substract mean only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr -= mean; + ptr++; + } + } + } + else if (!mean_vals && norm_vals) + { +// normalize only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vmul.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr *= norm; + ptr++; + } + } + } + else if (mean_vals && norm_vals) + { +// substract mean and normalize +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "dup v2.4s, %w5 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "fmul v0.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "v0", "v1", "v2"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "vdup.f32 q2, %5 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "vmul.f32 q0, q0, q2 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "q0", "q1", "q2"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = (*ptr - mean) * norm; + ptr++; + } + } + } +} + +// convert half precision floating point to float +static float half2float(unsigned short value) +{ + // 1 : 5 : 10 + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + // fprintf(stderr, "%d %d %d\n", sign, exponent, significand); + + // 1 : 8 : 23 + union { + unsigned int u; + float f; + } tmp; + if (exponent == 0) + { + if (significand == 0) + { + // zero + tmp.u = (sign << 31); + } + else + { + // denormal + exponent = 0; + // find non-zero bit + while ((significand & 0x200) == 0) + { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } + else if (exponent == 0x1F) + { + // infinity or NaN + tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); + } + else + { + // normalized + tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + + return tmp.f; +} + +Mat Mat::from_float16(const unsigned short *data, int size) +{ + Mat m(size); + if (m.empty()) + return m; + + float *ptr = m; //.data; + +#if __ARM_NEON && (__ARM_FP & 2) + int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON && (__ARM_FP & 2) +#if __aarch64__ + if (nn > 0) + { + asm volatile("0: \n" + "ld1 {v0.4h}, [%1], #8 \n" + "fcvtl v1.4s, v0.4h \n" + "subs %w0, %w0, #1 \n" + "st1 {v1.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("0: \n" + "pld [%1, #64] \n" + "vld1.s16 {d0}, [%1 :64]! \n" + "vcvt.f32.f16 q1, d0 \n" + "subs %0, #1 \n" + "vst1.f32 {d2-d3}, [%2 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = half2float(*data); + + data++; + ptr++; + } + + return m; +} + +static void copy_make_border_image(const Mat &src, Mat &dst, int top, int left, int type, float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; //.data; + float *outptr = dst; //.data; + + if (type == BORDER_CONSTANT) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = v; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = v; + } + ptr += src.w; + outptr += w; + } + // fill bottom + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + } + else if (type == BORDER_REPLICATE) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + ptr += src.w; + outptr += w; + } + // fill bottom + ptr -= src.w; + for (; y < h; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + } +} + +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) +static void copy_make_border_image_inplace(const Mat &src, Mat &dst, int top, int left, int type, + float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; + float *outptr = dst; + + if (type == BORDER_CONSTANT) + { + // fill bottom + int y = src.h + top; + outptr += y * w; + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + + // fill center + y = src.h + top - 1; + outptr = dst; + outptr += y * w; + ptr += (src.h - 1) * src.w; + + for (; y >= top; y--) + { + int x = left + src.w; + for (; x < w; x++) + { + outptr[x] = v; + } + + x = left + src.w - 1; + + for (; x >= left; x--) + { + outptr[x] = ptr[x - left]; + } + + for (x = 0; x < left; x++) + { + outptr[x] = v; + } + ptr -= src.w; + outptr -= w; + } + + // fill top + y = 0; + outptr = dst; + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + } +} +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + +void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, + float v) +{ + int w = src.w + left + right; + int h = src.h + top + bottom; + + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + copy_make_border_image(src, dst, top, left, type, v); + } + else if (src.dims == 3) + { + int channels = src.c; + dst.create(w, h, channels); + if (dst.empty()) + return; + + if (src.data != dst.data) + { +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } + } + else + { +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) + for (int q = channels - 1; q >= 0; q--) + { + Mat m = src.channel(q); + Mat borderm = dst.channel(q); + copy_make_border_image_inplace(m, borderm, top, left, type, v); + } +#else +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + } + } +} + +static void copy_cut_border_image(const Mat &src, Mat &dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src.row(top) + left; //.data + src.w * top + left; + float *outptr = dst; //.data; + + for (int y = 0; y < h; y++) + { + if (w < 12) + { + for (int x = 0; x < w; x++) + { + outptr[x] = ptr[x]; + } + } + else + { + memcpy(outptr, ptr, w * sizeof(float)); + } + outptr += w; + ptr += src.w; + } +} + +void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right) +{ + int w = src.w - left - right; + int h = src.h - top - bottom; + +#ifndef _MEMORY_TO_TIME_ + if (w == src.w && h == src.h) + { + dst = src; + return; + } +#endif + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + copy_cut_border_image(src, dst, top, left); + } + else if (src.dims == 3) + { + int channels = src.c; + + dst.create(w, h, channels); + if (dst.empty()) + return; + +#if !defined(_MEMORY_TO_TIME_) || !defined(_TIME_TO_MEMORY_) +// unroll image channel +#pragma omp parallel for +#endif + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat cutm = dst.channel(q); + + copy_cut_border_image(m, cutm, top, left); + } + } +} + +static void resize_bilinear_image(const Mat &src, Mat &dst, int w, int h) +{ + double scale_x = (double)src.w / w; + double scale_y = (double)src.h / h; + + int *buf = new int[w + h + w * 2 + h * 2]; + + int *xofs = buf; // new int[w]; + int *yofs = buf + w; // new int[h]; + + float *alpha = (float *)(buf + w + h); // new float[w * 2]; + float *beta = (float *)(buf + w + h + w * 2); // new float[h * 2]; + + float fx; + float fy; + int sx; + int sy; + + for (int dx = 0; dx < w; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = fx; // cvFloor(fx); + fx -= sx; + + if (sx >= src.w - 1) + { + sx = src.w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + + for (int dy = 0; dy < h; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = fy; // cvFloor(fy); + fy -= sy; + + if (sy >= src.h - 1) + { + sy = src.h - 2; + fy = 1.f; + } + + yofs[dy] = sy; + + beta[dy * 2] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } + + // loop body + Mat rowsbuf0(w + 1); + Mat rowsbuf1(w + 1); + float *rows0 = rowsbuf0; + float *rows1 = rowsbuf1; + + int prev_sy1 = -1; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // hresize one row + float *rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows1p = rows1; + int dx = 0; +#if __ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S1p = S1 + sx; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float *S0 = src.row(sy); + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows0p = rows0; + float *rows1p = rows1; + int dx = 0; +#if __ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + const float *S0np = S0 + sxn; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S0 = vld1_f32(S0p); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S0n = vld1_f32(S0np); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows0p + dx, _rows0); + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy + 1; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float *rows0p = rows0; + float *rows1p = rows1; + float *Dp = dst.row(dy); + +#if __ARM_NEON + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __ARM_NEON + float32x4_t _b0 = vdupq_n_f32(b0); + float32x4_t _b1 = vdupq_n_f32(b1); + for (; nn > 0; nn--) + { + float32x4_t _rows0 = vld1q_f32(rows0p); + float32x4_t _rows1 = vld1q_f32(rows1p); + + float32x4_t _D = vmulq_f32(_rows0, _b0); + _D = vmlaq_f32(_D, _rows1, _b1); + + vst1q_f32(Dp, _D); + + float32x4_t _rows0n = vld1q_f32(rows0p + 4); + float32x4_t _rows1n = vld1q_f32(rows1p + 4); + + float32x4_t _Dn = vmulq_f32(_rows0n, _b0); + _Dn = vmlaq_f32(_Dn, _rows1n, _b1); + + vst1q_f32(Dp + 4, _Dn); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __ARM_NEON + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } + + delete[] buf; +} + +void resize_bilinear(const Mat &src, Mat &dst, int w, int h) +{ + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + resize_bilinear_image(src, dst, w, h); + } + else if (src.dims == 3) + { + int channels = src.c; + + dst.create(w, h, channels); + if (dst.empty()) + return; + +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat resizem = dst.channel(q); + + resize_bilinear_image(m, resizem, w, h); + } + } +} + +} // namespace ncnn +} // namespace nnfw diff --git a/compute/ncnn/src/common.h b/compute/ncnn/src/srcn/common.h similarity index 99% rename from compute/ncnn/src/common.h rename to compute/ncnn/src/srcn/common.h index e8abc14..778a17a 100644 --- a/compute/ncnn/src/common.h +++ b/compute/ncnn/src/srcn/common.h @@ -21,7 +21,7 @@ #include #include -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/conv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc similarity index 99% rename from compute/ncnn/src/conv_sgemm_multithreads.cc rename to compute/ncnn/src/srcn/conv_sgemm_multithreads.cc index 91a4533..9946b6f 100644 --- a/compute/ncnn/src/conv_sgemm_multithreads.cc +++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc @@ -18,7 +18,7 @@ #include #endif -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" #include "sgemm_kernel.h" #include "sgemm_pack.h" diff --git a/compute/ncnn/src/conv_sgemm_multithreads.h b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h similarity index 98% rename from compute/ncnn/src/conv_sgemm_multithreads.h rename to compute/ncnn/src/srcn/conv_sgemm_multithreads.h index 8cb5269..9c9ce74 100644 --- a/compute/ncnn/src/conv_sgemm_multithreads.h +++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__ #define __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" namespace nnfw diff --git a/compute/ncnn/src/conv_sgemm_singlethread.cc b/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc similarity index 100% rename from compute/ncnn/src/conv_sgemm_singlethread.cc rename to compute/ncnn/src/srcn/conv_sgemm_singlethread.cc diff --git a/compute/ncnn/src/conv_sgemm_singlethread.h b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h similarity index 97% rename from compute/ncnn/src/conv_sgemm_singlethread.h rename to compute/ncnn/src/srcn/conv_sgemm_singlethread.h index 06713e6..63f8b6e 100644 --- a/compute/ncnn/src/conv_sgemm_singlethread.h +++ b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__ #define __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" namespace nnfw diff --git a/compute/ncnn/src/conv_sparse.cc b/compute/ncnn/src/srcn/conv_sparse.cc similarity index 100% rename from compute/ncnn/src/conv_sparse.cc rename to compute/ncnn/src/srcn/conv_sparse.cc diff --git a/compute/ncnn/src/conv_sparse.h b/compute/ncnn/src/srcn/conv_sparse.h similarity index 97% rename from compute/ncnn/src/conv_sparse.h rename to compute/ncnn/src/srcn/conv_sparse.h index 3541ff1..a9b3c74 100644 --- a/compute/ncnn/src/conv_sparse.h +++ b/compute/ncnn/src/srcn/conv_sparse.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_CONV_SPARSE_H__ #define __NNFW_SRCN_CONV_SPARSE_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" namespace nnfw diff --git a/compute/ncnn/src/conv_winograd.cc b/compute/ncnn/src/srcn/conv_winograd.cc similarity index 100% rename from compute/ncnn/src/conv_winograd.cc rename to compute/ncnn/src/srcn/conv_winograd.cc diff --git a/compute/ncnn/src/conv_winograd.h b/compute/ncnn/src/srcn/conv_winograd.h similarity index 98% rename from compute/ncnn/src/conv_winograd.h rename to compute/ncnn/src/srcn/conv_winograd.h index d478f94..76c2601 100644 --- a/compute/ncnn/src/conv_winograd.h +++ b/compute/ncnn/src/srcn/conv_winograd.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_CONV_WINOGRAD_H__ #define __NNFW_SRCN_CONV_WINOGRAD_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "winograd.h" #include "sgemm_singlethread.h" diff --git a/compute/ncnn/src/conv_winograd_batch.cc b/compute/ncnn/src/srcn/conv_winograd_batch.cc similarity index 100% rename from compute/ncnn/src/conv_winograd_batch.cc rename to compute/ncnn/src/srcn/conv_winograd_batch.cc diff --git a/compute/ncnn/src/conv_winograd_batch.h b/compute/ncnn/src/srcn/conv_winograd_batch.h similarity index 98% rename from compute/ncnn/src/conv_winograd_batch.h rename to compute/ncnn/src/srcn/conv_winograd_batch.h index 8cf4428..a022d9c 100644 --- a/compute/ncnn/src/conv_winograd_batch.h +++ b/compute/ncnn/src/srcn/conv_winograd_batch.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__ #define __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "winograd.h" #include "sgemm_singlethread.h" diff --git a/compute/ncnn/src/deconv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc similarity index 100% rename from compute/ncnn/src/deconv_sgemm_multithreads.cc rename to compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc diff --git a/compute/ncnn/src/deconv_sgemm_multithreads.h b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h similarity index 98% rename from compute/ncnn/src/deconv_sgemm_multithreads.h rename to compute/ncnn/src/srcn/deconv_sgemm_multithreads.h index 0f0e47b..762f203 100644 --- a/compute/ncnn/src/deconv_sgemm_multithreads.h +++ b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__ #define __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" namespace nnfw diff --git a/compute/ncnn/src/depthwise_conv.cc b/compute/ncnn/src/srcn/depthwise_conv.cc similarity index 99% rename from compute/ncnn/src/depthwise_conv.cc rename to compute/ncnn/src/srcn/depthwise_conv.cc index a06ee70..0e17225 100644 --- a/compute/ncnn/src/depthwise_conv.cc +++ b/compute/ncnn/src/srcn/depthwise_conv.cc @@ -23,7 +23,7 @@ #include #include "common.h" -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/direct_conv_colmajor.cc b/compute/ncnn/src/srcn/direct_conv_colmajor.cc similarity index 99% rename from compute/ncnn/src/direct_conv_colmajor.cc rename to compute/ncnn/src/srcn/direct_conv_colmajor.cc index 394ea6d..3002352 100644 --- a/compute/ncnn/src/direct_conv_colmajor.cc +++ b/compute/ncnn/src/srcn/direct_conv_colmajor.cc @@ -20,7 +20,7 @@ #include #include -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/direct_conv_colmajor.h b/compute/ncnn/src/srcn/direct_conv_colmajor.h similarity index 96% rename from compute/ncnn/src/direct_conv_colmajor.h rename to compute/ncnn/src/srcn/direct_conv_colmajor.h index e50e039..5e15192 100644 --- a/compute/ncnn/src/direct_conv_colmajor.h +++ b/compute/ncnn/src/srcn/direct_conv_colmajor.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__ #define __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/sgemm_kernel.cc b/compute/ncnn/src/srcn/sgemm_kernel.cc similarity index 100% rename from compute/ncnn/src/sgemm_kernel.cc rename to compute/ncnn/src/srcn/sgemm_kernel.cc diff --git a/compute/ncnn/src/sgemm_kernel.h b/compute/ncnn/src/srcn/sgemm_kernel.h similarity index 98% rename from compute/ncnn/src/sgemm_kernel.h rename to compute/ncnn/src/srcn/sgemm_kernel.h index 77d90b1..9e220bc 100644 --- a/compute/ncnn/src/sgemm_kernel.h +++ b/compute/ncnn/src/srcn/sgemm_kernel.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_SGEMM_KERNEL_H__ #define __NNFW_SRCN_SGEMM_KERNEL_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/sgemm_pack.cc b/compute/ncnn/src/srcn/sgemm_pack.cc similarity index 99% rename from compute/ncnn/src/sgemm_pack.cc rename to compute/ncnn/src/srcn/sgemm_pack.cc index 83eb6ca..8767f6c 100644 --- a/compute/ncnn/src/sgemm_pack.cc +++ b/compute/ncnn/src/srcn/sgemm_pack.cc @@ -17,7 +17,7 @@ #include #include -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" namespace nnfw diff --git a/compute/ncnn/src/sgemm_pack.h b/compute/ncnn/src/srcn/sgemm_pack.h similarity index 99% rename from compute/ncnn/src/sgemm_pack.h rename to compute/ncnn/src/srcn/sgemm_pack.h index 6653e73..d64843e 100644 --- a/compute/ncnn/src/sgemm_pack.h +++ b/compute/ncnn/src/srcn/sgemm_pack.h @@ -17,7 +17,7 @@ #ifndef __NNFW_SRCN_SGEMM_PACK_H__ #define __NNFW_SRCN_SGEMM_PACK_H__ -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" namespace nnfw { diff --git a/compute/ncnn/src/sgemm_singlethread.cc b/compute/ncnn/src/srcn/sgemm_singlethread.cc similarity index 100% rename from compute/ncnn/src/sgemm_singlethread.cc rename to compute/ncnn/src/srcn/sgemm_singlethread.cc diff --git a/compute/ncnn/src/sgemm_singlethread.h b/compute/ncnn/src/srcn/sgemm_singlethread.h similarity index 100% rename from compute/ncnn/src/sgemm_singlethread.h rename to compute/ncnn/src/srcn/sgemm_singlethread.h diff --git a/compute/ncnn/src/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc similarity index 99% rename from compute/ncnn/src/sgemm_test.cc rename to compute/ncnn/src/srcn/sgemm_test.cc index f06f057..1b10970 100644 --- a/compute/ncnn/src/sgemm_test.cc +++ b/compute/ncnn/src/srcn/sgemm_test.cc @@ -19,7 +19,7 @@ #include #include -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "srcn/srcn_conv.h" //#include "srcn_sgemm.h" #include "conv_sgemm_singlethread.h" @@ -587,7 +587,7 @@ static int test_sgemm(int m, int n, int k, int loops) const int mb = 180; const int nb = 1440; const int kb = 512; - + const int mr = 4; const int nr = 12; @@ -603,7 +603,7 @@ static int test_sgemm(int m, int n, int k, int loops) const int nm = (m + mb - 1) / mb; const int nn = (n + nb - 1) / nb; const int nk = (k + kb - 1) / kb; - + const int rm = m % mb; const int rn = n % nb; const int rk = k % kb; @@ -615,7 +615,7 @@ static int test_sgemm(int m, int n, int k, int loops) { A[i] = 0.001 + i * 0.000001; } - + float *B = (float *)malloc(k * n * sizeof(float)); if(!B) return 0; @@ -707,14 +707,14 @@ static int test_sgemm(int m, int n, int k, int loops) long long total_size = (long long)m *n * k * 2; printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000); - + free(A); free(B); free(C); //free(PA); //free(PB); - + } #endif diff --git a/compute/ncnn/src/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc similarity index 99% rename from compute/ncnn/src/srcn_conv.cc rename to compute/ncnn/src/srcn/srcn_conv.cc index df2c871..822336f 100644 --- a/compute/ncnn/src/srcn_conv.cc +++ b/compute/ncnn/src/srcn/srcn_conv.cc @@ -18,7 +18,7 @@ #include #endif -#include "srcn/conv_type.h" +#include "ncnn/srcn/conv_type.h" #include "common.h" #include "sgemm_singlethread.h" #include "conv_sgemm_singlethread.h" diff --git a/compute/ncnn/src/winograd.h b/compute/ncnn/src/srcn/winograd.h similarity index 100% rename from compute/ncnn/src/winograd.h rename to compute/ncnn/src/srcn/winograd.h diff --git a/runtimes/neurun/backend/srcn/kernel/TransposeConvLayer.cc b/runtimes/neurun/backend/srcn/kernel/TransposeConvLayer.cc index b1069d3..4a391cf 100644 --- a/runtimes/neurun/backend/srcn/kernel/TransposeConvLayer.cc +++ b/runtimes/neurun/backend/srcn/kernel/TransposeConvLayer.cc @@ -17,7 +17,7 @@ #include "TransposeConvLayer.h" #include "OperationUtils.h" -#include "srcn/srcn_conv.h" +#include "ncnn/srcn/srcn_conv.h" namespace neurun { -- 2.7.4