From: mipsopen-fwu <54090701+mipsopen-fwu@users.noreply.github.com> Date: Fri, 20 Sep 2019 16:52:48 +0000 (+0800) Subject: Merge pull request #15422 from mipsopen-fwu:msa-dev X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~105^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b1ea91d8bd75ec4dbecb95a019953738a2ae86e1;p=platform%2Fupstream%2Fopencv.git Merge pull request #15422 from mipsopen-fwu:msa-dev * Added MSA implementations for mips platforms. Intrinsics for MSA and build scripts for MIPS platforms are added. Signed-off-by: Fei Wu * Removed some unused code in mips.toolchain.cmake. Signed-off-by: Fei Wu * Added comments for mips toolchain configuration and disabled compiling warnings for libpng. Signed-off-by: Fei Wu * Fixed the build error of unsupported opcode 'pause' when mips isa_rev is less than 2. Signed-off-by: Fei Wu * 1. Removed FP16 related item in MSA option defines in OpenCVCompilerOptimizations.cmake. 2. Use CV_CPU_COMPILE_MSA instead of __mips_msa for MSA feature check in cv_cpu_dispatch.h. 3. Removed hasSIMD128() in intrin_msa.hpp. 4. Define CPU_MSA as 150. Signed-off-by: Fei Wu * 1. Removed unnecessary CV_SIMD128_64F guarding in intrin_msa.hpp. 2. Removed unnecessary CV_MSA related code block in dotProd_8u(). Signed-off-by: Fei Wu * 1. Defined CPU_MSA_FLAGS_ON as "-mmsa". 2. Removed CV_SIMD128_64F guardings in intrin_msa.hpp. Signed-off-by: Fei Wu * Removed unused msa_mlal_u16() and msa_mlal_s16 from msa_macros.h. Signed-off-by: Fei Wu --- diff --git a/3rdparty/libpng/CMakeLists.txt b/3rdparty/libpng/CMakeLists.txt index 69bc754..31e7767 100644 --- a/3rdparty/libpng/CMakeLists.txt +++ b/3rdparty/libpng/CMakeLists.txt @@ -46,6 +46,15 @@ if(";${CPU_BASELINE_FINAL};" MATCHES "SSE2" add_definitions(-DPNG_INTEL_SSE) endif() +# set definitions and sources for MIPS +if(";${CPU_BASELINE_FINAL};" MATCHES "MSA") + list(APPEND lib_srcs mips/mips_init.c mips/filter_msa_intrinsics.c) + add_definitions(-DPNG_MIPS_MSA_OPT=2) + ocv_warnings_disable(CMAKE_C_FLAGS -Wshadow) +else() + add_definitions(-DPNG_MIPS_MSA_OPT=0) +endif() + if(PPC64LE OR PPC64) # VSX3 features are backwards compatible if(";${CPU_BASELINE_FINAL};" MATCHES "VSX.*" diff --git a/3rdparty/libpng/mips/filter_msa_intrinsics.c b/3rdparty/libpng/mips/filter_msa_intrinsics.c new file mode 100755 index 0000000..a579179 --- /dev/null +++ b/3rdparty/libpng/mips/filter_msa_intrinsics.c @@ -0,0 +1,808 @@ + +/* filter_msa_intrinsics.c - MSA optimised filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Mandar Sahastrabuddhe, August 2016. + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -mfpu=msa on the command line: */ +#if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +#include + +/* libpng row pointers are not necessarily aligned to any particular boundary, + * however this code will only work with appropriate alignment. mips/mips_init.c + * checks for this (and will not compile unless it is done). This code uses + * variants of png_aligncast to avoid compiler warnings. + */ +#define png_ptr(type,pointer) png_aligncast(type *,pointer) +#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) + +/* The following relies on a variable 'temp_pointer' being declared with type + * 'type'. 
This is written this way just to hide the GCC strict aliasing + * warning; note that the code is safe because there never is an alias between + * the input and output pointers. + */ +#define png_ldr(type,pointer)\ + (temp_pointer = png_ptr(type,pointer), *temp_pointer) + +#if PNG_MIPS_MSA_OPT > 0 + +#ifdef CLANG_BUILD + #define MSA_SRLI_B(a, b) __msa_srli_b((v16i8) a, b) + + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "lw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "sh %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" (*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "sw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #if (__mips == 64) + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint64_t val_m = (val); \ + \ + asm volatile ( \ + "sd %[val_m], %[pdst_sd_m] \n\t" \ + \ + : [pdst_sd_m] "=m" (*pdst_sd_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + #else + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + #endif +#else + #define MSA_SRLI_B(a, b) (a >> b) + +#if (__mips_isa_rev >= 6) + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "lw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "sh %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" (*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "sw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #if (__mips == 64) + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint64_t val_m = (val); \ + \ + asm volatile ( \ + "sd %[val_m], %[pdst_sd_m] \n\t" \ + \ + : [pdst_sd_m] "=m" (*pdst_sd_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + #else + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + #endif +#else // !(__mips_isa_rev >= 6) + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "ulw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "ush %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" 
(*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "usw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + + #define SW_ZERO(pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + \ + asm volatile ( \ + "usw $0, %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : \ + ); \ + } +#endif // (__mips_isa_rev >= 6) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE *) (psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +{ \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +{ \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ADD2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + out2 = in4 + in5; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \ +} +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \ + out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \ +} +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ +{ \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \ + out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \ +} +#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) + +#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \ +{ \ + v16i8 zero_m = { 0 }; \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \ +} +#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__) + +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \ + out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \ +} +#define ILVEV_W2_UB(...) 
ILVEV_W2(v16u8, __VA_ARGS__) + +#define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2) \ +{ \ + RTYPE zero = {0}; \ + \ + out0 = __msa_add_a_h((v8i16) zero, in0); \ + out1 = __msa_add_a_h((v8i16) zero, in1); \ + out2 = __msa_add_a_h((v8i16) zero, in2); \ +} +#define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__) + +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ + out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ +} +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +#define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0) \ +{ \ + v8i16 _sel_h0, _sel_h1; \ + v16u8 _sel_b0, _sel_b1; \ + _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0); \ + _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0); \ + inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0); \ + inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0); \ + _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0); \ + _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1); \ + inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1); \ + out0 += inp4; \ +} + +void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + size_t i, cnt, cnt16, cnt32; + size_t istop = row_info->rowbytes; + png_bytep rp = row; + png_const_bytep pp = prev_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (i = 0; i < (istop >> 6); i++) + { + LD_UB4(rp, 16, src0, src1, src2, src3); + LD_UB4(pp, 16, src4, src5, src6, src7); + pp += 64; + + ADD4(src0, src4, src1, src5, src2, src6, src3, src7, + src0, src1, src2, src3); + + ST_UB4(src0, src1, src2, src3, rp, 16); + rp += 64; + } + + if (istop & 0x3F) + { + cnt32 = istop & 0x20; + cnt16 = istop & 0x10; + cnt = istop & 0xF; + + if(cnt32) + { + if (cnt16 && cnt) + { + LD_UB4(rp, 16, src0, src1, src2, src3); + LD_UB4(pp, 16, src4, src5, src6, src7); + + ADD4(src0, src4, src1, src5, src2, src6, src3, src7, + src0, src1, src2, src3); + + ST_UB4(src0, src1, src2, src3, rp, 16); + rp += 64; + } + else if (cnt16 || cnt) + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + pp += 32; + src2 = LD_UB(rp + 32); + src6 = LD_UB(pp); + + ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + ST_UB(src2, rp); + rp += 16; + } + else + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + + ADD2(src0, src4, src1, src5, src0, src1); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + } + } + else if (cnt16 && cnt) + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + + ADD2(src0, src4, src1, src5, src0, src1); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + } + else if (cnt16 || cnt) + { + src0 = LD_UB(rp); + src4 = LD_UB(pp); + pp += 16; + + src0 += src4; + + ST_UB(src0, rp); + rp += 16; + } + } +} + +void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + size_t count; + size_t istop = row_info->rowbytes; + png_bytep src = row; + png_bytep nxt = row + 4; + int32_t inp0; + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1; + v16u8 zero = { 0 }; + + istop -= 4; + + inp0 = LW(src); + src += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + + for (count = 0; count < istop; count += 16) + { + src1 = LD_UB(src); + src += 16; + + src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4); + src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8); + 
src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12); + src1 += src0; + src2 += src1; + src3 += src2; + src4 += src3; + src0 = src4; + ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + size_t count; + size_t istop = row_info->rowbytes; + png_bytep src = row; + png_bytep nxt = row + 3; + int64_t out0; + int32_t inp0, out1; + v16u8 src0, src1, src2, src3, src4, dst0, dst1; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + istop -= 3; + + inp0 = LW(src); + src += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + + for (count = 0; count < istop; count += 12) + { + src1 = LD_UB(src); + src += 12; + + src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3); + src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6); + src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9); + src1 += src0; + src2 += src1; + src3 += src2; + src4 += src3; + src0 = src4; + VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + size_t i; + png_bytep src = row; + png_bytep nxt = row; + png_const_bytep pp = prev_row; + size_t istop = row_info->rowbytes - 4; + int32_t inp0, inp1, out0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 zero = { 0 }; + + inp0 = LW(pp); + pp += 4; + inp1 = LW(src); + src += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + src0 = (v16u8) MSA_SRLI_B(src0, 1); + src1 += src0; + out0 = __msa_copy_s_w((v4i32) src1, 0); + SW(out0, nxt); + nxt += 4; + + for (i = 0; i < istop; i += 16) + { + src2 = LD_UB(pp); + pp += 16; + src6 = LD_UB(src); + src += 16; + + SLDI_B2_0_UB(src2, src6, src3, src7, 4); + SLDI_B2_0_UB(src2, src6, src4, src8, 8); + SLDI_B2_0_UB(src2, src6, src5, src9, 12); + src2 = __msa_ave_u_b(src2, src1); + src6 += src2; + src3 = __msa_ave_u_b(src3, src6); + src7 += src3; + src4 = __msa_ave_u_b(src4, src7); + src8 += src4; + src5 = __msa_ave_u_b(src5, src8); + src9 += src5; + src1 = src9; + ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + size_t i; + png_bytep src = row; + png_bytep nxt = row; + png_const_bytep pp = prev_row; + size_t istop = row_info->rowbytes - 3; + int64_t out0; + int32_t inp0, inp1, out1; + int16_t out2; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + inp0 = LW(pp); + pp += 3; + inp1 = LW(src); + src += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + src0 = (v16u8) MSA_SRLI_B(src0, 1); + src1 += src0; + out2 
= __msa_copy_s_h((v8i16) src1, 0); + SH(out2, nxt); + nxt += 2; + nxt[0] = src1[2]; + nxt++; + + for (i = 0; i < istop; i += 12) + { + src2 = LD_UB(pp); + pp += 12; + src6 = LD_UB(src); + src += 12; + + SLDI_B2_0_UB(src2, src6, src3, src7, 3); + SLDI_B2_0_UB(src2, src6, src4, src8, 6); + SLDI_B2_0_UB(src2, src6, src5, src9, 9); + src2 = __msa_ave_u_b(src2, src1); + src6 += src2; + src3 = __msa_ave_u_b(src3, src6); + src7 += src3; + src4 = __msa_ave_u_b(src4, src7); + src8 += src4; + src5 = __msa_ave_u_b(src5, src8); + src9 += src5; + src1 = src9; + VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +void png_read_filter_row_paeth4_msa(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + int32_t count, rp_end; + png_bytep nxt; + png_const_bytep prev_nxt; + int32_t inp0, inp1, res0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, dst0, dst1; + v8i16 vec0, vec1, vec2; + v16u8 zero = { 0 }; + + nxt = row; + prev_nxt = prev_row; + + inp0 = LW(nxt); + inp1 = LW(prev_nxt); + prev_nxt += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + + src1 += src0; + res0 = __msa_copy_s_w((v4i32) src1, 0); + + SW(res0, nxt); + nxt += 4; + + /* Remainder */ + rp_end = row_info->rowbytes - 4; + + for (count = 0; count < rp_end; count += 16) + { + src2 = LD_UB(prev_nxt); + prev_nxt += 16; + src6 = LD_UB(prev_row); + prev_row += 16; + src10 = LD_UB(nxt); + + SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4); + SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8); + SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12); + ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); + ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); + ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); + ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); + src1 = src13; + ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_paeth3_msa(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + int32_t count, rp_end; + png_bytep nxt; + png_const_bytep prev_nxt; + int64_t out0; + int32_t inp0, inp1, out1; + int16_t out2; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 src10, src11, src12, src13; + v8i16 vec0, vec1, vec2; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + nxt = row; + prev_nxt 
= prev_row; + + inp0 = LW(nxt); + inp1 = LW(prev_nxt); + prev_nxt += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + + src1 += src0; + out2 = __msa_copy_s_h((v8i16) src1, 0); + + SH(out2, nxt); + nxt += 2; + nxt[0] = src1[2]; + nxt++; + + /* Remainder */ + rp_end = row_info->rowbytes - 3; + + for (count = 0; count < rp_end; count += 12) + { + src2 = LD_UB(prev_nxt); + prev_nxt += 12; + src6 = LD_UB(prev_row); + prev_row += 12; + src10 = LD_UB(nxt); + + SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3); + SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6); + SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9); + ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); + ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); + ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); + ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); + src1 = src13; + VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +#endif /* PNG_MIPS_MSA_OPT > 0 */ +#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/3rdparty/libpng/mips/mips_init.c b/3rdparty/libpng/mips/mips_init.c new file mode 100644 index 0000000..6a061cc --- /dev/null +++ b/3rdparty/libpng/mips/mips_init.c @@ -0,0 +1,127 @@ + +/* mips_init.c - MSA optimised filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Mandar Sahastrabuddhe, 2016. + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_MIPS_MSA_OPT > 0 +#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the MIPS + * MSA instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_MIPS_MSA_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_msa function. There + * are a number of implementations in contrib/mips-msa, but the only one that + * has partial support is contrib/mips-msa/linux.c - a generic Linux + * implementation which reads /proc/cpufino. 
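+ *
+ * As a rough illustration only (not the actual contrib/mips-msa/linux.c,
+ * whose details may differ), such a fragment could detect MSA by scanning
+ * the "ASEs implemented" line of /proc/cpuinfo, e.g.:
+ *
+ *    static int png_have_msa(png_structp png_ptr)
+ *    {
+ *       FILE *f = fopen("/proc/cpuinfo", "rb");
+ *       char line[256];
+ *       int found = 0;
+ *
+ *       if (f == NULL)
+ *          return 0;
+ *
+ *       while (fgets(line, sizeof line, f) != NULL)
+ *       {
+ *          if (strstr(line, "ASEs implemented") != NULL &&
+ *              strstr(line, "msa") != NULL)
+ *          {
+ *             found = 1;
+ *             break;
+ *          }
+ *       }
+ *
+ *       fclose(f);
+ *       return found;
+ *    }
+ *
+ * (The sketch assumes <stdio.h> and <string.h>; png_ptr is unused here.)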
+ */ +#ifndef PNG_MIPS_MSA_FILE +# ifdef __linux__ +# define PNG_MIPS_MSA_FILE "contrib/mips-msa/linux.c" +# endif +#endif + +#ifdef PNG_MIPS_MSA_FILE + +#include /* for sig_atomic_t */ +static int png_have_msa(png_structp png_ptr); +#include PNG_MIPS_MSA_FILE + +#else /* PNG_MIPS_MSA_FILE */ +# error "PNG_MIPS_MSA_FILE undefined: no support for run-time MIPS MSA checks" +#endif /* PNG_MIPS_MSA_FILE */ +#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */ + +#ifndef PNG_ALIGNED_MEMORY_SUPPORTED +# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED" +#endif + +void +png_init_filter_functions_msa(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for MIPS_MSA_API, the call to + * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined + * the check is only performed if the API has not set the MSA option on + * or off explicitly. In this case the check controls what happens. + */ + +#ifdef PNG_MIPS_MSA_API_SUPPORTED + switch ((pp->options >> PNG_MIPS_MSA) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. + */ +#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_msa = -1; /* not checked */ + + if (no_msa < 0) + no_msa = !png_have_msa(pp); + + if (no_msa) + return; + } +#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */ + break; + + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } + /* IMPORTANT: any new external functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) 
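+    *
+    * For instance, the declaration for the 'up' filter added by this patch
+    * takes roughly this form in ../pngpriv.h (shown here for illustration
+    * only; see pngpriv.h for the authoritative prototypes):
+    *
+    *    PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_msa,
+    *       (png_row_infop row_info, png_bytep row, png_const_bytep prev_row),
+    *       PNG_EMPTY);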
+ */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_msa; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_msa; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa; + } + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa; + } +#else + (void)pp; + (void)bpp; +#endif /* PNG_MIPS_MSA_API_SUPPORTED */ +} +#endif /* PNG_MIPS_MSA_OPT > 0 */ +#endif /* READ */ diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h index de026a1..a16c0bb 100644 --- a/3rdparty/libwebp/src/dsp/msa_macro.h +++ b/3rdparty/libwebp/src/dsp/msa_macro.h @@ -73,7 +73,7 @@ static inline TYPE FUNC_NAME(const void* const psrc) { \ const uint8_t* const psrc_m = (const uint8_t*)psrc; \ TYPE val_m; \ - asm volatile ( \ + __asm__ volatile ( \ "" #INSTR " %[val_m], %[psrc_m] \n\t" \ : [val_m] "=r" (val_m) \ : [psrc_m] "m" (*psrc_m)); \ @@ -86,7 +86,7 @@ static inline void FUNC_NAME(TYPE val, void* const pdst) { \ uint8_t* const pdst_m = (uint8_t*)pdst; \ TYPE val_m = val; \ - asm volatile ( \ + __asm__ volatile ( \ " " #INSTR " %[val_m], %[pdst_m] \n\t" \ : [pdst_m] "=m" (*pdst_m) \ : [val_m] "r" (val_m)); \ diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 92cce39..28867f8 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -45,6 +45,7 @@ set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F") list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CEL;AVX512_ICL") list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) +list(APPEND CPU_ALL_OPTIMIZATIONS MSA) list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -339,6 +340,11 @@ elseif(ARM OR AARCH64) ocv_update(CPU_FP16_IMPLIES "NEON") set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") endif() +elseif(MIPS) + ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA") + ocv_update(CPU_MSA_FLAGS_ON "-mmsa") + set(CPU_BASELINE "MSA" CACHE STRING "${HELP_CPU_BASELINE}") elseif(PPC64LE) ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3") ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp") diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index be64e55..a70f6af 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le") set(PPC64LE 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") set(PPC64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)") + set(MIPS 1) endif() # Workaround for 32-bit operating systems on x86_64/aarch64 processor diff --git a/cmake/checks/cpu_msa.cpp b/cmake/checks/cpu_msa.cpp new file mode 100755 index 0000000..a97e387 --- /dev/null +++ b/cmake/checks/cpu_msa.cpp @@ -0,0 +1,23 @@ +#include + +#if defined(__mips_msa) +# include +# define CV_MSA 1 +#endif + +#if defined CV_MSA +int test() +{ + const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f }; + v4f32 val = (v4f32)__msa_ld_w((const float*)(src), 0); + 
return __msa_copy_s_w(__builtin_msa_ftint_s_w (val), 0); +} +#else +#error "MSA is not supported" +#endif + +int main() +{ + printf("%d\n", test()); + return 0; +} diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 483cc8f..818087f 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -152,6 +152,11 @@ # define CV_VSX3 1 #endif +#ifdef CV_CPU_COMPILE_MSA +# include "hal/msa_macros.h" +# define CV_MSA 1 +#endif + #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX @@ -319,3 +324,7 @@ struct VZeroUpperGuard { #ifndef CV_VSX3 # define CV_VSX3 0 #endif + +#ifndef CV_MSA +# define CV_MSA 0 +#endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 90e0e9b..2c82282 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -420,6 +420,27 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA +# define CV_TRY_MSA 1 +# define CV_CPU_FORCE_MSA 1 +# define CV_CPU_HAS_SUPPORT_MSA 1 +# define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA +# define CV_TRY_MSA 1 +# define CV_CPU_FORCE_MSA 0 +# define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA)) +# define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args) +# define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args) +#else +# define CV_TRY_MSA 0 +# define CV_CPU_FORCE_MSA 0 +# define CV_CPU_HAS_SUPPORT_MSA 0 +# define CV_CPU_CALL_MSA(fn, args) +# define CV_CPU_CALL_MSA_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...) 
CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX # define CV_TRY_VSX 1 # define CV_CPU_FORCE_VSX 1 diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 7b81a17..d534a7f 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -258,6 +258,8 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard #define CV_CPU_NEON 100 +#define CV_CPU_MSA 150 + #define CV_CPU_VSX 200 #define CV_CPU_VSX3 201 @@ -308,6 +310,8 @@ enum CpuFeatures { CPU_NEON = 100, + CPU_MSA = 150, + CPU_VSX = 200, CPU_VSX3 = 201, diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index a96cfbd..615887c 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -165,9 +165,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_NEON # undef CV_VSX # undef CV_FP16 +# undef CV_MSA #endif -#if CV_SSE2 || CV_NEON || CV_VSX +#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA #define CV__SIMD_FORWARD 128 #include "opencv2/core/hal/intrin_forward.hpp" #endif @@ -185,6 +186,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #include "opencv2/core/hal/intrin_vsx.hpp" +#elif CV_MSA + +#include "opencv2/core/hal/intrin_msa.hpp" + #else #define CV_SIMD128_CPP 1 diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp new file mode 100755 index 0000000..6a96484 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -0,0 +1,1669 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_MSA_HPP +#define OPENCV_HAL_INTRIN_MSA_HPP + +#include +#include "opencv2/core/utility.hpp" + +namespace cv +{ + +//! @cond IGNORED +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define CV_SIMD128 1 + +//MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers. +//MSA and FPU can not be both present, unless the FPU has 64-bit floating-point registers. 
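+//Because MSA presence implies 64-bit FPU registers, double-precision lanes are
+//available and CV_SIMD128_64F is enabled below. As a minimal usage sketch
+//(illustrative only, not part of this header), generic code written against
+//the universal intrinsics, e.g.
+//
+//    cv::v_float64x2 a = cv::v_setall_f64(1.5), b = cv::v_setall_f64(2.0);
+//    cv::v_float64x2 c = a * b;   // lowered to msa_mulq_f64 by this header
+//    double buf[2];
+//    cv::v_store(buf, c);
+//
+//compiles down to the MSA intrinsics defined in this file.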
+#define CV_SIMD128_64F 1 + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + + v_uint8x16() : val(msa_dupq_n_u8(0)) {} + explicit v_uint8x16(v16u8 v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_u8(v); + } + uchar get0() const + { + return msa_getq_lane_u8(val, 0); + } + + v16u8 val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() : val(msa_dupq_n_s8(0)) {} + explicit v_int8x16(v16i8 v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_s8(v); + } + schar get0() const + { + return msa_getq_lane_s8(val, 0); + } + + v16i8 val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() : val(msa_dupq_n_u16(0)) {} + explicit v_uint16x8(v8u16 v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_u16(v); + } + ushort get0() const + { + return msa_getq_lane_u16(val, 0); + } + + v8u16 val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() : val(msa_dupq_n_s16(0)) {} + explicit v_int16x8(v8i16 v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_s16(v); + } + short get0() const + { + return msa_getq_lane_s16(val, 0); + } + + v8i16 val; +}; + +struct v_uint32x4 +{ + typedef unsigned int lane_type; + enum { nlanes = 4 }; + + v_uint32x4() : val(msa_dupq_n_u32(0)) {} + explicit v_uint32x4(v4u32 v) : val(v) {} + v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3) + { + unsigned int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_u32(v); + } + unsigned int get0() const + { + return msa_getq_lane_u32(val, 0); + } + + v4u32 val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() : val(msa_dupq_n_s32(0)) {} + explicit v_int32x4(v4i32 v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_s32(v); + } + int get0() const + { + return msa_getq_lane_s32(val, 0); + } + v4i32 val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() : val(msa_dupq_n_f32(0.0f)) {} + explicit v_float32x4(v4f32 v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = msa_ld1q_f32(v); + } + float get0() const + { + return msa_getq_lane_f32(val, 0); + } + v4f32 val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() : val(msa_dupq_n_u64(0)) {} + explicit v_uint64x2(v2u64 v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = msa_ld1q_u64(v); + } + uint64 get0() const + { + return msa_getq_lane_u64(val, 0); + } + v2u64 val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() : val(msa_dupq_n_s64(0)) {} + explicit 
v_int64x2(v2i64 v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = msa_ld1q_s64(v); + } + int64 get0() const + { + return msa_getq_lane_s64(val, 0); + } + v2i64 val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() : val(msa_dupq_n_f64(0.0f)) {} + explicit v_float64x2(v2f64 v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = msa_ld1q_f64(v); + } + double get0() const + { + return msa_getq_lane_f64(val, 0); + } + v2f64 val; +}; + +#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \ +inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \ +inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \ +inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); } + +OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32) +OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64) + +#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \ +inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(mov(a.val, b.val)); \ +} \ +template inline \ +_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(rshr(a.val, b.val, n)); \ +} + +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16) +OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32) +OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32) +OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64) +OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64) +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32) + +#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, 
rshr) \ +inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = mov(a.val); \ + msa_st1_##suffix(ptr, a1); \ +} \ +template inline \ +void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = rshr(a.val, n); \ + msa_st1_##suffix(ptr, a1); \ +} + +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32) + +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint8x16(msa_pack_u16(a.val, b.val)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val)); + v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val)); + return v_uint8x16(msa_pack_u16(abcd, efgh)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_mlaq_lane_f32(res, m3.val, v0, 3); + return v_float32x4(res); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_addq_f32(res, a.val); + return v_float32x4(res); +} + +#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, 
msa_addq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64) + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{a = a * b; return a; } + +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + v16i8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi); + ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi); + c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo)); + d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi)); +} + +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + v16u8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi); + ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi); + c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo)); + d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi)); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)); + d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)); + d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + v4u32 a_lo, a_hi, b_lo, b_hi; + + ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi); + ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi); + c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo)); + d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi)); +} + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + 
ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + + return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)), + msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16)); +} + +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + + return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)), + msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16)); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + return v_int32x4(msa_dotp_s_w(a.val, b.val)); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); +} + +#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64) + +#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +/* v_abs */ +#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \ +inline _Tpuvec v_abs(const _Tpsvec& a) \ +{ \ + return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \ +} + +OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8) +OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16) +OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32) + +/* v_abs(float), v_sqrt, v_invsqrt */ +#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a) \ +{ \ + return _Tpvec(intrin(a.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64) + +#define 
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64) + +#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64) 
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64) + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); } +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); } + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16) + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64) + +/** Saturating absolute difference **/ +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16) + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \ +inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32) + +/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */ +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val)); +} + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(msa_mlaq_f64(c.val, 
a.val, b.val));
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_fma(a, b, c);
+}
+
+// trade efficiency for convenience
+#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
+template <int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
+template <int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
+template <int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }
+
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+/* v_rotate_right, v_rotate_left */
+#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
+template <int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
+} \
+template <int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ \
+    return a; \
+} \
+template <int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
+} \
+template <int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    CV_UNUSED(b); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
+
+#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ return
_Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i]; \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i+(n/2)]; \ +} + +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \ +inline unsigned short v_reduce_##func(const v_uint16x8& a) \ +{ \ + v8u16 a_lo, a_hi; \ + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \ + v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \ + v4u32 b_lo, b_hi; \ + ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \ + v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \ + return (unsigned short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \ +inline short v_reduce_##func(const v_int16x8& a) \ +{ \ + v8i16 a_lo, a_hi; \ + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \ + v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \ + v4i32 b_lo, b_hi; \ + ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \ + v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \ + return (short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + return (scalartype)msa_sum_##suffix(a.val); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, 
s16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32) + +inline uint64 v_reduce_sum(const v_uint64x2& a) +{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); } +inline int64 v_reduce_sum(const v_int64x2& a) +{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); } +inline double v_reduce_sum(const v_float64x2& a) +{ + return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1); +} + +/* v_reduce_sum4, v_reduce_sad */ +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3 + v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3 + + return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))))); +} + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + v16u8 t0 = msa_abdq_u8(a.val, b.val); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val)); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 t0 = msa_abdq_u16(a.val, b.val); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val)); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + v4u32 t0 = msa_abdq_u32(a.val, b.val); + return msa_sum_u32(t0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val)); + return msa_sum_u32(t0); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + v4f32 t0 = msa_abdq_f32(a.val, b.val); + return msa_sum_f32(t0); +} + +/* v_popcount */ +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \ +inline v_uint8x16 v_popcount(const _Tpvec& a) \ +{ \ + v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \ + return v_uint8x16(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \ +inline v_uint16x8 v_popcount(const _Tpvec& a) \ +{ \ + v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \ + return v_uint16x8(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8) + +#define 
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \ + return v_uint32x4(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \ +inline v_uint64x2 v_popcount(const _Tpvec& a) \ +{ \ + v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \ + return v_uint64x2(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2) + +inline int v_signmask(const v_uint8x16& a) +{ + v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100)); + v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0)); + v8u16 v1 = msa_paddlq_u8(v0); + v4u32 v2 = msa_paddlq_u16(v1); + v2u64 v3 = msa_paddlq_u32(v2); + return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000)); + v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0)); + v4u32 v1 = msa_paddlq_u16(v0); + v2u64 v2 = msa_paddlq_u32(v1); + return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000)); + v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0)); + v2u64 v1 = msa_paddlq_u32(v0); + return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +inline int v_signmask(const v_uint64x2& a) +{ + v2u64 v0 = msa_shrq_n_u64(a.val, 63); + return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1); +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); } + +#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return 
(msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \ +} \ +inline bool v_check_any(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \ +} + +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63) + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } + +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +/* v_select */ +#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \ + MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8) + +#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + b0.val = msa_paddlq_##suffix(a_lo); \ + b1.val = msa_paddlq_##suffix(a_hi); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_lo)); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_hi)); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \ +} + 
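+// A scalar model of the widening above (a sketch only, not part of the HAL):
+// interleaving the source with zero (msa_ilvrq/msa_ilvlq) and then doing a
+// pairwise widening add (msa_paddlq_*) amounts to a plain zero/sign extension
+// of every lane, e.g. for v_expand on v_uint8x16:
+//
+//     uchar src[16];            // input lanes
+//     ushort lo[8], hi[8];      // v_expand outputs
+//     for (int i = 0; i < 8; i++) {
+//         lo[i] = (ushort)src[i];
+//         hi[i] = (ushort)src[i + 8];
+//     }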
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+/* v_zip, v_combine_low, v_combine_high, v_recombine */
+#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
+
+/* v_extract */
+#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
+
+/* v_round, v_floor, v_ceil, v_trunc */
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    return v_int32x4(msa_cvttintq_s32_f32(a.val));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
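+    // Floor is synthesized from the round-to-nearest conversion plus a
+    // correction: lanes where the rounded value, converted back to float,
+    // exceeds the input get an all-ones compare mask (-1), and adding that
+    // mask decrements them. Worked example (a sketch of the lane arithmetic):
+    //   a =  1.7: round ->  2,  2 >  1.7, so  2 + (-1) =  1 = floor(1.7)
+    //   a = -1.3: round -> -1, -1 > -1.3, so -1 + (-1) = -2 = floor(-1.3)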
v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val)))); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1))))); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ + return v_int32x4(msa_cvttruncq_s32_f32(a.val)); +} + +inline v_int32x4 v_round(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val))); +} + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ + b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ +} + +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32) + +#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + msa_ld2q_##suffix(ptr, &a.val, &b.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + 
msa_st2q_##suffix(ptr, a.val, b.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + msa_st3q_##suffix(ptr, a.val, b.val, c.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \ +} + +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64) + +/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(msa_cvtfintq_f32_s32(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f))); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(a.val)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(a.val)); +} + +////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } 
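+// MSA has no gather instruction, so the v_lut family above assembles its
+// result with scalar table loads into an aligned stack buffer (or with
+// msa_combine_* where two contiguous halves suffice) and reloads it as one
+// vector. A scalar model of v_lut_pairs for 8-bit lanes (sketch only):
+//
+//     for (int i = 0; i < 8; i++) {
+//         dst[2*i]     = tab[idx[i]];
+//         dst[2*i + 1] = tab[idx[i] + 1];
+//     }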
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1]))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_int32x4(msa_ld1q_s32(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(msa_ld1q_s32(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]]))); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_ld1q_s64(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_float32x4(msa_ld1q_f32(elems)); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + uint64 CV_DECL_ALIGNED(32) elems[2] = + { + *(uint64*)(tab + idx[0]), + *(uint64*)(tab + idx[1]) + }; + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4(msa_ld1q_f32(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned 
CV_DECL_ALIGNED(32) elems[4] = + { + tab[msa_getq_lane_s32(idxvec.val, 0)], + tab[msa_getq_lane_s32(idxvec.val, 1)], + tab[msa_getq_lane_s32(idxvec.val, 2)], + tab[msa_getq_lane_s32(idxvec.val, 3)] + }; + return v_uint32x4(msa_ld1q_u32(elems)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2])); + v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3])); + x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); + y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); +} + +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } + +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + v_int32x4 c; + c.val[0] = vec.val[0]; + c.val[1] = vec.val[2]; + c.val[2] = vec.val[1]; + c.val[3] = vec.val[3]; + return c; +} + +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val)); + return c; +} + +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline 
v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    double CV_DECL_ALIGNED(32) elems[2] =
+    {
+        tab[idx[0]],
+        tab[idx[1]]
+    };
+    return v_float64x2(msa_ld1q_f64(elems));
+}
+
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    return v_float64x2(msa_ld1q_f64(tab + idx[0]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+
+    return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+
+    v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
+    v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
+    x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
+    y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
+}
+
+////// FP16 support ///////
+#if CV_FP16
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+#ifndef msa_ld1_f16
+    v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
+#else
+    v4f16 v = msa_ld1_f16((const __fp16*)ptr);
+#endif
+    return v_float32x4(msa_cvt_f32_f16(v));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    v4f16 hv = msa_cvt_f16_f32(v.val);
+
+#ifndef msa_st1_f16
+    msa_st1_s16((short*)ptr, (int16x4_t)hv);
+#else
+    msa_st1_f16((__fp16*)ptr, hv);
+#endif
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    float buf[4];
+    for( int i = 0; i < 4; i++ )
+        buf[i] = (float)ptr[i];
+    return v_load(buf);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    float buf[4];
+    v_store(buf, v);
+    for( int i = 0; i < 4; i++ )
+        ptr[i] = (float16_t)buf[i];
+}
+#endif
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/modules/core/include/opencv2/core/hal/msa_macros.h b/modules/core/include/opencv2/core/hal/msa_macros.h
new file mode 100755
index 0000000..97e4f4b
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/msa_macros.h
@@ -0,0 +1,1558 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
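+// The msa_* macros below present a NEON-style intrinsic surface on top of the
+// raw __builtin_msa_* GCC builtins, so that intrin_msa.hpp can stay close to
+// its NEON counterpart. A minimal usage sketch (illustrative only; `buf` is a
+// hypothetical 16-byte buffer):
+//
+//     int8_t buf[16] = {0};
+//     v16i8 v = msa_ld1q_s8(buf);   // 128-bit load via __builtin_msa_ld_b
+//     v = msa_dupq_n_s8(7);         // broadcast an immediate to all lanes
+//     msa_st1q_s8(buf, v);          // 128-bit store via __builtin_msa_st_b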
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define 64-bit vector types */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load values from the given memory address to a 64-bit vector. */
+#define msa_ld1_s8(__a) (*((v8i8*)(__a)))
+#define msa_ld1_s16(__a) (*((v4i16*)(__a)))
+#define msa_ld1_s32(__a) (*((v2i32*)(__a)))
+#define msa_ld1_s64(__a) (*((v1i64*)(__a)))
+#define msa_ld1_u8(__a) (*((v8u8*)(__a)))
+#define msa_ld1_u16(__a) (*((v4u16*)(__a)))
+#define msa_ld1_u32(__a) (*((v2u32*)(__a)))
+#define msa_ld1_u64(__a) (*((v1u64*)(__a)))
+#define msa_ld1_f32(__a) (*((v2f32*)(__a)))
+#define msa_ld1_f64(__a) (*((v1f64*)(__a)))
+
+/* Load values from the given memory address to a 128-bit vector */
+#define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
+
+/* Store 64-bit vector elements to the given memory address. */
+#define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b)
+#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
+#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
+#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
+#define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b)
+#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
+#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
+#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
+#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
+#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)
+
+/* Store the elements of the 128-bit vector __b to the given memory address __a.
*/
+#define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+
+/* Store the value of the element with the index __c in vector __b to the given memory address __a. */
+#define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
+#define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
+#define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
+#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
+#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
+#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
+#define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
+#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
+#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
+#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
+#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
+#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
+
+/* Duplicate elements for 64-bit doubleword vectors */
+#define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_s32(__a) ((v2i32){__a, __a})
+#define msa_dup_n_s64(__a) ((v1i64){__a})
+#define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_u32(__a) ((v2u32){__a, __a})
+#define msa_dup_n_u64(__a) ((v1u64){__a})
+#define msa_dup_n_f32(__a) ((v2f32){__a, __a})
+#define msa_dup_n_f64(__a) ((v1f64){__a})
+
+/* Duplicate elements for 128-bit quadword vectors */
+#define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
+#define
msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
+#define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
+#define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b))
+#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
+#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
+#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
+#define msa_dupq_lane_u8(__a, __b) ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64-bit vector */
+#define msa_create_s8(__a) ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a) ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign extends or zero extends each element in a 64-bit vector to twice its original length, and places the results in a 128-bit vector. */
+/* Transform v8i8 to v8i16 */
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+ (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v8u8 to v8u16 */
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+ (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v4i16 to v4i32 */
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2i32 to v2i64 */
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/* Transform v4u16 to v4u32 */
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2u32 to v2u64 */
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copies the least significant half of each element of a 128-bit vector into the corresponding elements of a 64-bit vector.
*/ +#define msa_movn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovn */ +#define msa_qmovn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovun */ +#define msa_qmovun_s16(__a) \ +({ \ + v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s32(__a) \ +({ \ + v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s64(__a) \ +({ \ + v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector. 
*/
+#define msa_shrn_n_s16(__a, __b) \
+({ \
+    v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
+    (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s32(__a, __b) \
+({ \
+    v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
+    (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s64(__a, __b) \
+({ \
+    v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
+    (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u16(__a, __b) \
+({ \
+    v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
+    (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u32(__a, __b) \
+({ \
+    v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
+    (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u64(__a, __b) \
+({ \
+    v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
+    (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift elements in a 128-bit vector by an immediate value, and place the results in a 64-bit vector. */
+#define msa_rshrn_n_s16(__a, __b) \
+({ \
+    v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)__b)); \
+    (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s32(__a, __b) \
+({ \
+    v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)__b)); \
+    (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s64(__a, __b) \
+({ \
+    v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)__b)); \
+    (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u16(__a, __b) \
+({ \
+    v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)__b)); \
+    (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u32(__a, __b) \
+({ \
+    v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)__b)); \
+    (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u64(__a, __b) \
+({ \
+    v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)__b)); \
+    (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Right shift elements in a 128-bit vector by an immediate value, saturate the results and place them in a 64-bit vector.
*/
+#define msa_qrshrn_n_s16(__a, __b) \
+({ \
+    v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \
+    v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+    (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s32(__a, __b) \
+({ \
+    v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \
+    v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+    (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s64(__a, __b) \
+({ \
+    v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \
+    v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+    (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u16(__a, __b) \
+({ \
+    v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \
+    v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+    (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u32(__a, __b) \
+({ \
+    v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \
+    v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+    (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u64(__a, __b) \
+({ \
+    v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \
+    v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+    (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Right shift elements in a 128-bit vector by an immediate value, saturate the results and place them in a 64-bit vector.
+   Input is signed and output is unsigned.
*/ +#define msa_qrshrun_n_s16(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s32(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s64(__a, __b) \ +({ \ + v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* pack */ +#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) +#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) + +/* qpack */ +#define msa_qpack_s16(__a, __b) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7))) +#define msa_qpack_s32(__a, __b) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15))) +#define msa_qpack_s64(__a, __b) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31))) +#define msa_qpack_u16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7))) +#define msa_qpack_u32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15))) +#define msa_qpack_u64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31))) + +/* qpacku */ +#define msa_qpacku_s16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \ + (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7))) +#define msa_qpacku_s32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \ + (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15))) +#define msa_qpacku_s64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \ + (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31))) + +/* packr */ +#define msa_packr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c)))) +#define 
+/* packr */
+#define msa_packr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
+#define msa_packr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))
+
+/* rpackr */
+#define msa_rpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
+#define msa_rpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))
+
+/* qrpackr */
+#define msa_qrpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
+                       (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
+                       (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
+                       (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
+#define msa_qrpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
+
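One place these pack-with-shift helpers earn their keep is fixed-point descaling; a sketch under the assumption of Q8 inputs (8 fractional bits), with a hypothetical name:

/* Rescale two Q8 fixed-point v8i16 vectors back to integers and
 * narrow to v16i8 in one step (arithmetic shift by 8, truncating;
 * use msa_qrpackr_s16 instead when rounding/saturation is needed). */
static inline v16i8 example_descale_q8(v8i16 lo, v8i16 hi)
{
    return msa_packr_s16(lo, hi, 8);
}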
+/* qrpackru */
+#define msa_qrpackru_s16(__a, __b, __c) \
+({ \
+  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \
+  v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \
+  (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+})
+
+#define msa_qrpackru_s32(__a, __b, __c) \
+({ \
+  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \
+  v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \
+  (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+})
+
+#define msa_qrpackru_s64(__a, __b, __c) \
+({ \
+  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \
+  v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \
+  (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+})
+
+/* Minimum values between corresponding elements in the two vectors are written to the returned vector. */
+#define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b))
+#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b))
+#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b))
+#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b))
+#define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b))
+#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b))
+#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b))
+#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b))
+#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b))
+#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b))
+
+/* Maximum values between corresponding elements in the two vectors are written to the returned vector. */
+#define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b))
+#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b))
+#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b))
+#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b))
+#define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b))
+#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b))
+#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b))
+#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b))
+#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b))
+#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b))
+
+/* Vector type reinterpretation */
+#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec))
+
+/* Add the odd elements in vector __a to the even elements in vector __b, producing double-width elements in the returned vector. */
+/* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */
+#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b)))
+/* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */
+#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b)))
+/* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */
+#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b)))
+
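A sketch of the hadd wrappers (illustrative; the name is hypothetical): passing the same vector as both operands yields pairwise widening addition, since the odd lanes of the first operand are added to the even lanes of the second:

/* Pairwise widening add: each adjacent pair of int8 lanes in v
 * becomes one int16 sum; this is what msa_paddlq_s8 below builds on. */
static inline v8i16 example_pairwise_add_s8(v16i8 v)
{
    return msa_hadd_s16(v, v);
}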
+/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. */
+#define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Copy odd elements in __a to the left half and odd elements in __b to the right half and return the result vector. */
+#define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
+
+#ifdef _MIPSEB
+#define LANE_IMM0_1(x) (0b1 - ((x) & 0b1))
+#define LANE_IMM0_3(x) (0b11 - ((x) & 0b11))
+#define LANE_IMM0_7(x) (0b111 - ((x) & 0b111))
+#define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111))
+#else
+#define LANE_IMM0_1(x) ((x) & 0b1)
+#define LANE_IMM0_3(x) ((x) & 0b11)
+#define LANE_IMM0_7(x) ((x) & 0b111)
+#define LANE_IMM0_15(x) ((x) & 0b1111)
+#endif
+
+#define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
+#define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15))
+#define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
+#define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
+#define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
+#define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
+#define msa_getq_lane_s32 __builtin_msa_copy_s_w
+#define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
+#define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
+#if (__mips == 64)
+#define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
+#define msa_getq_lane_s64 __builtin_msa_copy_s_d
+#else
+#define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#endif
+
+/* combine */
+#if (__mips == 64)
+#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]}))
+#else
+#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \
+                                                        ((v2u32)(b))[0], ((v2u32)(b))[1]}))
+#endif
+
+/* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */
+#define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b)
+
+/* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */
+#define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b)
+
+/* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */
+#define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b)
+
+/* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */
+#define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b) + +/* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */ +#define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b) + +/* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */ +#define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b) + +/* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */ +#define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b) + +/* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */ +#define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b) + +/* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */ +#define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b) + +/* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */ +#define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b) + +/* get_low, get_high */ +#if (__mips == 64) +#define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0)))) +#define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1)))) +#else +#define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0])) +#define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1])) +#endif + +/* v8i8 msa_get_low_s8(v16i8 __a) */ +#define msa_get_low_s8(__a) __GET_LOW(v8i8, __a) + +/* v4i16 msa_get_low_s16(v8i16 __a) */ +#define msa_get_low_s16(__a) __GET_LOW(v4i16, __a) + +/* v2i32 msa_get_low_s32(v4i32 __a) */ +#define msa_get_low_s32(__a) __GET_LOW(v2i32, __a) + +/* v1i64 msa_get_low_s64(v2i64 __a) */ +#define msa_get_low_s64(__a) __GET_LOW(v1i64, __a) + +/* v8u8 msa_get_low_u8(v16u8 __a) */ +#define msa_get_low_u8(__a) __GET_LOW(v8u8, __a) + +/* v4u16 msa_get_low_u16(v8u16 __a) */ +#define msa_get_low_u16(__a) __GET_LOW(v4u16, __a) + +/* v2u32 msa_get_low_u32(v4u32 __a) */ +#define msa_get_low_u32(__a) __GET_LOW(v2u32, __a) + +/* v1u64 msa_get_low_u64(v2u64 __a) */ +#define msa_get_low_u64(__a) __GET_LOW(v1u64, __a) + +/* v2f32 msa_get_low_f32(v4f32 __a) */ +#define msa_get_low_f32(__a) __GET_LOW(v2f32, __a) + +/* v1f64 msa_get_low_f64(v2f64 __a) */ +#define msa_get_low_f64(__a) __GET_LOW(v1f64, __a) + +/* v8i8 msa_get_high_s8(v16i8 __a) */ +#define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a) + +/* v4i16 msa_get_high_s16(v8i16 __a) */ +#define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a) + +/* v2i32 msa_get_high_s32(v4i32 __a) */ +#define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a) + +/* v1i64 msa_get_high_s64(v2i64 __a) */ +#define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a) + +/* v8u8 msa_get_high_u8(v16u8 __a) */ +#define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a) + +/* v4u16 msa_get_high_u16(v8u16 __a) */ +#define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a) + +/* v2u32 msa_get_high_u32(v4u32 __a) */ +#define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a) + +/* v1u64 msa_get_high_u64(v2u64 __a) */ +#define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a) + +/* v2f32 msa_get_high_f32(v4f32 __a) */ +#define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a) + +/* v1f64 msa_get_high_f64(v2f64 __a) */ +#define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a) + +/* ri = ai * b[lane] */ +/* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */ +#define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane)) + +/* ri = ai + bi * c[lane] */ +/* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */ +#define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane))) + +/* uint16_t msa_sum_u16(v8u16 __a)*/ +#define msa_sum_u16(__a) \ +({ \ + v4u32 _b; \ + v2u64 _c; \ + _b = 
__builtin_msa_hadd_u_w(__a, __a); \
+  _c = __builtin_msa_hadd_u_d(_b, _b); \
+  (uint16_t)(_c[0] + _c[1]); \
+})
+
+/* int16_t msa_sum_s16(v8i16 __a) */
+#define msa_sum_s16(__a) \
+({ \
+  v4i32 _b; \
+  v2i64 _c; \
+  _b = __builtin_msa_hadd_s_w(__a, __a); \
+  _c = __builtin_msa_hadd_s_d(_b, _b); \
+  (int16_t)(_c[0] + _c[1]); \
+})
+
+/* uint32_t msa_sum_u32(v4u32 __a) */
+#define msa_sum_u32(__a) \
+({ \
+  v2u64 _b; \
+  _b = __builtin_msa_hadd_u_d(__a, __a); \
+  (uint32_t)(_b[0] + _b[1]); \
+})
+
+/* int32_t msa_sum_s32(v4i32 __a) */
+#define msa_sum_s32(__a) \
+({ \
+  v2i64 _b; \
+  _b = __builtin_msa_hadd_s_d(__a, __a); \
+  (int32_t)(_b[0] + _b[1]); \
+})
+
+/* uint8_t msa_sum_u8(v16u8 __a) */
+#define msa_sum_u8(__a) \
+({ \
+  v8u16 _b16; \
+  v4u32 _c32; \
+  _b16 = __builtin_msa_hadd_u_h(__a, __a); \
+  _c32 = __builtin_msa_hadd_u_w(_b16, _b16); \
+  (uint8_t)msa_sum_u32(_c32); \
+})
+
+/* int8_t msa_sum_s8(v16i8 __a) */
+#define msa_sum_s8(__a) \
+({ \
+  v8i16 _b16; \
+  v4i32 _c32; \
+  _b16 = __builtin_msa_hadd_s_h(__a, __a); \
+  _c32 = __builtin_msa_hadd_s_w(_b16, _b16); \
+  (int8_t)msa_sum_s32(_c32); \
+})
+
+/* float msa_sum_f32(v4f32 __a) */
+#define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3])
+
+/* v8u16 msa_paddlq_u8(v16u8 __a) */
+#define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a))
+
+/* v8i16 msa_paddlq_s8(v16i8 __a) */
+#define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a))
+
+/* v4u32 msa_paddlq_u16(v8u16 __a) */
+#define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a))
+
+/* v4i32 msa_paddlq_s16(v8i16 __a) */
+#define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a))
+
+/* v2u64 msa_paddlq_u32(v4u32 __a) */
+#define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a))
+
+/* v2i64 msa_paddlq_s32(v4i32 __a) */
+#define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a))
+
+#define V8U8_2_V8U16(x) {(uint16_t)x[0], (uint16_t)x[1], (uint16_t)x[2], (uint16_t)x[3], \
+                         (uint16_t)x[4], (uint16_t)x[5], (uint16_t)x[6], (uint16_t)x[7]}
+#define V8U8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
+                         (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
+#define V8I8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
+                         (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
+#define V4U16_2_V4U32(x) {(uint32_t)x[0], (uint32_t)x[1], (uint32_t)x[2], (uint32_t)x[3]}
+#define V4U16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
+#define V4I16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
+#define V2U32_2_V2U64(x) {(uint64_t)x[0], (uint64_t)x[1]}
+#define V2U32_2_V2I64(x) {(int64_t)x[0], (int64_t)x[1]}
+
+/* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b) */
+#define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b)))
+
+/* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b) */
+#define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b)))
+
+/* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */
+#define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b)))
+
+/* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */
+#define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b)))
+
+/* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */
+#define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b)))
+
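Combining the widening multiply with the horizontal sums gives a small dot product; a sketch (illustrative; the name is hypothetical, and note the 16-bit truncation):

/* Dot product of two 8-lane u8 vectors: widen to u16 products
 * (255*255 still fits in 16 bits), then reduce. msa_sum_u16
 * truncates the final sum to uint16_t, so large inputs can wrap. */
static inline uint16_t example_dot_u8(v8u8 a, v8u8 b)
{
    return msa_sum_u16(msa_mull_u8(a, b));
}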
+/* bitwise and: __builtin_msa_and_v */
+#define msa_andq_u8(__a, __b) ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s8(__a, __b) ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise or: __builtin_msa_or_v */
+#define msa_orrq_u8(__a, __b) ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s8(__a, __b) ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise xor: __builtin_msa_xor_v */
+#define msa_eorq_u8(__a, __b) ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s8(__a, __b) ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff) */
+#define msa_mvnq_u8(__a) ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s8(__a) ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+
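These primitives compose into the usual mask tricks; a bit-select sketch (illustrative; the name is hypothetical):

/* (a & ~mask) | (b & mask): take bits from b where mask is 1,
 * from a elsewhere. */
static inline v16u8 example_bitselect_u8(v16u8 mask, v16u8 a, v16u8 b)
{
    return msa_orrq_u8(msa_andq_u8(a, msa_mvnq_u8(mask)),
                       msa_andq_u8(b, mask));
}

The same effect is available in a single instruction through msa_bslq_u8 (__builtin_msa_bsel_v), defined further down in this header.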
+/* compare equal: ceq -> ri = ai == bi ? 1...1:0...0 */
+#define msa_ceqq_u8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_s8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare less-than: clt -> ri = ai < bi ? 1...1:0...0 */
+#define msa_cltq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cltq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0 */
+#define msa_cgtq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgtq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a)))
+
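Since the comparisons return all-ones/all-zeros lane masks, they combine directly with the bitwise helpers above; a sketch (illustrative; the name is hypothetical):

/* Per-lane "a > b ? a : 0": the cgt mask is all ones where a > b,
 * so the bitwise and keeps those lanes and zeroes the rest. */
static inline v4i32 example_keep_greater(v4i32 a, v4i32 b)
{
    v4u32 mask = msa_cgtq_s32(a, b);
    return msa_andq_s32(a, (v4i32)mask);
}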
+/* compare less-equal: cle -> ri = ai <= bi ? 1...1:0...0 */
+#define msa_cleq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cleq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0 */
+#define msa_cgeq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgeq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a)))
+
+/* shift left logical: shl -> ri = ai << bi; */
+#define msa_shlq_u8(__a, __b) ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shlq_s8(__a, __b) ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
+
+/* immediate shift left logical: shl -> ri = ai << imm; */
+#define msa_shlq_n_u8(__a, __imm) ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm))
+#define msa_shlq_n_s8(__a, __imm) ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm))
+#define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm))
+#define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm))
+#define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm))
+#define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm))
+#define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm))
+#define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm))
+
+/* shift right: shrq -> ri = ai >> bi; */
+#define msa_shrq_u8(__a, __b) ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shrq_s8(__a, __b) ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b)))
+
+/* immediate shift right: shr -> ri = ai >> imm; */
+#define msa_shrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm))
+#define msa_shrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm))
+#define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm))
+#define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm))
+#define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm))
+#define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm))
+#define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm))
+#define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm))
+
+/* immediate shift right rounded: shr -> ri = ai >> imm, rounded; */
+#define msa_rshrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm))
+#define msa_rshrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm))
+#define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm))
+#define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm))
+#define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm))
+#define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm))
+#define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm))
+#define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm))
+
+/* Vector rounding shift right: qrshr -> ri = ai >> bi, rounded (srar rounds the result; a right shift cannot overflow, so no saturation is applied) */
+#define msa_qrshrq_s32(a, b) ((v4i32)__msa_srar_w((v4i32)(a), (v4i32)(b)))
+
+/* Rename the MSA builtin functions to unify the naming style for intrin_msa.hpp */
+#define msa_qaddq_u8 __builtin_msa_adds_u_b
+#define msa_qaddq_s8 __builtin_msa_adds_s_b
+#define msa_qaddq_u16 __builtin_msa_adds_u_h
+#define msa_qaddq_s16 __builtin_msa_adds_s_h
+#define msa_qaddq_u32 __builtin_msa_adds_u_w
+#define msa_qaddq_s32 __builtin_msa_adds_s_w
+#define msa_qaddq_u64 __builtin_msa_adds_u_d
+#define msa_qaddq_s64 __builtin_msa_adds_s_d
+#define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))
+#define msa_addq_s8 __builtin_msa_addv_b
+#define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
+#define msa_addq_s16 __builtin_msa_addv_h
+#define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
+#define msa_addq_s32 __builtin_msa_addv_w
+#define msa_addq_f32 __builtin_msa_fadd_w
+#define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
+#define msa_addq_s64 __builtin_msa_addv_d
+#define msa_addq_f64 __builtin_msa_fadd_d
+#define msa_qsubq_u8 __builtin_msa_subs_u_b
+#define msa_qsubq_s8 __builtin_msa_subs_s_b
+#define msa_qsubq_u16 __builtin_msa_subs_u_h
+#define msa_qsubq_s16 __builtin_msa_subs_s_h
+#define msa_subq_u8(a, b) 
((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b))) +#define msa_subq_s8 __builtin_msa_subv_b +#define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b))) +#define msa_subq_s16 __builtin_msa_subv_h +#define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b))) +#define msa_subq_s32 __builtin_msa_subv_w +#define msa_subq_f32 __builtin_msa_fsub_w +#define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b))) +#define msa_subq_s64 __builtin_msa_subv_d +#define msa_subq_f64 __builtin_msa_fsub_d +#define msa_mulq_u8(a, b) ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_s8(a, b) ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_f32 __builtin_msa_fmul_w +#define msa_mulq_f64 __builtin_msa_fmul_d +#define msa_divq_f32 __builtin_msa_fdiv_w +#define msa_divq_f64 __builtin_msa_fdiv_d +#define msa_dotp_s_h __builtin_msa_dotp_s_h +#define msa_dotp_s_w __builtin_msa_dotp_s_w +#define msa_dotp_s_d __builtin_msa_dotp_s_d +#define msa_dotp_u_h __builtin_msa_dotp_u_h +#define msa_dotp_u_w __builtin_msa_dotp_u_w +#define msa_dotp_u_d __builtin_msa_dotp_u_d +#define msa_dpadd_s_h __builtin_msa_dpadd_s_h +#define msa_dpadd_s_w __builtin_msa_dpadd_s_w +#define msa_dpadd_s_d __builtin_msa_dpadd_s_d +#define msa_dpadd_u_h __builtin_msa_dpadd_u_h +#define msa_dpadd_u_w __builtin_msa_dpadd_u_w +#define msa_dpadd_u_d __builtin_msa_dpadd_u_d + +#define ILVRL_B2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \ + } while (0) +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) +#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \ + } while (0) +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \ + } while (0) +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_UW(...) 
ILVRL_W2(v4u32, __VA_ARGS__) + +/* absq, qabsq (r = |a|;) */ +#define msa_absq_s8(a) __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0)) +#define msa_absq_s16(a) __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0)) +#define msa_absq_s32(a) __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0)) +#define msa_absq_s64(a) __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0)) +#define msa_absq_f32(a) ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31)) +#define msa_absq_f64(a) ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63)) +#define msa_qabsq_s8(a) __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0)) +#define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0)) +#define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0)) +#define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0)) + +/* abdq, qabdq (r = |a - b|;) */ +#define msa_abdq_u8 __builtin_msa_asub_u_b +#define msa_abdq_s8 __builtin_msa_asub_s_b +#define msa_abdq_u16 __builtin_msa_asub_u_h +#define msa_abdq_s16 __builtin_msa_asub_s_h +#define msa_abdq_u32 __builtin_msa_asub_u_w +#define msa_abdq_s32 __builtin_msa_asub_s_w +#define msa_abdq_u64 __builtin_msa_asub_u_d +#define msa_abdq_s64 __builtin_msa_asub_s_d +#define msa_abdq_f32(a, b) msa_absq_f32(__builtin_msa_fsub_w(a, b)) +#define msa_abdq_f64(a, b) msa_absq_f64(__builtin_msa_fsub_d(a, b)) +#define msa_qabdq_s8(a, b) msa_qabsq_s8(__builtin_msa_subs_s_b(a, b)) +#define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b)) +#define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b)) +#define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b)) + +/* sqrtq, rsqrtq */ +#define msa_sqrtq_f32 __builtin_msa_fsqrt_w +#define msa_sqrtq_f64 __builtin_msa_fsqrt_d +#define msa_rsqrtq_f32 __builtin_msa_frsqrt_w +#define msa_rsqrtq_f64 __builtin_msa_frsqrt_d + + +/* mlaq: r = a + b * c; */ +__extension__ extern __inline v4i32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c) +{ + __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2i64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c) +{ + __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v4f32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c) +{ + __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2f64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c) +{ + __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +/* cntq */ +#define msa_cntq_s8 __builtin_msa_pcnt_b +#define msa_cntq_s16 __builtin_msa_pcnt_h +#define msa_cntq_s32 __builtin_msa_pcnt_w +#define msa_cntq_s64 __builtin_msa_pcnt_d + +/* bslq (a: mask; r = b(if a == 0); r = c(if a == 1);) */ +#define msa_bslq_u8 __builtin_msa_bsel_v + +/* ilvrq, ilvlq (For EL only, ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */ +#define msa_ilvrq_s8 __builtin_msa_ilvr_b +#define 
msa_ilvrq_s16 __builtin_msa_ilvr_h
+#define msa_ilvrq_s32 __builtin_msa_ilvr_w
+#define msa_ilvrq_s64 __builtin_msa_ilvr_d
+#define msa_ilvlq_s8 __builtin_msa_ilvl_b
+#define msa_ilvlq_s16 __builtin_msa_ilvl_h
+#define msa_ilvlq_s32 __builtin_msa_ilvl_w
+#define msa_ilvlq_s64 __builtin_msa_ilvl_d
+
+/* ilvevq, ilvodq (ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3; ) */
+#define msa_ilvevq_s8 __builtin_msa_ilvev_b
+#define msa_ilvevq_s16 __builtin_msa_ilvev_h
+#define msa_ilvevq_s32 __builtin_msa_ilvev_w
+#define msa_ilvevq_s64 __builtin_msa_ilvev_d
+#define msa_ilvodq_s8 __builtin_msa_ilvod_b
+#define msa_ilvodq_s16 __builtin_msa_ilvod_h
+#define msa_ilvodq_s32 __builtin_msa_ilvod_w
+#define msa_ilvodq_s64 __builtin_msa_ilvod_d
+
+/* extq (r = (a || b); concatenate a and b, then take elements starting at index c) */
+#ifdef _MIPSEB
+#define msa_extq_s8(a, b, c) \
+(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
+#else
+#define msa_extq_s8(a, b, c) \
+(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
+#endif /* _MIPSEB */
+
+/* cvttruncq, cvttintq, cvtrintq */
+#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
+#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
+#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
+#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
+#define msa_cvttintq_u32_f32 __builtin_msa_ftint_u_w
+#define msa_cvttintq_s32_f32 __builtin_msa_ftint_s_w
+#define msa_cvttintq_u64_f64 __builtin_msa_ftint_u_d
+#define msa_cvttintq_s64_f64 __builtin_msa_ftint_s_d
+#define msa_cvtrintq_f32 __builtin_msa_frint_w
+#define msa_cvtrintq_f64 __builtin_msa_frint_d
+
+/* cvtfintq, cvtfq */
+#define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w
+#define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w
+#define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d
+#define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d
+#define msa_cvtfq_f32_f64 __builtin_msa_fexdo_w
+#define msa_cvtflq_f64_f32 __builtin_msa_fexupr_d
+#define msa_cvtfhq_f64_f32 __builtin_msa_fexupl_d
+
+#define msa_addl_u8(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b)))
+#define msa_addl_s8(a, b) (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
+#define msa_addl_u16(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
+#define msa_addl_s16(a, b) (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), 
(v4i32)V4I16_2_V4I32(b))) +#define msa_subl_s16(a, b) (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b))) +#define msa_recpeq_f32 __builtin_msa_frcp_w +#define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b))) + +#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \ +{ \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \ + *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \ + *b 
= (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16) + +#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v00 = msa_ld1q_##suffix(ptr); \ + _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \ + _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \ + _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \ + _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \ + _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \ + *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \ + *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); 
\ + *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \ + *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 
0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \ + v1 = 
__builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \ + *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \ +} + +MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \ + _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \ + _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \ + _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \ + *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \ + *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \ + *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \ + *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \ +{ \ + _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \ + _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \ + msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = 
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \
+  _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \
+  *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \
+  *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)
+
+__extension__ extern __inline v8i16
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_qdmulhq_n_s16(v8i16 a, int16_t b)
+{
+  v8i16 a_lo, a_hi;
+  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);
+  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1),
+                       msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /*__mips_msa*/
+#endif /* OPENCV_CORE_MSA_MACROS_H */
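As a quick orientation for the interleave helpers above, here is a minimal usage sketch: it splits 16 RGBA pixels into four planar vectors via msa_ld4q_u8 (the pckev/pckod ladder defined above) and stores each plane with msa_st1q_u8. The function name, buffer layout, and include path are illustrative, not part of the patch, and a MIPS target built with -mmsa is assumed.

    #include <stdint.h>
    #include "msa_macros.h"   /* illustrative include path for the header above */

    /* De-interleave 16 RGBA pixels (64 bytes) into four 16-byte planes. */
    static void rgba_to_planes(const uint8_t* src,
                               uint8_t* r, uint8_t* g, uint8_t* b, uint8_t* a)
    {
        v16u8 vr, vg, vb, va;
        msa_ld4q_u8(src, &vr, &vg, &vb, &va);  /* gathers every 4th byte into each plane */
        msa_st1q_u8(r, vr);
        msa_st1q_u8(g, vg);
        msa_st1q_u8(b, vb);
        msa_st1q_u8(a, va);
    }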
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index 0c1b33a..cd41237 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -409,13 +409,13 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     int x = 0;
 
 #if CV_SIMD
-    #if !CV_NEON
+    #if !CV_NEON && !CV_MSA
     if (is_aligned(src1, src2, dst))
     {
         for (; x <= width - wide_step_l; x += wide_step_l)
         {
             ldr::la(src1 + x, src2 + x, dst + x);
-            #if !CV_NEON && CV_SIMD_WIDTH == 16
+            #if CV_SIMD_WIDTH == 16
             ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
             #endif
         }
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index fcc88a1..5d18802 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2476,6 +2476,45 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
             i += blockSize;
         }
     }
+#elif CV_MSA
+    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+    v4i32 v_zero = msa_dupq_n_s32(0);
+    CV_DECL_ALIGNED(16) int buf[4];
+
+    while( i < len0 )
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v4i32 v_sum = v_zero;
+
+        int j = 0;
+        for( ; j <= blockSize - 16; j += 16 )
+        {
+            v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
+
+            v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+
+            v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
+            v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+        }
+
+        for( ; j <= blockSize - 8; j += 8 )
+        {
+            v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
+        }
+
+        msa_st1q_s32(buf, v_sum);
+        r += buf[0] + buf[1] + buf[2] + buf[3];
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
 #endif
 
     return r + dotProd_(src1, src2, len - i);
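In scalar terms, the CV_MSA block above computes nothing more than the reference loop below; the vector code widens s8 to s16 (msa_movl_s8) and multiply-accumulates into four s32 lanes (msa_mlal_s16), then flushes the lanes into the double total after each block of at most 1 << 14 elements so the 32-bit accumulators cannot overflow. An illustrative reference, not part of the patch:

    /* Scalar equivalent of the dotProd_8s MSA kernel above. */
    static double dot_prod_8s_ref(const signed char* src1, const signed char* src2, int len)
    {
        double r = 0.0;
        for (int i = 0; i < len; i++)
            r += (int)src1[i] * (int)src2[i];  /* widen before multiplying */
        return r;
    }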
diff --git a/modules/core/src/parallel_impl.cpp b/modules/core/src/parallel_impl.cpp
index bc64fce..adf2bd3 100644
--- a/modules/core/src/parallel_impl.cpp
+++ b/modules/core/src/parallel_impl.cpp
@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __arm__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
+# elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
+#   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __PPC64__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
 # else
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index f723a8c..8d7ccf7 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -368,6 +368,8 @@ struct HWFeatures
         g_hwFeatureNames[CPU_VSX] = "VSX";
         g_hwFeatureNames[CPU_VSX3] = "VSX3";
 
+        g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
+
         g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
         g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
         g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
@@ -557,6 +559,9 @@ struct HWFeatures
     #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
         have[CV_CPU_NEON] = true;
     #endif
+    #ifdef __mips_msa
+        have[CV_CPU_MSA] = true;
+    #endif
         // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
         have[CV_CPU_VSX] = (CV_VSX);
         // TODO: Check VSX3 availability in runtime for other platforms
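With the HWFeatures hookup above, applications can query the new flag through OpenCV's existing runtime API. A minimal sketch, assuming a build of this branch where CV_CPU_MSA is the feature id defined by the patch:

    #include <opencv2/core/utility.hpp>
    #include <cstdio>

    int main()
    {
        /* Note: the detection above is a compile-time __mips_msa test, so this
         * returns true only for binaries actually built with MSA enabled. */
        const bool msa = cv::checkHardwareSupport(CV_CPU_MSA);
        std::printf("MSA supported: %s\n", msa ? "yes" : "no");
        return 0;
    }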
diff --git a/platforms/linux/mips.toolchain.cmake b/platforms/linux/mips.toolchain.cmake
new file mode 100755
index 0000000..425d5df
--- /dev/null
+++ b/platforms/linux/mips.toolchain.cmake
@@ -0,0 +1,80 @@
+# ----------------------------------------------------------------------------------------------
+# The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use the cmake-gui application for build script configuration and generation:
+# 1. Run cmake-gui
+# 2. Specify the toolchain file for cross-compiling; either mips32r5el-gnu.toolchain.cmake or
+#    mips64r6el-gnu.toolchain.cmake can be selected.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 && make install
+# ----------------------------------------------------------------------------------------------
+
+if(COMMAND toolchain_save_config)
+  return() # prevent recursive call
+endif()
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+if(NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
+  set(CMAKE_SYSTEM_PROCESSOR mips)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL mips AND NOT MIPS_IGNORE_FP)
+  set(FLOAT_ABI_SUFFIX "")
+endif()
+
+if(NOT "x${GCC_COMPILER_VERSION}" STREQUAL "x")
+  set(__GCC_VER_SUFFIX "-${GCC_COMPILER_VERSION}")
+endif()
+
+if(NOT DEFINED CMAKE_C_COMPILER)
+  find_program(CMAKE_C_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-gcc${__GCC_VER_SUFFIX})
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+  find_program(CMAKE_CXX_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-g++${__GCC_VER_SUFFIX})
+endif()
+if(NOT DEFINED CMAKE_LINKER)
+  find_program(CMAKE_LINKER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld)
+endif()
+if(NOT DEFINED CMAKE_AR)
+  find_program(CMAKE_AR NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar)
+endif()
+
+if(NOT DEFINED MIPS_LINUX_SYSROOT AND DEFINED GNU_MACHINE)
+  set(MIPS_LINUX_SYSROOT /usr/bin)
+endif()
+
+if(NOT DEFINED CMAKE_CXX_FLAGS)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "mips32r5el")
+    set(CMAKE_C_FLAGS "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
+    set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
+    set(CMAKE_CXX_FLAGS "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
+    set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
+    set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -ldl -latomic" CACHE INTERNAL "Added for mips cross build error")
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64r6el")
+    set(CMAKE_C_FLAGS "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
+    set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
+    set(CMAKE_CXX_FLAGS "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
+    set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
+    set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -ldl" CACHE INTERNAL "Added for mips cross build error")
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+  endif()
+  set(CMAKE_SHARED_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+  set(CMAKE_MODULE_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${MIPS_LINUX_SYSROOT})
+
+set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
+    MIPS_LINUX_SYSROOT
+)
+toolchain_save_config()
diff --git a/platforms/linux/mips32r5el-gnu.toolchain.cmake b/platforms/linux/mips32r5el-gnu.toolchain.cmake
new file mode 100755
index 0000000..1937270
--- /dev/null
+++ b/platforms/linux/mips32r5el-gnu.toolchain.cmake
@@ -0,0 +1,14 @@
+# ----------------------------------------------------------------------------------------------
+# The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use cmake-gui for build script configuration and generation:
+# 1. Run cmake-gui
+# 2. Specify the toolchain file mips32r5el-gnu.toolchain.cmake for cross-compiling.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 && make install
+# ----------------------------------------------------------------------------------------------
+set(CMAKE_SYSTEM_PROCESSOR mips32r5el)
+set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
+set(GNU_MACHINE "mips-mti-linux-gnu" CACHE STRING "GNU compiler triple")
+include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")
diff --git a/platforms/linux/mips64r6el-gnu.toolchain.cmake b/platforms/linux/mips64r6el-gnu.toolchain.cmake
new file mode 100755
index 0000000..b022240
--- /dev/null
+++ b/platforms/linux/mips64r6el-gnu.toolchain.cmake
@@ -0,0 +1,14 @@
+# ----------------------------------------------------------------------------------------------
+# The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use cmake-gui for build script configuration and generation:
+# 1. Run cmake-gui
+# 2. Specify the toolchain file mips64r6el-gnu.toolchain.cmake for cross-compiling.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 && make install
+# ----------------------------------------------------------------------------------------------
+set(CMAKE_SYSTEM_PROCESSOR mips64r6el)
+set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
+set(GNU_MACHINE "mips-img-linux-gnu" CACHE STRING "GNU compiler triple")
+include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")
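For those who prefer the command line over cmake-gui, an equivalent non-interactive configure and build might look as follows; the paths and job count are illustrative, and the CodeScape mips-mti-linux-gnu tools are assumed to be on PATH:

    mkdir build && cd build
    cmake -DCMAKE_TOOLCHAIN_FILE=../platforms/linux/mips32r5el-gnu.toolchain.cmake ..
    make -j4 && make install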