* Added MSA implementations for mips platforms. Intrinsics for MSA and build scripts for MIPS platforms are added.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Removed some unused code in mips.toolchain.cmake.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Added comments for mips toolchain configuration and disabled compiling warnings for libpng.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Fixed the build error of unsupported opcode 'pause' when mips isa_rev is less than 2.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Removed FP16 related item in MSA option defines in OpenCVCompilerOptimizations.cmake.
2. Use CV_CPU_COMPILE_MSA instead of __mips_msa for MSA feature check in cv_cpu_dispatch.h.
3. Removed hasSIMD128() in intrin_msa.hpp.
4. Define CPU_MSA as 150.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Removed unnecessary CV_SIMD128_64F guarding in intrin_msa.hpp.
2. Removed unnecessary CV_MSA related code block in dotProd_8u().
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Defined CPU_MSA_FLAGS_ON as "-mmsa".
2. Removed CV_SIMD128_64F guardings in intrin_msa.hpp.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Removed unused msa_mlal_u16() and msa_mlal_s16() from msa_macros.h.
Signed-off-by: Fei Wu <fwu@wavecomp.com>
add_definitions(-DPNG_INTEL_SSE)
endif()
+# set definitions and sources for MIPS
+# When MSA is part of the OpenCV CPU baseline, build libpng's MSA filter
+# implementations (PNG_MIPS_MSA_OPT=2 selects the MSA code path in libpng)
+# and silence -Wshadow raised by the vendored MIPS sources; otherwise
+# compile the MSA code out entirely (PNG_MIPS_MSA_OPT=0).
+if(";${CPU_BASELINE_FINAL};" MATCHES "MSA")
+ list(APPEND lib_srcs mips/mips_init.c mips/filter_msa_intrinsics.c)
+ add_definitions(-DPNG_MIPS_MSA_OPT=2)
+ ocv_warnings_disable(CMAKE_C_FLAGS -Wshadow)
+else()
+ add_definitions(-DPNG_MIPS_MSA_OPT=0)
+endif()
+
if(PPC64LE OR PPC64)
# VSX3 features are backwards compatible
if(";${CPU_BASELINE_FINAL};" MATCHES "VSX.*"
--- /dev/null
+
+/* filter_msa_intrinsics.c - MSA optimised filter functions
+ *
+ * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Written by Mandar Sahastrabuddhe, August 2016.
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+/* This code requires -mmsa on the command line: */
+#if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
+
+#include <msa.h>
+
+/* libpng row pointers are not necessarily aligned to any particular boundary,
+ * however this code will only work with appropriate alignment. mips/mips_init.c
+ * checks for this (and will not compile unless it is done). This code uses
+ * variants of png_aligncast to avoid compiler warnings.
+ */
+#define png_ptr(type,pointer) png_aligncast(type *,pointer)
+#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
+
+/* The following relies on a variable 'temp_pointer' being declared with type
+ * 'type'. This is written this way just to hide the GCC strict aliasing
+ * warning; note that the code is safe because there never is an alias between
+ * the input and output pointers.
+ */
+#define png_ldr(type,pointer)\
+ (temp_pointer = png_ptr(type,pointer), *temp_pointer)
+
+#if PNG_MIPS_MSA_OPT > 0
+
+#ifdef CLANG_BUILD
+ #define MSA_SRLI_B(a, b) __msa_srli_b((v16i8) a, b)
+
+ #define LW(psrc) \
+ ( { \
+ uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
+ uint32_t val_m; \
+ \
+ asm volatile ( \
+ "lw %[val_m], %[psrc_lw_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_lw_m] "m" (*psrc_lw_m) \
+ ); \
+ \
+ val_m; \
+ } )
+
+ #define SH(val, pdst) \
+ { \
+ uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
+ uint16_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sh %[val_m], %[pdst_sh_m] \n\t" \
+ \
+ : [pdst_sh_m] "=m" (*pdst_sh_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #define SW(val, pdst) \
+ { \
+ uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
+ uint32_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sw %[val_m], %[pdst_sw_m] \n\t" \
+ \
+ : [pdst_sw_m] "=m" (*pdst_sw_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #if (__mips == 64)
+ #define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
+ uint64_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sd %[val_m], %[pdst_sd_m] \n\t" \
+ \
+ : [pdst_sd_m] "=m" (*pdst_sd_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+ #else
+ #define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ }
+ #endif
+#else
+ #define MSA_SRLI_B(a, b) (a >> b)
+
+#if (__mips_isa_rev >= 6)
+ #define LW(psrc) \
+ ( { \
+ uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
+ uint32_t val_m; \
+ \
+ asm volatile ( \
+ "lw %[val_m], %[psrc_lw_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_lw_m] "m" (*psrc_lw_m) \
+ ); \
+ \
+ val_m; \
+ } )
+
+ #define SH(val, pdst) \
+ { \
+ uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
+ uint16_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sh %[val_m], %[pdst_sh_m] \n\t" \
+ \
+ : [pdst_sh_m] "=m" (*pdst_sh_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #define SW(val, pdst) \
+ { \
+ uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
+ uint32_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sw %[val_m], %[pdst_sw_m] \n\t" \
+ \
+ : [pdst_sw_m] "=m" (*pdst_sw_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #if (__mips == 64)
+ #define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
+ uint64_t val_m = (val); \
+ \
+ asm volatile ( \
+ "sd %[val_m], %[pdst_sd_m] \n\t" \
+ \
+ : [pdst_sd_m] "=m" (*pdst_sd_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+ #else
+ #define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ }
+ #endif
+#else // !(__mips_isa_rev >= 6)
+ #define LW(psrc) \
+ ( { \
+ uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
+ uint32_t val_m; \
+ \
+ asm volatile ( \
+ "ulw %[val_m], %[psrc_lw_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_lw_m] "m" (*psrc_lw_m) \
+ ); \
+ \
+ val_m; \
+ } )
+
+ #define SH(val, pdst) \
+ { \
+ uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
+ uint16_t val_m = (val); \
+ \
+ asm volatile ( \
+ "ush %[val_m], %[pdst_sh_m] \n\t" \
+ \
+ : [pdst_sh_m] "=m" (*pdst_sh_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #define SW(val, pdst) \
+ { \
+ uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
+ uint32_t val_m = (val); \
+ \
+ asm volatile ( \
+ "usw %[val_m], %[pdst_sw_m] \n\t" \
+ \
+ : [pdst_sw_m] "=m" (*pdst_sw_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+ }
+
+ #define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ }
+
+ #define SW_ZERO(pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *) (pdst); \
+ \
+ asm volatile ( \
+ "usw $0, %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : \
+ ); \
+ }
+#endif // (__mips_isa_rev >= 6)
+#endif
+
+#define LD_B(RTYPE, psrc) *((RTYPE *) (psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+{ \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+{ \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+{ \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+{ \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+}
+#define ADD3(in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ out2 = in4 + in5; \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+}
+
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
+ out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
+}
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
+ out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+{ \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
+ out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
+}
+#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
+
+#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \
+{ \
+ v16i8 zero_m = { 0 }; \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \
+}
+#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
+
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
+ out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
+}
+#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
+
+#define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2) \
+{ \
+ RTYPE zero = {0}; \
+ \
+ out0 = __msa_add_a_h((v8i16) zero, in0); \
+ out1 = __msa_add_a_h((v8i16) zero, in1); \
+ out2 = __msa_add_a_h((v8i16) zero, in2); \
+}
+#define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__)
+
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
+ out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+/* Paeth predictor selection step.
+ * inp0/inp1/inp2 are halfword distances, inp3/inp4/inp5 are the matching
+ * byte-lane predictor candidates, out0 is the accumulator:
+ *   - keep min(inp0, inp1) in inp0 and the candidate paired with the
+ *     smaller distance in inp4 (mask from clt_u_h, packed to bytes for
+ *     the byte-wise bmnz selects);
+ *   - if inp2 beats that minimum, switch the candidate to inp5;
+ *   - add the chosen predictor bytes into out0.
+ * NOTE(review): the exact pa/pb/pc-to-argument mapping is fixed by the
+ * callers in png_read_filter_row_paeth{3,4}_msa below.
+ */
+#define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0) \
+{ \
+ v8i16 _sel_h0, _sel_h1; \
+ v16u8 _sel_b0, _sel_b1; \
+ _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0); \
+ _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0); \
+ inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0); \
+ inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0); \
+ _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0); \
+ _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1); \
+ inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1); \
+ out0 += inp4; \
+}
+
+/* 'Up' filter: row[i] = row[i] + prev_row[i] (mod 256).
+ * Main loop processes 64 bytes per iteration with four vector adds; the
+ * branches below handle the remaining 32-, 16- and sub-16-byte tails.
+ * NOTE(review): sub-16-byte remainders are still processed with full
+ * 16-byte vector loads/stores, i.e. the row buffers are assumed to be
+ * accessible slightly past rowbytes -- confirm against libpng's row
+ * allocation before reusing this code elsewhere.
+ */
+void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ size_t i, cnt, cnt16, cnt32;
+ size_t istop = row_info->rowbytes;
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ /* Whole 64-byte chunks. */
+ for (i = 0; i < (istop >> 6); i++)
+ {
+ LD_UB4(rp, 16, src0, src1, src2, src3);
+ LD_UB4(pp, 16, src4, src5, src6, src7);
+ pp += 64;
+
+ ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
+ src0, src1, src2, src3);
+
+ ST_UB4(src0, src1, src2, src3, rp, 16);
+ rp += 64;
+ }
+
+ /* Tail: up to 63 bytes, dispatched on the 32/16/low-4 bit groups. */
+ if (istop & 0x3F)
+ {
+ cnt32 = istop & 0x20;
+ cnt16 = istop & 0x10;
+ cnt = istop & 0xF;
+
+ if(cnt32)
+ {
+ if (cnt16 && cnt)
+ {
+ /* 49..63 bytes left: four vectors. */
+ LD_UB4(rp, 16, src0, src1, src2, src3);
+ LD_UB4(pp, 16, src4, src5, src6, src7);
+
+ ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
+ src0, src1, src2, src3);
+
+ ST_UB4(src0, src1, src2, src3, rp, 16);
+ rp += 64;
+ }
+ else if (cnt16 || cnt)
+ {
+ /* 33..48 bytes left: three vectors. */
+ LD_UB2(rp, 16, src0, src1);
+ LD_UB2(pp, 16, src4, src5);
+ pp += 32;
+ src2 = LD_UB(rp + 32);
+ src6 = LD_UB(pp);
+
+ ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2);
+
+ ST_UB2(src0, src1, rp, 16);
+ rp += 32;
+ ST_UB(src2, rp);
+ rp += 16;
+ }
+ else
+ {
+ /* exactly 32 bytes left: two vectors. */
+ LD_UB2(rp, 16, src0, src1);
+ LD_UB2(pp, 16, src4, src5);
+
+ ADD2(src0, src4, src1, src5, src0, src1);
+
+ ST_UB2(src0, src1, rp, 16);
+ rp += 32;
+ }
+ }
+ else if (cnt16 && cnt)
+ {
+ /* 17..31 bytes left: two vectors. */
+ LD_UB2(rp, 16, src0, src1);
+ LD_UB2(pp, 16, src4, src5);
+
+ ADD2(src0, src4, src1, src5, src0, src1);
+
+ ST_UB2(src0, src1, rp, 16);
+ rp += 32;
+ }
+ else if (cnt16 || cnt)
+ {
+ /* 1..16 bytes left: one vector. */
+ src0 = LD_UB(rp);
+ src4 = LD_UB(pp);
+ pp += 16;
+
+ src0 += src4;
+
+ ST_UB(src0, rp);
+ rp += 16;
+ }
+ }
+}
+
+/* 'Sub' filter, 4 bytes/pixel: row[i] = row[i] + row[i - 4] (mod 256).
+ * The inter-pixel carry is kept in a vector: each 16-byte chunk is split
+ * into four 4-byte pixels (via sldi), prefix-summed, then recombined with
+ * ILVEV/PCKEV.  prev_row is unused (the Sub filter never needs it).
+ */
+void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ size_t count;
+ size_t istop = row_info->rowbytes;
+ png_bytep src = row;
+ png_bytep nxt = row + 4;
+ int32_t inp0;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1;
+ v16u8 zero = { 0 };
+
+ istop -= 4;
+
+ /* Seed the carry with the first pixel (already unfiltered). */
+ inp0 = LW(src);
+ src += 4;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+
+ for (count = 0; count < istop; count += 16)
+ {
+ src1 = LD_UB(src);
+ src += 16;
+
+ /* Bring pixels 1..3 of the chunk into the low word of a vector. */
+ src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4);
+ src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8);
+ src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12);
+ /* Prefix chain: each pixel accumulates the previous decoded pixel. */
+ src1 += src0;
+ src2 += src1;
+ src3 += src2;
+ src4 += src3;
+ src0 = src4;
+ /* Gather the low word of each partial result back into one vector. */
+ ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1);
+ dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+
+ ST_UB(dst0, nxt);
+ nxt += 16;
+ }
+}
+
+/* 'Sub' filter, 3 bytes/pixel: row[i] = row[i] + row[i - 3] (mod 256).
+ * Same prefix-sum scheme as the 4-bpp variant, but 12 bytes (4 pixels) per
+ * iteration; mask0/mask1 re-pack the 3-byte pixels and the 12-byte result
+ * is stored as an 8-byte + 4-byte pair.
+ * NOTE(review): LW reads 4 bytes for the 3-byte seed pixel; the stray 4th
+ * byte never reaches the output because mask0 only selects lanes 0-2.
+ */
+void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ size_t count;
+ size_t istop = row_info->rowbytes;
+ png_bytep src = row;
+ png_bytep nxt = row + 3;
+ int64_t out0;
+ int32_t inp0, out1;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1;
+ v16u8 zero = { 0 };
+ v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
+
+ istop -= 3;
+
+ /* Seed the carry with the first pixel (already unfiltered). */
+ inp0 = LW(src);
+ src += 3;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+
+ for (count = 0; count < istop; count += 12)
+ {
+ src1 = LD_UB(src);
+ src += 12;
+
+ /* Bring pixels 1..3 of the chunk into the low bytes of a vector. */
+ src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3);
+ src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6);
+ src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9);
+ /* Prefix chain: each pixel accumulates the previous decoded pixel. */
+ src1 += src0;
+ src2 += src1;
+ src3 += src2;
+ src4 += src3;
+ src0 = src4;
+ /* Re-pack four 3-byte pixels into 12 contiguous bytes. */
+ VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1);
+ dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
+ out0 = __msa_copy_s_d((v2i64) dst0, 0);
+ out1 = __msa_copy_s_w((v4i32) dst0, 2);
+
+ /* Store 12 bytes as 8 + 4 (unaligned-safe SD/SW). */
+ SD(out0, nxt);
+ nxt += 8;
+ SW(out1, nxt);
+ nxt += 4;
+ }
+}
+
+/* 'Average' filter, 4 bytes/pixel:
+ *   row[i] = row[i] + ((row[i - 4] + prev_row[i]) >> 1)  (mod 256)
+ * __msa_ave_u_b computes the per-byte (a + b) >> 1.  The first pixel has
+ * no left neighbour, so its predictor is prev/2 (MSA_SRLI_B by 1).
+ */
+void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ size_t i;
+ png_bytep src = row;
+ png_bytep nxt = row;
+ png_const_bytep pp = prev_row;
+ size_t istop = row_info->rowbytes - 4;
+ int32_t inp0, inp1, out0;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
+ v16u8 zero = { 0 };
+
+ /* First pixel: predictor is prev_row/2 (left neighbour is zero). */
+ inp0 = LW(pp);
+ pp += 4;
+ inp1 = LW(src);
+ src += 4;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+ src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
+ src0 = (v16u8) MSA_SRLI_B(src0, 1);
+ src1 += src0;
+ out0 = __msa_copy_s_w((v4i32) src1, 0);
+ SW(out0, nxt);
+ nxt += 4;
+
+ /* 16 bytes (4 pixels) per iteration; src1 carries the last decoded
+  * pixel across iterations.
+  */
+ for (i = 0; i < istop; i += 16)
+ {
+ src2 = LD_UB(pp);
+ pp += 16;
+ src6 = LD_UB(src);
+ src += 16;
+
+ SLDI_B2_0_UB(src2, src6, src3, src7, 4);
+ SLDI_B2_0_UB(src2, src6, src4, src8, 8);
+ SLDI_B2_0_UB(src2, src6, src5, src9, 12);
+ src2 = __msa_ave_u_b(src2, src1);
+ src6 += src2;
+ src3 = __msa_ave_u_b(src3, src6);
+ src7 += src3;
+ src4 = __msa_ave_u_b(src4, src7);
+ src8 += src4;
+ src5 = __msa_ave_u_b(src5, src8);
+ src9 += src5;
+ src1 = src9;
+ ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1);
+ dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+
+ ST_UB(dst0, nxt);
+ nxt += 16;
+ }
+}
+
+/* 'Average' filter, 3 bytes/pixel:
+ *   row[i] = row[i] + ((row[i - 3] + prev_row[i]) >> 1)  (mod 256)
+ * Same structure as the 4-bpp variant but 12 bytes (4 pixels) per
+ * iteration; mask0/mask1 re-pack the 3-byte pixels for the 8+4-byte store.
+ */
+void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ size_t i;
+ png_bytep src = row;
+ png_bytep nxt = row;
+ png_const_bytep pp = prev_row;
+ size_t istop = row_info->rowbytes - 3;
+ int64_t out0;
+ int32_t inp0, inp1, out1;
+ int16_t out2;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
+ v16u8 zero = { 0 };
+ v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
+
+ /* First pixel: predictor is prev_row/2; store its 3 bytes as 2 + 1. */
+ inp0 = LW(pp);
+ pp += 3;
+ inp1 = LW(src);
+ src += 3;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+ src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
+ src0 = (v16u8) MSA_SRLI_B(src0, 1);
+ src1 += src0;
+ out2 = __msa_copy_s_h((v8i16) src1, 0);
+ SH(out2, nxt);
+ nxt += 2;
+ nxt[0] = src1[2];
+ nxt++;
+
+ /* 12 bytes (4 pixels) per iteration; src1 carries the last decoded
+  * pixel across iterations.
+  */
+ for (i = 0; i < istop; i += 12)
+ {
+ src2 = LD_UB(pp);
+ pp += 12;
+ src6 = LD_UB(src);
+ src += 12;
+
+ SLDI_B2_0_UB(src2, src6, src3, src7, 3);
+ SLDI_B2_0_UB(src2, src6, src4, src8, 6);
+ SLDI_B2_0_UB(src2, src6, src5, src9, 9);
+ src2 = __msa_ave_u_b(src2, src1);
+ src6 += src2;
+ src3 = __msa_ave_u_b(src3, src6);
+ src7 += src3;
+ src4 = __msa_ave_u_b(src4, src7);
+ src8 += src4;
+ src5 = __msa_ave_u_b(src5, src8);
+ src9 += src5;
+ src1 = src9;
+ VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1);
+ dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
+ out0 = __msa_copy_s_d((v2i64) dst0, 0);
+ out1 = __msa_copy_s_w((v4i32) dst0, 2);
+
+ SD(out0, nxt);
+ nxt += 8;
+ SW(out1, nxt);
+ nxt += 4;
+ }
+}
+
+/* 'Paeth' filter, 4 bytes/pixel: predictor is whichever of a (left),
+ * b (above) or c (above-left) is closest to p = a + b - c.  The distances
+ * are computed on widened halfwords (HSUB/ADD_ABS) and CMP_AND_SELECT does
+ * the three-way pick; four pixels are processed per iteration with the
+ * serial left-neighbour dependency carried in src1.
+ */
+void png_read_filter_row_paeth4_msa(png_row_infop row_info,
+ png_bytep row,
+ png_const_bytep prev_row)
+{
+ int32_t count, rp_end;
+ png_bytep nxt;
+ png_const_bytep prev_nxt;
+ int32_t inp0, inp1, res0;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 src10, src11, src12, src13, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v16u8 zero = { 0 };
+
+ nxt = row;
+ prev_nxt = prev_row;
+
+ /* First pixel: a = c = 0, so the predictor degenerates to b (above). */
+ inp0 = LW(nxt);
+ inp1 = LW(prev_nxt);
+ prev_nxt += 4;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+ src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
+
+ src1 += src0;
+ res0 = __msa_copy_s_w((v4i32) src1, 0);
+
+ SW(res0, nxt);
+ nxt += 4;
+
+ /* Remainder */
+ rp_end = row_info->rowbytes - 4;
+
+ for (count = 0; count < rp_end; count += 16)
+ {
+ /* prev_nxt = b (above), prev_row = c (above-left), nxt = x (raw). */
+ src2 = LD_UB(prev_nxt);
+ prev_nxt += 16;
+ src6 = LD_UB(prev_row);
+ prev_row += 16;
+ src10 = LD_UB(nxt);
+
+ SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4);
+ SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8);
+ SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12);
+ /* Pixel 0: distances then select; src10 becomes the decoded pixel. */
+ ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
+ ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
+ ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
+ ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
+ src1 = src13;
+ /* Recombine the four decoded pixels and store 16 bytes. */
+ ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1);
+ dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+
+ ST_UB(dst0, nxt);
+ nxt += 16;
+ }
+}
+
+/* 'Paeth' filter, 3 bytes/pixel: same scheme as the 4-bpp variant, but
+ * 12 bytes (4 pixels) per iteration and the decoded 3-byte pixels are
+ * re-packed with mask0/mask1 before the 8+4-byte store.
+ */
+void png_read_filter_row_paeth3_msa(png_row_infop row_info,
+ png_bytep row,
+ png_const_bytep prev_row)
+{
+ int32_t count, rp_end;
+ png_bytep nxt;
+ png_const_bytep prev_nxt;
+ int64_t out0;
+ int32_t inp0, inp1, out1;
+ int16_t out2;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
+ v16u8 src10, src11, src12, src13;
+ v8i16 vec0, vec1, vec2;
+ v16u8 zero = { 0 };
+ v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
+
+ nxt = row;
+ prev_nxt = prev_row;
+
+ /* First pixel: a = c = 0, predictor degenerates to b; store 2 + 1 bytes. */
+ inp0 = LW(nxt);
+ inp1 = LW(prev_nxt);
+ prev_nxt += 3;
+ src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
+ src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
+
+ src1 += src0;
+ out2 = __msa_copy_s_h((v8i16) src1, 0);
+
+ SH(out2, nxt);
+ nxt += 2;
+ nxt[0] = src1[2];
+ nxt++;
+
+ /* Remainder */
+ rp_end = row_info->rowbytes - 3;
+
+ for (count = 0; count < rp_end; count += 12)
+ {
+ /* prev_nxt = b (above), prev_row = c (above-left), nxt = x (raw). */
+ src2 = LD_UB(prev_nxt);
+ prev_nxt += 12;
+ src6 = LD_UB(prev_row);
+ prev_row += 12;
+ src10 = LD_UB(nxt);
+
+ SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3);
+ SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6);
+ SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9);
+ /* Four pixels in sequence, each feeding the next as left neighbour. */
+ ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
+ ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
+ ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
+ ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
+ HSUB_UB2_SH(vec0, vec1, vec0, vec1);
+ vec2 = vec0 + vec1;
+ ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
+ CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
+ src1 = src13;
+ /* Re-pack four 3-byte pixels into 12 contiguous bytes and store 8 + 4. */
+ VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1);
+ dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
+ out0 = __msa_copy_s_d((v2i64) dst0, 0);
+ out1 = __msa_copy_s_w((v4i32) dst0, 2);
+
+ SD(out0, nxt);
+ nxt += 8;
+ SW(out1, nxt);
+ nxt += 4;
+ }
+}
+
+#endif /* PNG_MIPS_MSA_OPT > 0 */
+#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */
+#endif /* READ */
--- /dev/null
+
+/* mips_init.c - MSA optimised filter functions
+ *
+ * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Written by Mandar Sahastrabuddhe, 2016.
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
+ * called.
+ */
+#define _POSIX_SOURCE 1
+
+#include <stdio.h>
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+#if PNG_MIPS_MSA_OPT > 0
+#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */
+/* WARNING: it is strongly recommended that you do not build libpng with
+ * run-time checks for CPU features if at all possible. In the case of the MIPS
+ * MSA instructions there is no processor-specific way of detecting the
+ * presence of the required support, therefore run-time detection is extremely
+ * OS specific.
+ *
+ * You may set the macro PNG_MIPS_MSA_FILE to the file name of a file
+ * containing a fragment of C source code which defines the png_have_msa
+ * function. There are a number of implementations in contrib/mips-msa,
+ * but the only one that has partial support is contrib/mips-msa/linux.c -
+ * a generic Linux implementation which reads /proc/cpuinfo.
+ */
+#ifndef PNG_MIPS_MSA_FILE
+# ifdef __linux__
+# define PNG_MIPS_MSA_FILE "contrib/mips-msa/linux.c"
+# endif
+#endif
+
+#ifdef PNG_MIPS_MSA_FILE
+
+#include <signal.h> /* for sig_atomic_t */
+static int png_have_msa(png_structp png_ptr);
+#include PNG_MIPS_MSA_FILE
+
+#else /* PNG_MIPS_MSA_FILE */
+# error "PNG_MIPS_MSA_FILE undefined: no support for run-time MIPS MSA checks"
+#endif /* PNG_MIPS_MSA_FILE */
+#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
+
+#ifndef PNG_ALIGNED_MEMORY_SUPPORTED
+# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED"
+#endif
+
+/* Install the MSA row-filter implementations into pp->read_filter[].
+ * bpp is bytes per pixel: only 3 and 4 get SUB/AVG/PAETH replacements;
+ * the UP filter is bpp-independent and is always replaced.
+ */
+void
+png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
+{
+ /* The switch statement is compiled in for MIPS_MSA_API, the call to
+ * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined
+ * the check is only performed if the API has not set the MSA option on
+ * or off explicitly. In this case the check controls what happens.
+ */
+
+#ifdef PNG_MIPS_MSA_API_SUPPORTED
+ switch ((pp->options >> PNG_MIPS_MSA) & 3)
+ {
+ case PNG_OPTION_UNSET:
+ /* Allow the run-time check to execute if it has been enabled -
+ * thus both API and CHECK can be turned on. If it isn't supported
+ * this case will fall through to the 'default' below, which just
+ * returns.
+ */
+#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED
+ {
+ /* Cache the probe result; sig_atomic_t keeps the read/write safe
+  * against interruption by a signal (png_have_msa may use signals).
+  */
+ static volatile sig_atomic_t no_msa = -1; /* not checked */
+
+ if (no_msa < 0)
+ no_msa = !png_have_msa(pp);
+
+ if (no_msa)
+ return;
+ }
+#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
+ break;
+
+ default: /* OFF or INVALID */
+ return;
+
+ case PNG_OPTION_ON:
+ /* Option turned on */
+ break;
+ }
+ /* IMPORTANT: any new external functions used here must be declared using
+ * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the
+ * 'prefix' option to configure works:
+ *
+ * ./configure --with-libpng-prefix=foobar_
+ *
+ * Verify you have got this right by running the above command, doing a build
+ * and examining pngprefix.h; it must contain a #define for every external
+ * function you add. (Notice that this happens automatically for the
+ * initialization function.)
+ */
+ pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_msa;
+
+ if (bpp == 3)
+ {
+ pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_msa;
+ pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa;
+ pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa;
+ }
+ else if (bpp == 4)
+ {
+ pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa;
+ pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
+ pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
+ }
+#else
+ (void)pp;
+ (void)bpp;
+#endif /* PNG_MIPS_MSA_API_SUPPORTED */
+}
+#endif /* PNG_MIPS_MSA_OPT > 0 */
+#endif /* READ */
static inline TYPE FUNC_NAME(const void* const psrc) { \
const uint8_t* const psrc_m = (const uint8_t*)psrc; \
TYPE val_m; \
- asm volatile ( \
+ __asm__ volatile ( \
"" #INSTR " %[val_m], %[psrc_m] \n\t" \
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m)); \
static inline void FUNC_NAME(TYPE val, void* const pdst) { \
uint8_t* const pdst_m = (uint8_t*)pdst; \
TYPE val_m = val; \
- asm volatile ( \
+ __asm__ volatile ( \
" " #INSTR " %[val_m], %[pdst_m] \n\t" \
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m)); \
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CEL;AVX512_ICL")
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
ocv_update(CPU_FP16_IMPLIES "NEON")
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
+elseif(MIPS)
+ ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp")
+ ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA")
+ ocv_update(CPU_MSA_FLAGS_ON "-mmsa")
+ set(CPU_BASELINE "MSA" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(PPC64LE)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3")
ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
set(PPC64LE 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
set(PPC64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
+ set(MIPS 1)
endif()
# Workaround for 32-bit operating systems on x86_64/aarch64 processor
--- /dev/null
+// Compile-time feature probe used by OpenCVCompilerOptimizations.cmake:
+// this translation unit builds only when the compiler targets MIPS MSA
+// (__mips_msa is predefined, e.g. with -mmsa); otherwise the #error fires
+// and the cmake check fails.
+#include <stdio.h>
+
+#if defined(__mips_msa)
+# include <msa.h>
+# define CV_MSA 1
+#endif
+
+#if defined CV_MSA
+// Exercise a representative load / float-to-int / extract sequence so the
+// probe fails if the intrinsics are unavailable, not just the macro.
+int test()
+{
+ const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
+ v4f32 val = (v4f32)__msa_ld_w((const float*)(src), 0);
+ return __msa_copy_s_w(__builtin_msa_ftint_s_w (val), 0);
+}
+#else
+#error "MSA is not supported"
+#endif
+
+int main()
+{
+ printf("%d\n", test());
+ return 0;
+}
# define CV_VSX3 1
#endif
+#ifdef CV_CPU_COMPILE_MSA
+# include "hal/msa_macros.h"
+# define CV_MSA 1
+#endif
+
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
#ifndef CV_VSX3
# define CV_VSX3 0
#endif
+
+#ifndef CV_MSA
+# define CV_MSA 0
+#endif
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+/* MSA dispatch glue, mirroring the NEON/VSX blocks above:
+ * - baseline build (CV_CPU_COMPILE_MSA): MSA is unconditional, calls go
+ *   straight to cpu_baseline::fn;
+ * - dispatched build (CV_CPU_DISPATCH_COMPILE_MSA): calls are guarded by a
+ *   runtime checkHardwareSupport(CV_CPU_MSA) test and routed to opt_MSA::fn;
+ * - otherwise every MSA hook compiles to nothing.
+ */
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
+# define CV_TRY_MSA 1
+# define CV_CPU_FORCE_MSA 1
+# define CV_CPU_HAS_SUPPORT_MSA 1
+# define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args)
+# define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA
+# define CV_TRY_MSA 1
+# define CV_CPU_FORCE_MSA 0
+# define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA))
+# define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+# define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+#else
+# define CV_TRY_MSA 0
+# define CV_CPU_FORCE_MSA 0
+# define CV_CPU_HAS_SUPPORT_MSA 0
+# define CV_CPU_CALL_MSA(fn, args)
+# define CV_CPU_CALL_MSA_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...) CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
# define CV_TRY_VSX 1
# define CV_CPU_FORCE_VSX 1
#define CV_CPU_NEON 100
+#define CV_CPU_MSA 150
+
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
CPU_NEON = 100,
+ CPU_MSA = 150,
+
CPU_VSX = 200,
CPU_VSX3 = 201,
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
+# undef CV_MSA
#endif
-#if CV_SSE2 || CV_NEON || CV_VSX
+#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#include "opencv2/core/hal/intrin_vsx.hpp"
+#elif CV_MSA
+
+#include "opencv2/core/hal/intrin_msa.hpp"
+
#else
#define CV_SIMD128_CPP 1
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_MSA_HPP
+#define OPENCV_HAL_INTRIN_MSA_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+
+// MSA implements 128-bit wide vector registers that are shared with the 64-bit wide
+// floating-point unit registers. MSA and the FPU cannot both be present unless the
+// FPU has 64-bit floating-point registers.
+#define CV_SIMD128_64F 1
+
+// 128-bit vector of 16 unsigned 8-bit lanes, wrapping the MSA v16u8 register type.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    // Default: all lanes zero.
+    v_uint8x16() : val(msa_dupq_n_u8(0)) {}
+    explicit v_uint8x16(v16u8 v) : val(v) {}
+    // Lane-wise constructor: values are staged in a stack array and loaded as one vector.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_u8(v);
+    }
+    // Returns lane 0.
+    uchar get0() const
+    {
+        return msa_getq_lane_u8(val, 0);
+    }
+
+    v16u8 val;
+};
+
+// 128-bit vector of 16 signed 8-bit lanes, wrapping the MSA v16i8 register type.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    // Default: all lanes zero.
+    v_int8x16() : val(msa_dupq_n_s8(0)) {}
+    explicit v_int8x16(v16i8 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_s8(v);
+    }
+    // Returns lane 0.
+    schar get0() const
+    {
+        return msa_getq_lane_s8(val, 0);
+    }
+
+    v16i8 val;
+};
+
+// 128-bit vector of 8 unsigned 16-bit lanes, wrapping the MSA v8u16 register type.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    // Default: all lanes zero.
+    v_uint16x8() : val(msa_dupq_n_u16(0)) {}
+    explicit v_uint16x8(v8u16 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_u16(v);
+    }
+    // Returns lane 0.
+    ushort get0() const
+    {
+        return msa_getq_lane_u16(val, 0);
+    }
+
+    v8u16 val;
+};
+
+// 128-bit vector of 8 signed 16-bit lanes, wrapping the MSA v8i16 register type.
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    // Default: all lanes zero.
+    v_int16x8() : val(msa_dupq_n_s16(0)) {}
+    explicit v_int16x8(v8i16 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_s16(v);
+    }
+    // Returns lane 0.
+    short get0() const
+    {
+        return msa_getq_lane_s16(val, 0);
+    }
+
+    v8i16 val;
+};
+
+// 128-bit vector of 4 unsigned 32-bit lanes, wrapping the MSA v4u32 register type.
+struct v_uint32x4
+{
+    typedef unsigned int lane_type;
+    enum { nlanes = 4 };
+
+    // Default: all lanes zero.
+    v_uint32x4() : val(msa_dupq_n_u32(0)) {}
+    explicit v_uint32x4(v4u32 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
+    {
+        unsigned int v[] = {v0, v1, v2, v3};
+        val = msa_ld1q_u32(v);
+    }
+    // Returns lane 0.
+    unsigned int get0() const
+    {
+        return msa_getq_lane_u32(val, 0);
+    }
+
+    v4u32 val;
+};
+
+// 128-bit vector of 4 signed 32-bit lanes, wrapping the MSA v4i32 register type.
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    // Default: all lanes zero.
+    v_int32x4() : val(msa_dupq_n_s32(0)) {}
+    explicit v_int32x4(v4i32 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        val = msa_ld1q_s32(v);
+    }
+    // Returns lane 0.
+    int get0() const
+    {
+        return msa_getq_lane_s32(val, 0);
+    }
+    v4i32 val;
+};
+
+// 128-bit vector of 4 single-precision float lanes, wrapping the MSA v4f32 register type.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    // Default: all lanes zero.
+    v_float32x4() : val(msa_dupq_n_f32(0.0f)) {}
+    explicit v_float32x4(v4f32 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        val = msa_ld1q_f32(v);
+    }
+    // Returns lane 0.
+    float get0() const
+    {
+        return msa_getq_lane_f32(val, 0);
+    }
+    v4f32 val;
+};
+
+// 128-bit vector of 2 unsigned 64-bit lanes, wrapping the MSA v2u64 register type.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    // Default: all lanes zero.
+    v_uint64x2() : val(msa_dupq_n_u64(0)) {}
+    explicit v_uint64x2(v2u64 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 v[] = {v0, v1};
+        val = msa_ld1q_u64(v);
+    }
+    // Returns lane 0.
+    uint64 get0() const
+    {
+        return msa_getq_lane_u64(val, 0);
+    }
+    v2u64 val;
+};
+
+// 128-bit vector of 2 signed 64-bit lanes, wrapping the MSA v2i64 register type.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    // Default: all lanes zero.
+    v_int64x2() : val(msa_dupq_n_s64(0)) {}
+    explicit v_int64x2(v2i64 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 v[] = {v0, v1};
+        val = msa_ld1q_s64(v);
+    }
+    // Returns lane 0.
+    int64 get0() const
+    {
+        return msa_getq_lane_s64(val, 0);
+    }
+    v2i64 val;
+};
+
+// 128-bit vector of 2 double-precision float lanes, wrapping the MSA v2f64 register type.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    // Default: all lanes zero. Use a double literal (0.0, not 0.0f) so the
+    // argument already matches the f64 lane type, consistent with the other
+    // v_* default constructors whose literal matches their lane type.
+    v_float64x2() : val(msa_dupq_n_f64(0.0)) {}
+    explicit v_float64x2(v2f64 v) : val(v) {}
+    // Lane-wise constructor via a staged stack array.
+    v_float64x2(double v0, double v1)
+    {
+        double v[] = {v0, v1};
+        val = msa_ld1q_f64(v);
+    }
+    // Returns lane 0.
+    double get0() const
+    {
+        return msa_getq_lane_f64(val, 0);
+    }
+    v2f64 val;
+};
+
+// Per-type boilerplate: v_setzero_/v_setall_ constructors plus bit-exact
+// reinterpret casts to every other register type (MSA_TPV_REINTERPRET does
+// the raw type punning; no lane values are converted).
+#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
+
+// Instantiate for all ten register types.
+OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
+OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
+
+
+// Narrowing pack of two wide vectors into one: v_pack/v_pack_u take the plain
+// narrowing intrinsic (mov); v_rshr_pack* additionally apply a rounding right
+// shift by the compile-time constant n before narrowing (rshr).
+#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(mov(a.val, b.val)); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(rshr(a.val, b.val, n)); \
+}
+
+// q-prefixed intrinsics saturate; the 64->32 bit packs are non-saturating.
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16)
+OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32)
+OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64)
+OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32)
+
+
+// Narrow a single wide vector to a half-width register (hreg) and store it to
+// memory; the rshr variant applies a rounding right shift by n first.
+#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = mov(a.val); \
+    msa_st1_##suffix(ptr, a1); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    msa_st1_##suffix(ptr, a1); \
+}
+
+// q-prefixed intrinsics saturate; movn/rshrn (64->32) truncate.
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)
+
+
+// pack boolean
+// Pack wide boolean (mask) vectors down to a 16-lane byte mask by repeated
+// non-saturating narrowing packs: 16->8, 32->16->8, and 64->32->16->8 bits.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint8x16(msa_pack_u16(a.val, b.val));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // Two-level reduction: 8 x v2u64 -> 2 x v8u16 -> 1 x v16u8.
+    v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
+    v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
+    return v_uint8x16(msa_pack_u16(abcd, efgh));
+}
+
+
+// 4x4 matrix * vector product: res = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3],
+// built from one lane-broadcast multiply and three lane-broadcast multiply-accumulates.
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    v4f32 v0 = v.val;
+    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
+    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
+    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
+    res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
+    return v_float32x4(res);
+}
+
+// Affine variant: res = m0*v[0] + m1*v[1] + m2*v[2] + a (the fourth matrix
+// column is replaced by a plain vector add of the bias 'a').
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    v4f32 v0 = v.val;
+    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
+    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
+    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
+    res = msa_addq_f32(res, a.val);
+    return v_float32x4(res);
+}
+
+
+// Generates operator OP and operator OP= for one vector type from a single
+// two-operand intrinsic.
+#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+// 8/16-bit +/- saturate (q-prefixed intrinsics); 32/64-bit and float ops wrap
+// or follow IEEE semantics as usual for the universal-intrinsics contract.
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
+
+
+// saturating multiply 8-bit, 16-bit
+// Implemented by widening (v_mul_expand) then saturating pack back down,
+// since there is no direct saturating multiply.
+#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpwvec c, d; \
+    v_mul_expand(a, b, c, d); \
+    return v_pack(c, d); \
+} \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+{a = a * b; return a; }
+
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4)
+
+
+// Multiply and expand
+// Each overload multiplies a and b lane-wise into double-width results: c gets
+// the products of the low half, d of the high half. Widening of each operand
+// is done by interleaving with zero (ILVRL_*) and then pairwise-add-long
+// (msa_paddlq_*), after which the multiply runs at the wider lane width.
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    v16i8 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
+    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
+    c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
+    d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    v16u8 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
+    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
+    c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
+    d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    v8i16 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
+    c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
+    d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    v8u16 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
+    c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
+    d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    v4u32 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
+    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
+    c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
+    d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
+}
+
+
+// High half of the 16x16->32 bit lane-wise product: widen both operands,
+// multiply at 32 bits, then pack back taking bits [31:16] of each product
+// (the right-shift-by-16 pack).
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    v8i16 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
+
+    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
+                                   msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
+}
+
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v8u16 a_lo, a_hi, b_lo, b_hi;
+
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
+
+    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
+                                    msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
+}
+
+
+// Dot product of adjacent 16-bit lane pairs into 32-bit lanes; the second
+// overload accumulates the result onto c.
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int32x4(msa_dotp_s_w(a.val, b.val));
+}
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val));
+}
+
+
+// Bitwise &, |, ^ (via OPENCV_HAL_IMPL_MSA_BIN_OP) plus bitwise NOT; NOT is
+// performed at u8 granularity and the result reinterpreted back, which is
+// bit-exact for any lane width.
+#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
+}
+
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
+
+
+// Bitwise ops on float vectors: reinterpret to s32, apply the integer
+// intrinsic, reinterpret back (no value conversion, pure bit manipulation).
+#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
+
+// Bitwise NOT for float vectors, same reinterpret trick.
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+
+/* v_abs */
+// Integer absolute value; result is returned as the unsigned counterpart type
+// (bit-reinterpreted, matching the universal-intrinsics convention).
+#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) \
+{ \
+    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8)
+OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16)
+OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+/* v_abs(float), v_sqrt, v_invsqrt */
+// One-input wrappers mapping straight onto a single intrinsic.
+#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a) \
+{ \
+    return _Tpvec(intrin(a.val)); \
+}
+
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
+
+
+// Bitwise ops for double vectors, analogous to the f32 variants: reinterpret
+// to s64, run the integer intrinsic, reinterpret back.
+#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
+
+// Bitwise NOT; done at s32 granularity, which is bit-exact for 64-bit lanes too.
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+
+// TODO: exp, log, sin, cos
+
+// Two-input named function mapping straight onto a single intrinsic;
+// used below for min/max, wrap arithmetic and absolute difference.
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+// Lane-wise min/max for all element types.
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
+
+
+// Lane-wise comparisons producing all-ones/all-zeros masks in the operand
+// type. != is built as NOT(==) using msa_mvnq at the unsigned width given by
+// not_suffix (needed because float types have no direct integer NOT).
+#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
+
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
+
+// Mask of lanes that are not NaN: x == x is false only for NaN.
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
+
+
+// Wrapping (non-saturating) add/sub/mul for 8/16-bit lanes, as required by
+// the v_*_wrap contract (the saturating versions are the operators above).
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
+
+// Absolute difference; unsigned and float types keep their own type.
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)
+
+/** Saturating absolute difference **/
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)
+
+// Variant whose result type differs from the operand type (signed absdiff
+// yields the unsigned counterpart, reinterpreted bit-exactly).
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)
+
+
+/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */
+// sqrt(a*a + b*b), built from multiply + multiply-accumulate + v_sqrt.
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
+    return v_sqrt(x);
+}
+
+// a*a + b*b, without the square root.
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
+}
+
+// a*b + c via the multiply-accumulate intrinsic.
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
+}
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
+}
+
+// v_muladd is an alias for v_fma on this backend.
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+// Double-precision counterparts of the functions above.
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
+    return v_sqrt(x);
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_fma(a, b, c);
+}
+
+
+// trade efficiency for convenience
+// Runtime-count shifts (operator <</>>) broadcast the scalar count into a
+// vector first; the templated v_shl/v_shr/v_rshr take a compile-time count
+// and use the immediate-form intrinsics (v_rshr rounds before shifting).
+#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }
+
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+
+/* v_rotate_right, v_rotate_left */
+// Lane rotation built on msa_extq_* (extract a window spanning two vectors).
+// The one-argument forms shift zeros in; the two-argument forms shift in lanes
+// from b. The explicit <0> specializations of v_rotate_left return the input
+// unchanged, avoiding a call to msa_extq_* with index nlanes - 0 == nlanes.
+#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ \
+    return a; \
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    CV_UNUSED(b); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
+
+
+// Load/store family. MSA loads/stores have no alignment distinction here, so
+// aligned/unaligned/nocache all map to the same ld1q/st1q intrinsic.
+// v_store_low/v_store_high copy lanes with a scalar loop via vector
+// subscripting (a.val[i], the GCC vector-extension element access).
+#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    int n = _Tpvec::nlanes; \
+    for( int i = 0; i < (n/2); i++ ) \
+        ptr[i] = a.val[i]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    int n = _Tpvec::nlanes; \
+    for( int i = 0; i < (n/2); i++ ) \
+        ptr[i] = a.val[i+(n/2)]; \
+}
+
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
+inline unsigned short v_reduce_##func(const v_uint16x8& a) \
+{ \
+ v8u16 a_lo, a_hi; \
+ ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
+ v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
+ v4u32 b_lo, b_hi; \
+ ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
+ v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
+ return (unsigned short)cfunc(c[0], c[1]); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min)
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
+inline short v_reduce_##func(const v_int16x8& a) \
+{ \
+ v8i16 a_lo, a_hi; \
+ ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
+ v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
+ v4i32 b_lo, b_hi; \
+ ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
+ v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
+ return (short)cfunc(c[0], c[1]); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min)
+
+// Horizontal min/max over 4 lanes: done purely with scalar lane accesses
+// (a.val[i]) and the scalar cfunc, arranged as a 2-level tournament.
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+ return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+// Horizontal sum of all lanes, delegated to the per-type msa_sum_* helper.
+// NOTE(review): the result is cast to the element's scalar type, so 8/16-bit
+// sums truncate to that width (matches the helper's return convention).
+#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+ return (scalartype)msa_sum_##suffix(a.val); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
+
+// 64-bit and double types have only two lanes: extract both and add.
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
+inline double v_reduce_sum(const v_float64x2& a)
+{
+ return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
+}
+
+/* v_reduce_sum4, v_reduce_sad */
+// Returns {sum(a), sum(b), sum(c), sum(d)} in one vector.
+// Implemented as two levels of even/odd interleave + add: the first level
+// produces pairwise sums of (a,b) and (c,d); the second level combines the
+// 64-bit halves to finish each 4-lane sum.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+ const v_float32x4& c, const v_float32x4& d)
+{
+ v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
+ MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3
+ v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
+ MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3
+
+ return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
+ MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
+}
+
+// Sum of absolute differences: per-lane |a - b| (msa_abdq_*), widened with
+// pairwise add-long (msa_paddlq_*) down to u32 lanes, then horizontally
+// summed. Signed inputs reinterpret the absolute-difference result as
+// unsigned before widening.
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+ v16u8 t0 = msa_abdq_u8(a.val, b.val);
+ v8u16 t1 = msa_paddlq_u8(t0);
+ v4u32 t2 = msa_paddlq_u16(t1);
+ return msa_sum_u32(t2);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+ v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
+ v8u16 t1 = msa_paddlq_u8(t0);
+ v4u32 t2 = msa_paddlq_u16(t1);
+ return msa_sum_u32(t2);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+ v8u16 t0 = msa_abdq_u16(a.val, b.val);
+ v4u32 t1 = msa_paddlq_u16(t0);
+ return msa_sum_u32(t1);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+ v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
+ v4u32 t1 = msa_paddlq_u16(t0);
+ return msa_sum_u32(t1);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+ v4u32 t0 = msa_abdq_u32(a.val, b.val);
+ return msa_sum_u32(t0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+ v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
+ return msa_sum_u32(t0);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+ v4f32 t0 = msa_abdq_f32(a.val, b.val);
+ return msa_sum_f32(t0);
+}
+
+/* v_popcount */
+// Per-lane population count, always returned as the unsigned vector of the
+// same lane width. All element sizes funnel through the signed msa_cntq_s*
+// intrinsic via reinterprets (bit counting is sign-agnostic).
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
+inline v_uint8x16 v_popcount(const _Tpvec& a) \
+{ \
+ v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
+ return v_uint8x16(t); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
+inline v_uint16x8 v_popcount(const _Tpvec& a) \
+{ \
+ v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
+ return v_uint16x8(t); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
+inline v_uint32x4 v_popcount(const _Tpvec& a) \
+{ \
+ v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
+ return v_uint32x4(t); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
+inline v_uint64x2 v_popcount(const _Tpvec& a) \
+{ \
+ v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
+ return v_uint64x2(t); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
+
+// v_signmask: pack the most-significant (sign) bit of every lane into an
+// int bitmask, lane 0 -> bit 0. Scheme: shift each lane's MSB down to bit 0,
+// shift it back up by the lane's index (per-lane shift amounts come from the
+// m0 constant), then accumulate with pairwise add-longs and combine the two
+// 64-bit halves at the end.
+inline int v_signmask(const v_uint8x16& a)
+{
+ v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
+ v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
+ v8u16 v1 = msa_paddlq_u8(v0);
+ v4u32 v2 = msa_paddlq_u16(v1);
+ v2u64 v3 = msa_paddlq_u32(v2);
+ return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
+}
+inline int v_signmask(const v_int8x16& a)
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_uint16x8& a)
+{
+ v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
+ v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
+ v4u32 v1 = msa_paddlq_u16(v0);
+ v2u64 v2 = msa_paddlq_u32(v1);
+ return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
+}
+inline int v_signmask(const v_int16x8& a)
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+
+inline int v_signmask(const v_uint32x4& a)
+{
+ v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
+ v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
+ v2u64 v1 = msa_paddlq_u32(v0);
+ return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
+}
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+
+// 64-bit lanes: only two bits, so the lanes are read out directly.
+inline int v_signmask(const v_uint64x2& a)
+{
+ v2u64 v0 = msa_shrq_n_u64(a.val, 63);
+ return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
+}
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+
+// v_scan_forward: index of the first lane whose sign bit is set, computed
+// as the number of trailing zeros in the corresponding signmask.
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+
+// v_check_all: true if every lane's sign bit is set. Inverts the vector,
+// shifts the (now complemented) sign bit down, and succeeds only when both
+// 64-bit halves are fully zero.
+// v_check_any: true if at least one lane's sign bit is set (no inversion;
+// any surviving bit makes the OR non-zero).
+#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ \
+ _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
+ v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
+ return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ \
+ _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
+ v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
+ return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
+}
+
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
+
+// Signed and floating-point overloads simply reuse the unsigned versions,
+// since only the raw sign bits are inspected.
+inline bool v_check_all(const v_int8x16& a)
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_any(const v_int8x16& a)
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_all(const v_int64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+
+/* v_select */
+// Bitwise select: result bit = mask ? a : b, implemented on top of the
+// byte-wise msa_bslq_u8 with reinterprets, so it works for any lane type.
+// Note the operand order passed to bslq is (mask, b, a).
+#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
+ MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
+}
+
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
+
+// Widening conversions: expand each lane to double width. Uses the same
+// zero-interleave + pairwise-add-long widening trick as the reductions above;
+// v_load_expand instead loads a 64-bit half-vector and widens via msa_movl.
+#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+ _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+ _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+ b0.val = msa_paddlq_##suffix(a_lo); \
+ b1.val = msa_paddlq_##suffix(a_hi); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+ _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+ return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+ _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+ return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+ return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
+
+// Quad expansion: load four 8-bit values and widen each straight to 32 bits
+// via a brace-initialized vector literal (scalar loads, no shuffles).
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+ return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+ return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+/* v_zip, v_combine_low, v_combine_high, v_recombine */
+// v_zip interleaves lanes of a0/a1 (right half -> b0, left half -> b1);
+// v_combine_low/high and v_recombine splice 64-bit halves of two vectors
+// using the s64 interleave intrinsics regardless of lane type.
+#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+ b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+ b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+ c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+ d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
+
+/* v_extract */
+// Concatenate a:b and extract 16 bytes starting at lane offset s
+// (compile-time template parameter forwarded to msa_extq_*).
+#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
+
+/* v_round, v_floor, v_ceil, v_trunc */
+// Float -> int conversions. floor/ceil start from the round-to-nearest
+// result and correct it by comparing the round-tripped value against the
+// input: the comparison mask is all-ones (-1) on mismatching lanes, so
+// adding/subtracting it nudges the result down/up by one.
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+ return v_int32x4(msa_cvttintq_s32_f32(a.val));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+ v4i32 a1 = msa_cvttintq_s32_f32(a.val);
+ return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+ v4i32 a1 = msa_cvttintq_s32_f32(a.val);
+ return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+ return v_int32x4(msa_cvttruncq_s32_f32(a.val));
+}
+
+// Double variants: convert the two f64 lanes to s64, then narrow into the
+// low half of an s32 vector (upper half zeroed unless a second operand is
+// supplied).
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+ return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+ return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+ v2f64 a1 = msa_cvtrintq_f64(a.val);
+ return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+ v2f64 a1 = msa_cvtrintq_f64(a.val);
+ return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+ return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
+}
+
+// 4x4 matrix transpose of four 32-bit-lane vectors: a classic two-stage
+// shuffle — 32-bit interleaves pair up rows, then 64-bit interleaves
+// assemble the transposed columns.
+#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+ const _Tpvec& a2, const _Tpvec& a3, \
+ _Tpvec& b0, _Tpvec& b1, \
+ _Tpvec& b2, _Tpvec& b3) \
+{ \
+ _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+ _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+ _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
+ _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
+ b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
+ b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
+ b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
+ b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
+}
+
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
+
+// De-interleaving loads (AoS -> 2/3/4 planes) and interleaving stores,
+// thin wrappers over msa_ld2q/ld3q/ld4q and msa_st2q/st3q/st4q.
+// The hal::StoreMode argument is accepted for API compatibility but ignored.
+#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+ msa_ld2q_##suffix(ptr, &a.val, &b.val); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+ msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+ v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+ msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+ msa_st2q_##suffix(ptr, a.val, b.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+ const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+ msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+ const v_##_Tpvec& c, const v_##_Tpvec& d, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+ msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
+}
+
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
+
+/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
+// Conversions between int32/float32/float64 vectors. The single-operand
+// f64->f32 form fills the upper two result lanes from a zero vector.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+ return v_float32x4(msa_cvtfintq_f32_s32(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+ return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+ return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
+}
+
+// int32 -> float64 goes through float32 as an intermediate; "low"/"high"
+// select which half of the 4-lane source is widened.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+ return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+ return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+ return v_float64x2(msa_cvtflq_f64_f32(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+ return v_float64x2(msa_cvtfhq_f64_f32(a.val));
+}
+
+////////////// Lookup table access ////////////////////
+// Gather operations: build an aligned scalar staging array from tab[idx[i]]
+// (plus 0/1/3 sequential neighbours for pairs/quads) and load it as one
+// vector. MSA has no gather instruction, so this is done lane by lane.
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+ schar CV_DECL_ALIGNED(32) elems[16] =
+ {
+ tab[idx[ 0]],
+ tab[idx[ 1]],
+ tab[idx[ 2]],
+ tab[idx[ 3]],
+ tab[idx[ 4]],
+ tab[idx[ 5]],
+ tab[idx[ 6]],
+ tab[idx[ 7]],
+ tab[idx[ 8]],
+ tab[idx[ 9]],
+ tab[idx[10]],
+ tab[idx[11]],
+ tab[idx[12]],
+ tab[idx[13]],
+ tab[idx[14]],
+ tab[idx[15]]
+ };
+ return v_int8x16(msa_ld1q_s8(elems));
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+ schar CV_DECL_ALIGNED(32) elems[16] =
+ {
+ tab[idx[0]],
+ tab[idx[0] + 1],
+ tab[idx[1]],
+ tab[idx[1] + 1],
+ tab[idx[2]],
+ tab[idx[2] + 1],
+ tab[idx[3]],
+ tab[idx[3] + 1],
+ tab[idx[4]],
+ tab[idx[4] + 1],
+ tab[idx[5]],
+ tab[idx[5] + 1],
+ tab[idx[6]],
+ tab[idx[6] + 1],
+ tab[idx[7]],
+ tab[idx[7] + 1]
+ };
+ return v_int8x16(msa_ld1q_s8(elems));
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+ schar CV_DECL_ALIGNED(32) elems[16] =
+ {
+ tab[idx[0]],
+ tab[idx[0] + 1],
+ tab[idx[0] + 2],
+ tab[idx[0] + 3],
+ tab[idx[1]],
+ tab[idx[1] + 1],
+ tab[idx[1] + 2],
+ tab[idx[1] + 3],
+ tab[idx[2]],
+ tab[idx[2] + 1],
+ tab[idx[2] + 2],
+ tab[idx[2] + 3],
+ tab[idx[3]],
+ tab[idx[3] + 1],
+ tab[idx[3] + 2],
+ tab[idx[3] + 3]
+ };
+ return v_int8x16(msa_ld1q_s8(elems));
+}
+// Unsigned overloads reuse the signed implementations via reinterpret.
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
+
+
+// 16-bit gathers: scalar staging array for singles/pairs; quads can use two
+// contiguous 64-bit loads combined into one vector.
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+ short CV_DECL_ALIGNED(32) elems[8] =
+ {
+ tab[idx[0]],
+ tab[idx[1]],
+ tab[idx[2]],
+ tab[idx[3]],
+ tab[idx[4]],
+ tab[idx[5]],
+ tab[idx[6]],
+ tab[idx[7]]
+ };
+ return v_int16x8(msa_ld1q_s16(elems));
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+ short CV_DECL_ALIGNED(32) elems[8] =
+ {
+ tab[idx[0]],
+ tab[idx[0] + 1],
+ tab[idx[1]],
+ tab[idx[1] + 1],
+ tab[idx[2]],
+ tab[idx[2] + 1],
+ tab[idx[3]],
+ tab[idx[3] + 1]
+ };
+ return v_int16x8(msa_ld1q_s16(elems));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+ return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
+
+// 32-bit gathers: singles via staging array, pairs via two 64-bit loads,
+// quads degenerate to one contiguous 128-bit load at tab + idx[0].
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+ int CV_DECL_ALIGNED(32) elems[4] =
+ {
+ tab[idx[0]],
+ tab[idx[1]],
+ tab[idx[2]],
+ tab[idx[3]]
+ };
+ return v_int32x4(msa_ld1q_s32(elems));
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{
+ return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{
+ return v_int32x4(msa_ld1q_s32(tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
+
+// 64-bit gathers: only two lanes, so singles combine two scalar loads and
+// pairs are a single contiguous 128-bit load.
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+ return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{
+ return v_int64x2(msa_ld1q_s64(tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+// Float gathers. v_lut_pairs copies each adjacent float pair as one 64-bit
+// word through a uint64 staging array, then reinterprets the load as f32x4.
+// NOTE(review): the *(uint64*) pair copy assumes tab+idx[i] is 8-byte
+// readable as a uint64 — standard trick here, but strictly type-punning.
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+ float CV_DECL_ALIGNED(32) elems[4] =
+ {
+ tab[idx[0]],
+ tab[idx[1]],
+ tab[idx[2]],
+ tab[idx[3]]
+ };
+ return v_float32x4(msa_ld1q_f32(elems));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+ uint64 CV_DECL_ALIGNED(32) elems[2] =
+ {
+ *(uint64*)(tab + idx[0]),
+ *(uint64*)(tab + idx[1])
+ };
+ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+ return v_float32x4(msa_ld1q_f32(tab + idx[0]));
+}
+
+// Vector-indexed gathers: indices arrive in a v_int32x4 and are spilled to
+// an aligned scalar array (or read lane-by-lane) before the table lookups.
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+ unsigned CV_DECL_ALIGNED(32) elems[4] =
+ {
+ tab[msa_getq_lane_s32(idxvec.val, 0)],
+ tab[msa_getq_lane_s32(idxvec.val, 1)],
+ tab[msa_getq_lane_s32(idxvec.val, 2)],
+ tab[msa_getq_lane_s32(idxvec.val, 3)]
+ };
+ return v_uint32x4(msa_ld1q_u32(elems));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+
+// Gather interleaved (x, y) float pairs at tab+idx[i] and split them into
+// separate x/y vectors with even/odd interleaves.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
+ v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
+ x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
+ y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
+}
+
+// Lane permutations via the vshf (vector shuffle) instruction: the first
+// operand is a compile-time index pattern (one index per destination lane,
+// little-endian packed into 64-bit literals) selecting lanes of vec.val.
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+ v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
+ return c;
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+ v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
+ return c;
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+ v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
+ return c;
+}
+
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+ v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
+ return c;
+}
+
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+// 32-bit pairs permutation (0,2,1,3) is cheaper with direct lane moves.
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+ v_int32x4 c;
+ c.val[0] = vec.val[0];
+ c.val[1] = vec.val[2];
+ c.val[2] = vec.val[1];
+ c.val[3] = vec.val[3];
+ return c;
+}
+
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+// Compact groups of 4 lanes into groups of 3 (drop every 4th lane), again
+// via vshf index patterns; 32-bit vectors are returned unchanged since each
+// group of 3 already fills the vector's useful lanes.
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+ v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
+ return c;
+}
+
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+ v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
+ return c;
+}
+
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+// Double-precision gathers (two lanes) and (x, y) pair de-interleave,
+// mirroring the float versions above with 64-bit interleaves.
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+ double CV_DECL_ALIGNED(32) elems[2] =
+ {
+ tab[idx[0]],
+ tab[idx[1]]
+ };
+ return v_float64x2(msa_ld1q_f64(elems));
+}
+
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+ return v_float64x2(msa_ld1q_f64(tab + idx[0]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
+ v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
+ x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
+ y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
+}
+
+////// FP16 support ///////
+// Half-precision load-expand / pack-store. The hardware path (CV_FP16) uses
+// f16<->f32 conversion intrinsics, with a raw s16 load/store fallback when
+// msa_ld1_f16/msa_st1_f16 are not provided by the toolchain; the software
+// path converts element-wise through a scalar float buffer.
+#if CV_FP16
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+#ifndef msa_ld1_f16
+ v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
+#else
+ v4f16 v = msa_ld1_f16((const __fp16*)ptr);
+#endif
+ return v_float32x4(msa_cvt_f32_f16(v));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+ v4f16 hv = msa_cvt_f16_f32(v.val);
+
+#ifndef msa_st1_f16
+ msa_st1_s16((short*)ptr, (int16x4_t)hv);
+#else
+ msa_st1_f16((__fp16*)ptr, hv);
+#endif
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+ float buf[4];
+ for( int i = 0; i < 4; i++ )
+ buf[i] = (float)ptr[i];
+ return v_load(buf);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+ float buf[4];
+ v_store(buf, v);
+ for( int i = 0; i < 4; i++ )
+ ptr[i] = (float16_t)buf[i];
+}
+#endif
+
+// No per-call SIMD state to reset on MSA (unlike e.g. x86 _mm256_zeroupper).
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define 64-bit (half-width) vector types via GCC vector extensions,
+ * mirroring the NEON d-register naming scheme (v8i8 = 8 x signed char, ...).
+ */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load values from the given memory address to a 64-bit vector. */
+#define msa_ld1_s8(__a) (*((v8i8*)(__a)))
+#define msa_ld1_s16(__a) (*((v4i16*)(__a)))
+#define msa_ld1_s32(__a) (*((v2i32*)(__a)))
+#define msa_ld1_s64(__a) (*((v1i64*)(__a)))
+#define msa_ld1_u8(__a) (*((v8u8*)(__a)))
+#define msa_ld1_u16(__a) (*((v4u16*)(__a)))
+#define msa_ld1_u32(__a) (*((v2u32*)(__a)))
+#define msa_ld1_u64(__a) (*((v1u64*)(__a)))
+#define msa_ld1_f32(__a) (*((v2f32*)(__a)))
+#define msa_ld1_f64(__a) (*((v1f64*)(__a)))
+
+/* Load values from the given memory address to a 128-bit vector.
+   The MSA LD.* instructions do not require alignment. */
+#define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
+
+/* Store the elements of a 64-bit vector __b to the given memory address __a. */
+#define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b)
+#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
+#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
+#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
+#define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b)
+#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
+#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
+#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
+#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
+#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)
+
+/* Store the elements of the 128-bit vector __b to the given memory address __a. */
+#define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+
+/* Store the element with index __c of vector __b to the given memory address __a. */
+#define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
+#define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
+#define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
+#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
+#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
+#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
+#define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
+#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
+#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
+#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
+#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
+#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
+
+/* Duplicate a scalar into every element of a 64-bit doubleword vector.
+   For the 8/16-bit cases a full 128-bit fill is done first and its low
+   doubleword is extracted. */
+#define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_s32(__a) ((v2i32){__a, __a})
+#define msa_dup_n_s64(__a) ((v1i64){__a})
+#define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_u32(__a) ((v2u32){__a, __a})
+#define msa_dup_n_u64(__a) ((v1u64){__a})
+#define msa_dup_n_f32(__a) ((v2f32){__a, __a})
+#define msa_dup_n_f64(__a) ((v1f64){__a})
+
+/* Duplicate a scalar (dupq_n) or a single lane of a vector (dupq_lane)
+   into every element of a 128-bit quadword vector. */
+#define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
+#define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
+#define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b))
+#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
+#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
+#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
+#define msa_dupq_lane_u8(__a, __b) ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64-bit vector from the 64-bit integer bit pattern __a. */
+#define msa_create_s8(__a) ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a) ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign extends or zero extends each element in a 64 bits vector to twice its original length, and places the results in a 128 bits vector. */
+/* Transform v8i8 to v8i16 */
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v8u8 to v8u16 */
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v4i16 to v4i32 */
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2i32 to v2i64 */
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/* Transform v4u16 to v4u32 */
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2u32 to v2u64 */
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copies the least significant half of each element of a 128 bits vector into the corresponding elements of a 64 bits vector.
+   pckev packs the even (low-half) bytes/halfwords/words; the low doubleword of the result is extracted. */
+#define msa_movn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovn: saturate each element of a 128-bit vector to half its width, then narrow to a 64-bit vector. */
+#define msa_qmovn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovun: clamp signed input to >= 0 (max with zero), saturate to the unsigned
+   half-width range, then narrow to a 64-bit unsigned vector. */
+#define msa_qmovun_s16(__a) \
+({ \
+  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s32(__a) \
+({ \
+  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s64(__a) \
+({ \
+  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector.
+   srai = arithmetic shift (signed), srli = logical shift (unsigned); both truncate. */
+#define msa_shrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector.
+   srari/srlri round to nearest instead of truncating. */
+#define msa_rshrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)__b)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)__b)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)__b)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)__b)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)__b)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)__b)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift elements in a 128 bits vector by an immediate value, saturate the results and place them in a 64 bits vector. */
+#define msa_qrshrn_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u16(__a, __b) \
+({ \
+  v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u32(__a, __b) \
+({ \
+  v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u64(__a, __b) \
+({ \
+  v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Rounding right shift elements in a 128 bits vector by an immediate value, saturate the results and place them in a 64 bits vector.
+   Input is signed and output is unsigned: negatives are clamped to 0 first. */
+#define msa_qrshrun_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* pack: narrow two 128-bit vectors into one by keeping the low half of each element
+   (__a supplies the right/low half of the result, __b the left/high half). */
+#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+
+/* qpack: like pack, but saturate each element to the narrower range first. */
+#define msa_qpack_s16(__a, __b) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)))
+#define msa_qpack_s32(__a, __b) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)))
+#define msa_qpack_s64(__a, __b) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)))
+#define msa_qpack_u16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)))
+#define msa_qpack_u32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)))
+#define msa_qpack_u64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)))
+
+/* qpacku: saturating pack of signed input to unsigned output (negatives clamped to 0). */
+#define msa_qpacku_s16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7)))
+#define msa_qpacku_s32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15)))
+#define msa_qpacku_s64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31)))
+
+/* packr: right shift each element by __c (truncating), then pack. */
+#define msa_packr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
+#define msa_packr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))
+
+/* rpackr: rounding right shift each element by __c, then pack. */
+#define msa_rpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
+#define msa_rpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))
+
+/* qrpackr: rounding right shift, saturate to the narrower range, then pack. */
+#define msa_qrpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
+                       (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
+                       (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
+                       (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
+#define msa_qrpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
+
+/* qrpackru: like qrpackr but signed input to unsigned output (negatives clamped to 0 first). */
+#define msa_qrpackru_s16(__a, __b, __c) \
+({ \
+  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \
+  v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \
+  (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+})
+
+#define msa_qrpackru_s32(__a, __b, __c) \
+({ \
+  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \
+  v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \
+  (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+})
+
+#define msa_qrpackru_s64(__a, __b, __c) \
+({ \
+  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \
+  v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \
+  (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+})
+
+/* Minimum values between corresponding elements in the two vectors are written to the returned vector. */
+#define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b))
+#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b))
+#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b))
+#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b))
+#define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b))
+#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b))
+#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b))
+#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b))
+#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b))
+#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b))
+
+/* Maximum values between corresponding elements in the two vectors are written to the returned vector. */
+#define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b))
+#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b))
+#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b))
+#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b))
+#define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b))
+#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b))
+#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b))
+#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b))
+#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b))
+#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b))
+
+/* Vector type reinterpretation: a plain bit-preserving cast to type _Tpv. */
+#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec))
+
+/* Add the odd elements in vector __a with the even elements in vector __b to double width elements in the returned vector. */
+/* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */
+#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b)))
+/* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */
+#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b)))
+/* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */
+#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b)))
+
+/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. */
+#define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Copy odd elements in __a to the left half and odd elements in __b to the right half and return the result vector. */
+#define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Map a lane index to the in-register element index. On big-endian MIPS
+   (_MIPSEB) element order within a vector is reversed, so the index is
+   mirrored; on little-endian it is used as-is. The suffix gives the valid
+   index range of the vector being indexed (e.g. IMM0_7 -> lanes 0..7). */
+#ifdef _MIPSEB
+#define LANE_IMM0_1(x) (0b1 - ((x) & 0b1))
+#define LANE_IMM0_3(x) (0b11 - ((x) & 0b11))
+#define LANE_IMM0_7(x) (0b111 - ((x) & 0b111))
+#define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111))
+#else
+#define LANE_IMM0_1(x) ((x) & 0b1)
+#define LANE_IMM0_3(x) ((x) & 0b11)
+#define LANE_IMM0_7(x) ((x) & 0b111)
+#define LANE_IMM0_15(x) ((x) & 0b1111)
+#endif
+
+/* Extract lane __b from a 64-bit vector __a as a scalar. */
+#define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)])
+/* Fix: a v2f32 has only 2 lanes, so the index must use LANE_IMM0_1 (as the
+   other 2-lane types above do), not LANE_IMM0_3. With LANE_IMM0_3 the
+   big-endian mirror (3 - x) produced an out-of-range element index.
+   Behavior for valid indices 0..1 on little-endian is unchanged. */
+#define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
+/* Extract lane imm from a 128-bit vector as a scalar. */
+#define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15))
+#define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
+#define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
+#define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
+#define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
+#define msa_getq_lane_s32 __builtin_msa_copy_s_w
+#define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
+#define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
+/* 64-bit lane copy builtins only exist on MIPS64; fall back to subscripting. */
+#if (__mips == 64)
+#define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
+#define msa_getq_lane_s64 __builtin_msa_copy_s_d
+#else
+#define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#endif
+
+/* combine: concatenate two 64-bit vectors into one 128-bit vector
+   (a becomes the low half, b the high half, in memory order). */
+#if (__mips == 64)
+#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]}))
+#else
+/* 32-bit ABI: build from 32-bit halves to avoid 64-bit element access. */
+#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \
+                                                        ((v2u32)(b))[0], ((v2u32)(b))[1]}))
+#endif
+
+/* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */
+#define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b)
+
+/* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */
+#define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b)
+
+/* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */
+#define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b)
+
+/* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */
+#define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b)
+
+/* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */
+#define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b)
+
+/* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */
+#define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b)
+
+/* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */
+#define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b)
+
+/* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */
+#define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b)
+
+/* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */
+#define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b)
+
+/* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */
+#define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b)
+
+/* get_low, get_high: extract the low/high 64-bit half of a 128-bit vector.
+   On MIPS64 the doubleword copy builtin is used; on 32-bit ABIs the vector
+   is reinterpreted as v2u64 and subscripted directly. */
+#if (__mips == 64)
+#define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0))))
+#define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1))))
+#else
+#define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0]))
+#define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1]))
+#endif
+
+/* v8i8 msa_get_low_s8(v16i8 __a) */
+#define msa_get_low_s8(__a) __GET_LOW(v8i8, __a)
+
+/* v4i16 msa_get_low_s16(v8i16 __a) */
+#define msa_get_low_s16(__a) __GET_LOW(v4i16, __a)
+
+/* v2i32 msa_get_low_s32(v4i32 __a) */
+#define msa_get_low_s32(__a) __GET_LOW(v2i32, __a)
+
+/* v1i64 msa_get_low_s64(v2i64 __a) */
+#define msa_get_low_s64(__a) __GET_LOW(v1i64, __a)
+
+/* v8u8 msa_get_low_u8(v16u8 __a) */
+#define msa_get_low_u8(__a) __GET_LOW(v8u8, __a)
+
+/* v4u16 msa_get_low_u16(v8u16 __a) */
+#define msa_get_low_u16(__a) __GET_LOW(v4u16, __a)
+
+/* v2u32 msa_get_low_u32(v4u32 __a) */
+#define msa_get_low_u32(__a) __GET_LOW(v2u32, __a)
+
+/* v1u64 msa_get_low_u64(v2u64 __a) */
+#define msa_get_low_u64(__a) __GET_LOW(v1u64, __a)
+
+/* v2f32 msa_get_low_f32(v4f32 __a) */
+#define msa_get_low_f32(__a) __GET_LOW(v2f32, __a)
+
+/* v1f64 msa_get_low_f64(v2f64 __a) */
+#define msa_get_low_f64(__a) __GET_LOW(v1f64, __a)
+
+/* v8i8 msa_get_high_s8(v16i8 __a) */
+#define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a)
+
+/* v4i16 msa_get_high_s16(v8i16 __a) */
+#define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a)
+
+/* v2i32 msa_get_high_s32(v4i32 __a) */
+#define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a)
+
+/* v1i64 msa_get_high_s64(v2i64 __a) */
+#define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a)
+
+/* v8u8 msa_get_high_u8(v16u8 __a) */
+#define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a)
+
+/* v4u16 msa_get_high_u16(v8u16 __a) */
+#define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a)
+
+/* v2u32 msa_get_high_u32(v4u32 __a) */
+#define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a)
+
+/* v1u64 msa_get_high_u64(v2u64 __a) */
+#define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a)
+
+/* v2f32 msa_get_high_f32(v4f32 __a) */
+#define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a)
+
+/* v1f64 msa_get_high_f64(v2f64 __a) */
+#define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a)
+
+/* Lane-broadcast multiply / multiply-accumulate; __lane must be 0..3 for a
+ * v4f32 operand. */
+/* ri = ai * b[lane] */
+/* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */
+#define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane))
+
+/* ri = ai + bi * c[lane] */
+/* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */
+#define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane)))
+
+/* Horizontal sums (analogue of NEON vaddv). The intermediate hadd steps
+ * widen, but the final cast truncates the sum back to the element type, so
+ * the result wraps on overflow. */
+/* uint16_t msa_sum_u16(v8u16 __a)*/
+#define msa_sum_u16(__a)             \
+({                                   \
+  v4u32 _b;                          \
+  v2u64 _c;                          \
+  _b = __builtin_msa_hadd_u_w(__a, __a); \
+  _c = __builtin_msa_hadd_u_d(_b, _b);   \
+  (uint16_t)(_c[0] + _c[1]);         \
+})
+
+/* int16_t msa_sum_s16(v8i16 __a) */
+#define msa_sum_s16(__a)            \
+({                                  \
+  v4i32 _b;                         \
+  v2i64 _c;                         \
+  _b = __builtin_msa_hadd_s_w(__a, __a); \
+  _c = __builtin_msa_hadd_s_d(_b, _b);   \
+  (int16_t)(_c[0] + _c[1]);         \
+})
+
+
+/* uint32_t msa_sum_u32(v4u32 __a)*/
+#define msa_sum_u32(__a)           \
+({                                 \
+  v2u64 _b;                        \
+  _b = __builtin_msa_hadd_u_d(__a, __a); \
+  (uint32_t)(_b[0] + _b[1]);       \
+})
+
+/* int32_t msa_sum_s32(v4i32 __a)*/
+#define msa_sum_s32(__a)           \
+({                                 \
+  v2i64 _b;                        \
+  _b = __builtin_msa_hadd_s_d(__a, __a); \
+  (int32_t)(_b[0] + _b[1]);        \
+})
+
+/* uint8_t msa_sum_u8(v16u8 __a)*/
+#define msa_sum_u8(__a)            \
+({                                 \
+  v8u16 _b16;                      \
+  v4u32 _c32;                      \
+  _b16 = __builtin_msa_hadd_u_h(__a, __a); \
+  _c32 = __builtin_msa_hadd_u_w(_b16, _b16);   \
+  (uint8_t)msa_sum_u32(_c32);      \
+})
+
+/* int8_t msa_sum_s8(v16i8 __a) */
+#define msa_sum_s8(__a)            \
+({                                 \
+  v8i16 _b16;                      \
+  v4i32 _c32;                      \
+  _b16 = __builtin_msa_hadd_s_h(__a, __a); \
+  _c32 = __builtin_msa_hadd_s_w(_b16, _b16);   \
+  (int8_t)msa_sum_s32(_c32);       \
+})
+
+/* float msa_sum_f32(v4f32 __a)*/
+#define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3])
+
+/* Pairwise widening add (analogue of NEON vpaddl): adjacent element pairs
+ * are summed into elements of twice the width via hadd(__a, __a). */
+/* v8u16 msa_paddlq_u8(v16u8 __a) */
+#define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a))
+
+/* v8i16 msa_paddlq_s8(v16i8 __a) */
+#define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a))
+
+/* v4u32 msa_paddlq_u16 (v8u16 __a)*/
+#define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a))
+
+/* v4i32 msa_paddlq_s16 (v8i16 __a)*/
+#define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a))
+
+/* v2u64 msa_paddlq_u32(v4u32 __a) */
+#define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a))
+
+/* v2i64 msa_paddlq_s32(v4i32 __a) */
+#define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a))
+
+/* Element-wise widening conversions used to emulate NEON "long" operations.
+ * Each macro expands to a brace-enclosed initializer list, so it is only
+ * valid where a compound vector initializer is accepted, e.g.
+ * (v8i16)V8U8_2_V8I16(x). The argument is parenthesized so that expression
+ * arguments are indexed correctly (macro hygiene). */
+#define V8U8_2_V8U16(x) {(uint16_t)(x)[0], (uint16_t)(x)[1], (uint16_t)(x)[2], (uint16_t)(x)[3], \
+                        (uint16_t)(x)[4], (uint16_t)(x)[5], (uint16_t)(x)[6], (uint16_t)(x)[7]}
+#define V8U8_2_V8I16(x) {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                        (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V8I8_2_V8I16(x) {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                        (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V4U16_2_V4U32(x) {(uint32_t)(x)[0], (uint32_t)(x)[1], (uint32_t)(x)[2], (uint32_t)(x)[3]}
+#define V4U16_2_V4I32(x) {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V4I16_2_V4I32(x) {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V2U32_2_V2U64(x) {(uint64_t)(x)[0], (uint64_t)(x)[1]}
+#define V2U32_2_V2I64(x) {(int64_t)(x)[0], (int64_t)(x)[1]}
+
+/* Widening multiply (analogue of NEON vmull): the 64-bit inputs are widened
+ * with the V*_2_V* initializers above, then multiplied at double width. */
+/* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b) */
+#define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b)))
+
+/* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b)*/
+#define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b)))
+
+/* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */
+#define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b)))
+
+/* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */
+#define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b)))
+
+/* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */
+#define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b)))
+
+/* Bitwise operations act on the raw 128 bits, so the arguments are cast
+ * through v16u8 and the result is cast back to the requested element type. */
+/* bitwise and: __builtin_msa_and_v */
+#define msa_andq_u8(__a, __b)  ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s8(__a, __b)  ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise or: __builtin_msa_or_v */
+#define msa_orrq_u8(__a, __b)  ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s8(__a, __b)  ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise xor: __builtin_msa_xor_v */
+#define msa_eorq_u8(__a, __b)  ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s8(__a, __b)  ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff) */
+#define msa_mvnq_u8(__a)  ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s8(__a)  ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+
+/* Element comparisons: each result lane is all-ones when the predicate holds
+ * and zero otherwise; results use the unsigned vector types. Greater-than and
+ * greater-equal are implemented by swapping the operands of clt/cle. */
+/* compare equal: ceq -> ri = ai == bi ? 1...1:0...0 */
+#define msa_ceqq_u8(__a, __b)  ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_s8(__a, __b)  ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b)))
+
+/* Compare less-than: clt -> ri = ai < bi ? 1...1:0...0 */
+#define msa_cltq_u8(__a, __b)  ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cltq_s8(__a, __b)  ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0 */
+#define msa_cgtq_u8(__a, __b)  ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgtq_s8(__a, __b)  ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a)))
+
+/* compare less-equal: cle -> ri = ai <= bi ? 1...1:0...0 */
+#define msa_cleq_u8(__a, __b)  ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cleq_s8(__a, __b)  ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0 */
+#define msa_cgeq_u8(__a, __b)  ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgeq_s8(__a, __b)  ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a)))
+
+/* Shifts: unsigned right shifts are logical (srl*), signed right shifts are
+ * arithmetic (sra*). The *_n_* variants use the immediate-form builtins, so
+ * __imm must be a compile-time constant in range for the element width. */
+/* Shift Left Logical: shl -> ri = ai << bi; */
+#define msa_shlq_u8(__a, __b)  ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shlq_s8(__a, __b)  ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Immediate Shift Left Logical: shl -> ri = ai << imm; */
+#define msa_shlq_n_u8(__a, __imm)  ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm))
+#define msa_shlq_n_s8(__a, __imm)  ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm))
+#define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm))
+#define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm))
+#define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm))
+#define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm))
+#define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm))
+#define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm))
+
+/* shift right: shrq -> ri = ai >> bi; */
+#define msa_shrq_u8(__a, __b)  ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shrq_s8(__a, __b)  ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Immediate Shift Right: shr -> ri = ai >> imm; */
+#define msa_shrq_n_u8(__a, __imm)  ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm))
+#define msa_shrq_n_s8(__a, __imm)  ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm))
+#define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm))
+#define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm))
+#define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm))
+#define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm))
+#define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm))
+#define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm))
+
+/* Immediate Shift Right Rounded: shr -> ri = ai >> (rounded)imm; */
+#define msa_rshrq_n_u8(__a, __imm)  ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm))
+#define msa_rshrq_n_s8(__a, __imm)  ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm))
+#define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm))
+#define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm))
+#define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm))
+#define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm))
+#define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm))
+#define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm))
+
+/* Vector rounding shift right (arithmetic): ri = ai >> bi, rounded.
+ * The original comment said "shift left", but srar.w is a Shift Right
+ * Arithmetic Rounded; note it rounds but does not saturate. Spelled with
+ * __builtin_msa_* for consistency with the rest of this header (the plain
+ * __msa_* aliases are only available when <msa.h> is included). */
+#define msa_qrshrq_s32(a, b) ((v4i32)__builtin_msa_srar_w((v4i32)(a), (v4i32)(b)))
+
+/* Rename the msa builtin func to unify the name style for intrin_msa.hpp.
+ * qadd/qsub map to the saturating adds/subs builtins; unsigned variants of
+ * addv/subv/mulv are wrapped with casts because the builtins take signed
+ * vectors; dotp/dpadd expose the widening dot-product builtins as-is. */
+#define msa_qaddq_u8  __builtin_msa_adds_u_b
+#define msa_qaddq_s8  __builtin_msa_adds_s_b
+#define msa_qaddq_u16 __builtin_msa_adds_u_h
+#define msa_qaddq_s16 __builtin_msa_adds_s_h
+#define msa_qaddq_u32 __builtin_msa_adds_u_w
+#define msa_qaddq_s32 __builtin_msa_adds_s_w
+#define msa_qaddq_u64 __builtin_msa_adds_u_d
+#define msa_qaddq_s64 __builtin_msa_adds_s_d
+#define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))
+#define msa_addq_s8   __builtin_msa_addv_b
+#define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
+#define msa_addq_s16  __builtin_msa_addv_h
+#define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
+#define msa_addq_s32  __builtin_msa_addv_w
+#define msa_addq_f32  __builtin_msa_fadd_w
+#define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
+#define msa_addq_s64  __builtin_msa_addv_d
+#define msa_addq_f64  __builtin_msa_fadd_d
+#define msa_qsubq_u8  __builtin_msa_subs_u_b
+#define msa_qsubq_s8  __builtin_msa_subs_s_b
+#define msa_qsubq_u16 __builtin_msa_subs_u_h
+#define msa_qsubq_s16 __builtin_msa_subs_s_h
+#define msa_subq_u8(a, b) ((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b)))
+#define msa_subq_s8   __builtin_msa_subv_b
+#define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b)))
+#define msa_subq_s16  __builtin_msa_subv_h
+#define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b)))
+#define msa_subq_s32  __builtin_msa_subv_w
+#define msa_subq_f32  __builtin_msa_fsub_w
+#define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b)))
+#define msa_subq_s64  __builtin_msa_subv_d
+#define msa_subq_f64  __builtin_msa_fsub_d
+#define msa_mulq_u8(a, b)  ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
+#define msa_mulq_s8(a, b)  ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
+#define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_f32  __builtin_msa_fmul_w
+#define msa_mulq_f64  __builtin_msa_fmul_d
+#define msa_divq_f32  __builtin_msa_fdiv_w
+#define msa_divq_f64  __builtin_msa_fdiv_d
+#define msa_dotp_s_h  __builtin_msa_dotp_s_h
+#define msa_dotp_s_w  __builtin_msa_dotp_s_w
+#define msa_dotp_s_d  __builtin_msa_dotp_s_d
+#define msa_dotp_u_h  __builtin_msa_dotp_u_h
+#define msa_dotp_u_w  __builtin_msa_dotp_u_w
+#define msa_dotp_u_d  __builtin_msa_dotp_u_d
+#define msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define msa_dpadd_u_d __builtin_msa_dpadd_u_d
+
+/* Interleave helpers: write the right (low-half) interleave of in0/in1 into
+ * `low` via ilvr, and the left (high-half) interleave into `hi` via ilvl.
+ * The suffixed wrappers fix the result cast type. */
+#define ILVRL_B2(RTYPE, in0, in1, low, hi) do {             \
+      low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
+      hi  = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
+    } while (0)
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, low, hi) do {             \
+      low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
+      hi  = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
+    } while (0)
+#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
+#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
+#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__)
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, low, hi) do {             \
+      low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
+      hi  = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
+    } while (0)
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
+
+/* absq, qabsq (r = |a|;)
+ * Integer abs is computed as add_a(a, 0) = |a| + |0|; the q-variants use the
+ * saturating adds_a. Float abs simply clears the sign bit (bclri 31/63). */
+#define msa_absq_s8(a)   __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0))
+#define msa_absq_s16(a)  __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0))
+#define msa_absq_s32(a)  __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0))
+#define msa_absq_s64(a)  __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0))
+#define msa_absq_f32(a)  ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31))
+#define msa_absq_f64(a)  ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63))
+#define msa_qabsq_s8(a)  __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0))
+#define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0))
+#define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0))
+#define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0))
+
+/* abdq, qabdq (r = |a - b|;) */
+#define msa_abdq_u8  __builtin_msa_asub_u_b
+#define msa_abdq_s8  __builtin_msa_asub_s_b
+#define msa_abdq_u16 __builtin_msa_asub_u_h
+#define msa_abdq_s16 __builtin_msa_asub_s_h
+#define msa_abdq_u32 __builtin_msa_asub_u_w
+#define msa_abdq_s32 __builtin_msa_asub_s_w
+#define msa_abdq_u64 __builtin_msa_asub_u_d
+#define msa_abdq_s64 __builtin_msa_asub_s_d
+#define msa_abdq_f32(a, b)  msa_absq_f32(__builtin_msa_fsub_w(a, b))
+#define msa_abdq_f64(a, b)  msa_absq_f64(__builtin_msa_fsub_d(a, b))
+#define msa_qabdq_s8(a, b)  msa_qabsq_s8(__builtin_msa_subs_s_b(a, b))
+#define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b))
+#define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b))
+#define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b))
+
+/* sqrtq, rsqrtq */
+#define msa_sqrtq_f32  __builtin_msa_fsqrt_w
+#define msa_sqrtq_f64  __builtin_msa_fsqrt_d
+#define msa_rsqrtq_f32 __builtin_msa_frsqrt_w
+#define msa_rsqrtq_f64 __builtin_msa_frsqrt_d
+
+
+/* mlaq: r = a + b * c;
+ * Implemented with inline asm because maddv/fmadd accumulate in place: the
+ * destination register is also the addend, expressed with the "+f" constraint
+ * on __a. NOTE(review): the "f" constraint and %w modifier place operands in
+ * MSA (FP-overlapping) registers — confirm against the GCC MIPS backend docs
+ * if touching these. */
+__extension__ extern __inline v4i32
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c)
+{
+ __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n"
+                  // Outputs
+                  : [__a] "+f"(__a)
+                  // Inputs
+                  : [__b] "f"(__b), [__c] "f"(__c));
+ return __a;
+}
+
+__extension__ extern __inline v2i64
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c)
+{
+ __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n"
+                  // Outputs
+                  : [__a] "+f"(__a)
+                  // Inputs
+                  : [__b] "f"(__b), [__c] "f"(__c));
+ return __a;
+}
+
+__extension__ extern __inline v4f32
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c)
+{
+ __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n"
+                  // Outputs
+                  : [__a] "+f"(__a)
+                  // Inputs
+                  : [__b] "f"(__b), [__c] "f"(__c));
+ return __a;
+}
+
+__extension__ extern __inline v2f64
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c)
+{
+ __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n"
+                  // Outputs
+                  : [__a] "+f"(__a)
+                  // Inputs
+                  : [__b] "f"(__b), [__c] "f"(__c));
+ return __a;
+}
+
+/* cntq: per-element population count */
+#define msa_cntq_s8  __builtin_msa_pcnt_b
+#define msa_cntq_s16 __builtin_msa_pcnt_h
+#define msa_cntq_s32 __builtin_msa_pcnt_w
+#define msa_cntq_s64 __builtin_msa_pcnt_d
+
+/* bslq (a: mask; r = b(if a == 0); r = c(if a == 1);)
+ * bsel_v selects per BIT of the mask, not per element; with all-ones/all-zero
+ * comparison masks the effect is per-element selection. */
+#define msa_bslq_u8  __builtin_msa_bsel_v
+
+/* ilvrq, ilvlq (For EL only, ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */
+#define msa_ilvrq_s8  __builtin_msa_ilvr_b
+#define msa_ilvrq_s16 __builtin_msa_ilvr_h
+#define msa_ilvrq_s32 __builtin_msa_ilvr_w
+#define msa_ilvrq_s64 __builtin_msa_ilvr_d
+#define msa_ilvlq_s8  __builtin_msa_ilvl_b
+#define msa_ilvlq_s16 __builtin_msa_ilvl_h
+#define msa_ilvlq_s32 __builtin_msa_ilvl_w
+#define msa_ilvlq_s64 __builtin_msa_ilvl_d
+
+/* ilvevq, ilvodq (ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3; ) */
+#define msa_ilvevq_s8  __builtin_msa_ilvev_b
+#define msa_ilvevq_s16 __builtin_msa_ilvev_h
+#define msa_ilvevq_s32 __builtin_msa_ilvev_w
+#define msa_ilvevq_s64 __builtin_msa_ilvev_d
+#define msa_ilvodq_s8  __builtin_msa_ilvod_b
+#define msa_ilvodq_s16 __builtin_msa_ilvod_h
+#define msa_ilvodq_s32 __builtin_msa_ilvod_w
+#define msa_ilvodq_s64 __builtin_msa_ilvod_d
+
+/* extq (r = (a || b); a concatenation b and get elements from index c)
+ * Implemented with vshf: a constant index vector is offset by the start
+ * index c (fill + addv/subv), then used to gather from the concatenated
+ * pair. The index constants and operand order differ between big- and
+ * little-endian because vshf numbers elements from the memory layout. */
+#ifdef _MIPSEB
+#define msa_extq_s8(a, b, c) \
+(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
+#else
+#define msa_extq_s8(a, b, c) \
+(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
+#endif /* _MIPSEB */
+
+/* cvttruncq, cvttintq, cvtrintq
+ * ftrunc converts float->int truncating toward zero; ftint converts using the
+ * current rounding mode; frint rounds to an integral float value. */
+#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
+#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
+#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
+#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
+#define msa_cvttintq_u32_f32  __builtin_msa_ftint_u_w
+#define msa_cvttintq_s32_f32  __builtin_msa_ftint_s_w
+#define msa_cvttintq_u64_f64  __builtin_msa_ftint_u_d
+#define msa_cvttintq_s64_f64  __builtin_msa_ftint_s_d
+#define msa_cvtrintq_f32      __builtin_msa_frint_w
+#define msa_cvtrintq_f64      __builtin_msa_frint_d
+
+/* cvtfintq, cvtfq
+ * ffint converts int->float; fexdo narrows f64->f32 pairs; fexupr/fexupl
+ * widen the right/left half of a f32 vector to f64. */
+#define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w
+#define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w
+#define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d
+#define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d
+#define msa_cvtfq_f32_f64    __builtin_msa_fexdo_w
+#define msa_cvtflq_f64_f32   __builtin_msa_fexupr_d
+#define msa_cvtfhq_f64_f32   __builtin_msa_fexupl_d
+
+/* Widening add/sub on 64-bit inputs, plus reciprocal estimate and the
+ * Newton-Raphson step recps (2 - a*b), matching NEON vrecps semantics. */
+#define msa_addl_u8(a, b)   ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b)))
+#define msa_addl_s8(a, b)   (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
+#define msa_addl_u16(a, b)  ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
+#define msa_addl_s16(a, b)  (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
+#define msa_subl_s16(a, b)  (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
+#define msa_recpeq_f32      __builtin_msa_frcp_w
+#define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b)))
+
+/* 2-channel de-interleaving load / interleaving store (NEON vld2q/vst2q).
+ * ld2q loads 2*nlanes elements and splits even-indexed elements into *a and
+ * odd-indexed into *b via pckev/pckod; st2q re-interleaves with ilvr/ilvl. */
+#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
+  *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \
+{ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2)
+
+/* 3-channel de-interleaving load for 8-bit elements (NEON vld3q): loads 48
+ * elements and gathers every 3rd byte into *a/*b/*c using two vshf passes
+ * with precomputed byte-index constants. Separate big-/little-endian index
+ * tables, since vshf element numbering follows the memory layout. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8)
+
+/* msa_ld3q_u16/s16: de-interleave 24 halfwords at ptr into three 8-lane
+ * vectors (same a/b/c channel split as the 8-bit variant above), using
+ * vshf.h halfword-lane permutation masks; endian-specific masks as before. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+ _Tpv v0 = msa_ld1q_##suffix(ptr); \
+ _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+ _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+ _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \
+ *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+ v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \
+ *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+ v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \
+ *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+ _Tpv v0 = msa_ld1q_##suffix(ptr); \
+ _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+ _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+ _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \
+ *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \
+ v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \
+ *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \
+ v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \
+ *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16)
+
+/* msa_ld3q_u32/s32/f32: de-interleave 12 words at ptr into three 4-lane
+ * vectors.  Built from ilvr.w/ilvl.d interleaves instead of shuffle masks,
+ * so one definition serves both endiannesses. */
+#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+ _Tpv v00 = msa_ld1q_##suffix(ptr); \
+ _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \
+ _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \
+ _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \
+ _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \
+ _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \
+ *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \
+ *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); \
+ *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32)
+
+/* msa_ld3q_u64/s64/f64: only two lanes per vector, so the de-interleave is
+ * done with plain scalar loads through element pointers rather than vector
+ * shuffles. */
+#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+ *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \
+ *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64)
+
+/* msa_st3q_u8/s8: inverse of msa_ld3q_* -- interleave three 16-lane vectors
+ * into 48 bytes at ptr (output byte order a0,b0,c0,a1,b1,c1,...).  Each of
+ * the three stores is produced by a two-stage vshf.b with hand-derived,
+ * endian-specific permutation masks. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8)
+
+/* msa_st3q_u16/s16: interleave three 8-lane halfword vectors into 24
+ * halfwords at ptr, via two-stage vshf.h with endian-specific masks. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16)
+
+/* msa_st3q_u32/s32/f32: interleave three 4-lane word vectors into 12 words
+ * at ptr, via two-stage vshf.w with endian-specific masks. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
+ v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \
+ v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \
+ msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32)
+MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32)
+MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32)
+
+/* msa_st3q_u64/s64/f64: two lanes per vector, so the interleave is done with
+ * scalar lane extraction and stores. */
+#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+ *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \
+ *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \
+}
+
+MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64)
+
+/* 4-channel de-interleave/interleave (msa_ld4q_*/msa_st4q_*).
+ * ld4q: two pckev/pckod passes split even/odd lanes twice, yielding the four
+ * channels.  st4q: two ilvr/ilvl passes re-interleave them.  Pure pack/
+ * interleave builtins, so no endian-specific masks are needed. */
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ \
+ _Tpv v0 = msa_ld1q_##suffix(ptr); \
+ _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
+ _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \
+ _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \
+ _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
+ _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \
+ _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
+ _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \
+ *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \
+ *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \
+ *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \
+ *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ \
+ _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \
+ _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \
+ _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \
+ _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \
+ msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \
+ msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \
+ msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \
+ msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4)
+
+/* 4-channel ld4q/st4q for 64-bit element types: with two lanes per vector a
+ * single level of ilvr_d/ilvl_d suffices in each direction. */
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ \
+ _Tpv v0 = msa_ld1q_##suffix(ptr); \
+ _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \
+ _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \
+ _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \
+ *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \
+ *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \
+ *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \
+ *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ \
+ msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \
+ msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \
+ msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \
+ msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)
+
+/* msa_qdmulhq_n_s16: per-lane doubling multiply by scalar b returning the
+ * high 16 bits, mirroring NEON's vqdmulhq_n_s16.  Each s16 lane of 'a' is
+ * widened to 32 bits (interleave with zero, then pairwise-widen), multiplied
+ * by b, doubled (shift left 1), and the two halves are re-packed taking the
+ * upper 16 bits of each product.
+ * NOTE(review): NEON's vqdmulhq saturates; whether msa_packr_s32 saturates
+ * here depends on its definition elsewhere in this header -- confirm. */
+__extension__ extern __inline v8i16
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_qdmulhq_n_s16(v8i16 a, int16_t b)
+{
+  v8i16 a_lo, a_hi;
+  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);
+  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1),
+                       msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /*__mips_msa*/
+#endif /* OPENCV_CORE_MSA_MACROS_H */
int x = 0;
#if CV_SIMD
- #if !CV_NEON
+ #if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
for (; x <= width - wide_step_l; x += wide_step_l)
{
ldr::la(src1 + x, src2 + x, dst + x);
- #if !CV_NEON && CV_SIMD_WIDTH == 16
+ #if CV_SIMD_WIDTH == 16
ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
#endif
}
i += blockSize;
}
}
+#elif CV_MSA
+ int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+ v4i32 v_zero = msa_dupq_n_s32(0);
+ CV_DECL_ALIGNED(16) int buf[4];
+
+ while( i < len0 )
+ {
+ blockSize = std::min(len0 - i, blockSize0);
+ v4i32 v_sum = v_zero;
+
+ int j = 0;
+ for( ; j <= blockSize - 16; j += 16 )
+ {
+ v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
+
+ v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
+ v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+ v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+
+ v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
+ v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
+ v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+ v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+ }
+
+ for( ; j <= blockSize - 8; j += 8 )
+ {
+ v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
+ v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
+ v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
+ }
+
+ msa_st1q_s32(buf, v_sum);
+ r += buf[0] + buf[1] + buf[2] + buf[3];
+
+ src1 += blockSize;
+ src2 += blockSize;
+ i += blockSize;
+ }
#endif
return r + dotProd_(src1, src2, len - i);
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
# elif defined __GNUC__ && defined __arm__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
+# elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
+# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
# elif defined __GNUC__ && defined __PPC64__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
# else
g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
+ g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
+
g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
#if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
have[CV_CPU_NEON] = true;
#endif
+ #ifdef __mips_msa
+ have[CV_CPU_MSA] = true;
+ #endif
// there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
have[CV_CPU_VSX] = (CV_VSX);
// TODO: Check VSX3 availability in runtime for other platforms
--- /dev/null
+# ----------------------------------------------------------------------------------------------
+# MIPS ToolChanin can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use cmake-gui application for build scripts configuration and generation:
+# 1. Run cmake-gui
+# 2. Specifiy toolchain file for cross-compiling, mips32r5el-gnu.toolchian.cmake or mips64r6el-gnu.toolchain.cmake
+# can be selected.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 & make install
+# ----------------------------------------------------------------------------------------------
+
+if(COMMAND toolchain_save_config)
+ return() # prevent recursive call
+endif()
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+if(NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
+ set(CMAKE_SYSTEM_PROCESSOR mips)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL mips AND NOT MIPS_IGNORE_FP)
+ set(FLOAT_ABI_SUFFIX "")
+endif()
+
+if(NOT "x${GCC_COMPILER_VERSION}" STREQUAL "x")
+ set(__GCC_VER_SUFFIX "-${GCC_COMPILER_VERSION}")
+endif()
+
+if(NOT DEFINED CMAKE_C_COMPILER)
+ find_program(CMAKE_C_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-gcc${__GCC_VER_SUFFIX})
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+ find_program(CMAKE_CXX_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-g++${__GCC_VER_SUFFIX})
+endif()
+if(NOT DEFINED CMAKE_LINKER)
+ find_program(CMAKE_LINKER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld)
+endif()
+if(NOT DEFINED CMAKE_AR)
+ find_program(CMAKE_AR NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar)
+endif()
+
+if(NOT DEFINED MIPS_LINUX_SYSROOT AND DEFINED GNU_MACHINE)
+ set(MIPS_LINUX_SYSROOT /usr/bin)
+endif()
+
+if(NOT DEFINED CMAKE_CXX_FLAGS)
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "mips32r5el")
+ set(CMAKE_C_FLAGS "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
+ set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
+ set(CMAKE_CXX_FLAGS "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
+ set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
+ set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -ldl -latomic" CACHE INTERNAL "Added for mips cross build error")
+
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64r6el")
+ set(CMAKE_C_FLAGS "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
+ set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
+ set(CMAKE_CXX_FLAGS "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
+ set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
+ set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -ldl" CACHE INTERNAL "Added for mips cross build error")
+
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
+ endif()
+ set(CMAKE_SHARED_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+ set(CMAKE_MODULE_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${MIPS_LINUX_SYSROOT})
+
+set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
+ MIPS_LINUX_SYSROOT
+)
+toolchain_save_config()
--- /dev/null
+# ----------------------------------------------------------------------------------------------
+# MIPS ToolChanin can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use cmake-gui for build scripts configuration and generation:
+# 1. Run cmake-gui
+# 2. Specifiy toolchain file mips32r5el-gnu.toolchian.cmake for cross-compiling.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 & make install
+# ----------------------------------------------------------------------------------------------
+set(CMAKE_SYSTEM_PROCESSOR mips32r5el)
+set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
+set(GNU_MACHINE "mips-mti-linux-gnu" CACHE STRING "GNU compiler triple")
+include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")
--- /dev/null
+# ----------------------------------------------------------------------------------------------
+# MIPS ToolChanin can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
+# Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
+# Toolchains with 'img' in the name are for MIPS R6 instruction sets.
+# It is recommended to use cmake-gui for build scripts configuration and generation:
+# 1. Run cmake-gui
+# 2. Specifiy toolchain file mips64r6el-gnu.toolchain.cmake for cross-compiling.
+# 3. Configure and Generate makefiles.
+# 4. make -j4 & make install
+# ----------------------------------------------------------------------------------------------
+set(CMAKE_SYSTEM_PROCESSOR mips64r6el)
+set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
+set(GNU_MACHINE "mips-img-linux-gnu" CACHE STRING "GNU compiler triple")
+include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")