From d387c89e86de35fe3f12b5c9db2919bc82f90157 Mon Sep 17 00:00:00 2001
From: yuanhecai <yuanhecai@loongson.cn>
Date: Thu, 31 Mar 2022 15:01:30 +0800
Subject: [PATCH] Update loongson_intrinsics.h from v1.0.5 to v1.2.1

Bug: webm:1755

Change-Id: Ib636d2aa521332b76b6aa1b0aa0a9005aafbf32b
---
 vpx_util/loongson_intrinsics.h | 259 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 240 insertions(+), 19 deletions(-)

diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h
index a34b6e8..b8b9e6d 100644
--- a/vpx_util/loongson_intrinsics.h
+++ b/vpx_util/loongson_intrinsics.h
@@ -39,8 +39,8 @@
  * MICRO version: Comment changes or implementation changes.
  */
 #define LSOM_VERSION_MAJOR 1
-#define LSOM_VERSION_MINOR 0
-#define LSOM_VERSION_MICRO 5
+#define LSOM_VERSION_MINOR 2
+#define LSOM_VERSION_MICRO 1
 
 #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
   {                                               \
@@ -90,8 +90,8 @@
  *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
- *               Then the results plus to signed half-word elements from in_c.
+ *               each other to get a result twice the size of input. Then
+ *               the results are added to signed half-word elements from in_c.
  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
  *        in_c : 1,2,3,4, 1,2,3,4
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
@@ -116,9 +116,9 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
  *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               unsigned byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
- *               The results plus to signed half-word elements from in_c.
- * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
  *        in_c : 1,2,3,4, 1,2,3,4
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
@@ -136,14 +136,40 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
 
 /*
  * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);  // even-indexed pairs
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);   // odd-indexed pairs
+  return out;
+}
+
+/*
+ * =============================================================================
  * Description : Dot product & addition of half-word vector elements
  * Arguments   : Inputs  - in_c, in_h, in_l
  *               Outputs - out
  *               Return Type - __m128i
  * Details     : Signed half-word elements from in_h are multiplied by
  *               signed half-word elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
- *               Then the results plus to signed word elements from in_c.
+ *               each other to get a result twice the size of input.
+ *               Then the results are added to signed word elements from in_c.
  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
  *        in_c : 1,2,3,4
  *        in_h : 1,2,3,4, 5,6,7,8
@@ -168,7 +194,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
  *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
+ *               each other to get a result twice the size of input.
  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
@@ -191,7 +217,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               unsigned byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
+ *               each other to get a result twice the size of input.
  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
@@ -214,7 +240,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
+ *               each other to get a result twice the size of input.
  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
@@ -237,7 +263,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
- *               each other to get results with the twice size of input.
+ *               each other to get a result twice the size of input.
  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
  *        in_h : 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1
@@ -254,6 +280,29 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
 
 /*
  * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - double-word
+ * Details     : Signed word elements from in_h are multiplied by
+ *               signed word elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
+ *        in_h : 1,2,3,4
+ *        in_l : 8,7,6,5
+ *         out : 22,38
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_d_w(in_h, in_l);        // even-indexed pairs
+  out = __lsx_vmaddwod_d_w(out, in_h, in_l);  // odd-indexed pairs
+  return out;
+}
+
+/*
+ * =============================================================================
  * Description : Clip all halfword elements of input vector between min & max
  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
  *               (_in))
@@ -679,6 +728,132 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) {
     _out7 = __lsx_vsub_d(_in0, _in7);                                      \
   }
 
+/*
+ * =============================================================================
+ * Description : Butterfly of 16 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in15;
+ *              _out1 = _in1 + _in14;
+ *              _out2 = _in2 + _in13;
+ *              _out3 = _in3 + _in12;
+ *              _out4 = _in4 + _in11;
+ *              _out5 = _in5 + _in10;
+ *              _out6 = _in6 + _in9;
+ *              _out7 = _in7 + _in8;
+ *              _out8 = _in7 - _in8;
+ *              _out9 = _in6 - _in9;
+ *              _out10 = _in5 - _in10;
+ *              _out11 = _in4 - _in11;
+ *              _out12 = _in3 - _in12;
+ *              _out13 = _in2 - _in13;
+ *              _out14 = _in1 - _in14;
+ *              _out15 = _in0 - _in15;
+ * =============================================================================
+ */
+
+#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_b(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_b(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_b(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_b(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_b(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_b(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_b(_in7, _in8);                                          \
+    /* Inputs are re-read below: _out0.._out7 must not alias any input. */     \
+    _out8 = __lsx_vsub_b(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_b(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_b(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_b(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_b(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_b(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_b(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_b(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_h(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_h(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_h(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_h(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_h(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_h(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_h(_in7, _in8);                                          \
+    /* Inputs are re-read below: _out0.._out7 must not alias any input. */     \
+    _out8 = __lsx_vsub_h(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_h(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_h(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_h(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_h(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_h(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_h(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_h(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_w(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_w(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_w(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_w(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_w(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_w(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_w(_in7, _in8);                                          \
+    /* Inputs are re-read below: _out0.._out7 must not alias any input. */     \
+    _out8 = __lsx_vsub_w(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_w(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_w(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_w(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_w(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_w(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_w(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_w(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_d(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_d(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_d(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_d(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_d(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_d(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_d(_in7, _in8);                                          \
+    /* Inputs are re-read below: _out0.._out7 must not alias any input. */     \
+    _out8 = __lsx_vsub_d(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_d(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_d(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_d(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_d(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_d(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_d(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_d(_in0, _in15);                                        \
+  }
+
 #endif  // LSX
 
 #ifdef __loongarch_asx
@@ -692,7 +867,7 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) {
  * Details     : Unsigned byte elements from in_h are multiplied with
  *               unsigned byte elements from in_l producing a result
  *               twice the size of input i.e. signed halfword.
- *               Then this multiplied results of adjacent odd-even elements
+ *               Then these multiplied results of adjacent odd-even elements
  *               are added to the out vector
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
@@ -714,7 +889,7 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
  * Details     : Signed byte elements from in_h are multiplied with
  *               signed byte elements from in_l producing a result
  *               twice the size of input i.e. signed halfword.
- *               Then this multiplication results of adjacent odd-even elements
+ *               Then these multiplication results of adjacent odd-even elements
  *               are added to the out vector
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
@@ -736,7 +911,7 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
  * Details     : Signed halfword elements from in_h are multiplied with
  *               signed halfword elements from in_l producing a result
  *               twice the size of input i.e. signed word.
- *               Then this multiplied results of adjacent odd-even elements
+ *               Then these multiplied results of adjacent odd-even elements
  *               are added to the out vector.
  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
@@ -761,7 +936,7 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
  * Details     : Signed word elements from in_h are multiplied with
  *               signed word elements from in_l producing a result
  *               twice the size of input i.e. signed double-word.
- *               Then this multiplied results of adjacent odd-even elements
+ *               Then these multiplied results of adjacent odd-even elements
  *               are added to the out vector.
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
@@ -805,7 +980,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  * Details     : Signed byte elements from in_h are multiplied with
  *               signed byte elements from in_l producing a result
  *               twice the size of input i.e. signed halfword.
- *               Then this multiplied results of adjacent odd-even elements
+ *               Then these multiplied results of adjacent odd-even elements
  *               are added to the in_c vector.
  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  * =============================================================================
@@ -821,6 +996,52 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
 
 /*
  * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);  // even-indexed pairs
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);   // odd-indexed pairs
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);  // even-indexed pairs
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);   // odd-indexed pairs
+  return out;
+}
+
+/*
+ * =============================================================================
  * Description : Dot product of halfword vector elements
  * Arguments   : Inputs - in_c, in_h, in_l
  *               Output - out
@@ -955,7 +1176,7 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
  * Details     : Signed halfword elements from in_h are multiplied with
  *               signed halfword elements from in_l producing a result
  *               four times the size of input i.e. signed doubleword.
- *               Then this multiplication results of four adjacent elements
+ *               Then these multiplication results of four adjacent elements
  *               are added together and stored to the out vector.
  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
-- 
2.7.4