2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
12 #define VPX_DSP_MIPS_INV_TXFM_MSA_H_
14 #include "vpx_dsp/mips/macros_msa.h"
15 #include "vpx_dsp/mips/txfm_macros_msa.h"
16 #include "vpx_dsp/txfm_common.h"
/*
 * VP9_ADST8(in0..in7, out0..out7)
 *
 * Statement-style macro operating on eight v8i16 input vectors and eight
 * output vectors; judging by the name it is an 8-point ADST step.
 *
 * What the visible statements do:
 *  - coeff0_m holds cospi_{2,6,10,14,18,22,26,30}_64 and coeff1_m holds
 *    +/-cospi_8_64, +/-cospi_16_64 and +/-cospi_24_64 (last two lanes 0).
 *    Single lanes are splatted (SPLATI_H2_SH / SPLATI_H4_SH) and then
 *    pair-interleaved (ILVEV_H2_SH / __msa_ilvev_h) to build the constant
 *    vectors fed to the dot-product helpers.
 *  - The input pairs (in0,in7), (in4,in3), (in2,in5) and (in6,in1) are
 *    interleaved with ILVRL_H2_SH and handed to DOT_ADD_SUB_SRARI_PCK.
 *  - BUTTERFLY_4 is applied to in7, in0, in2, in5 with s1_m, s0_m, in2, in5
 *    as its trailing arguments.
 *  - out3/out4 are produced by DOT_SHIFT_RIGHT_PCK_H from the interleaved
 *    (in2,in5) pair, out2/out5 from the interleaved (s0_m,s1_m) pair.
 *
 * NOTE(review): in0, in2, in5 and in7 are passed in what look like output
 * positions of the helper macros, so callers should presumably pass
 * modifiable lvalues and not rely on the inputs afterwards -- confirm
 * against the helper definitions in the included MSA macro headers.
 */
18 #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
19 out0, out1, out2, out3, out4, out5, out6, out7) { \
20 v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
21 v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
22 v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
23 cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
24 v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \
25 -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \
27 SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
29 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
30 SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
32 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
34 ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
35 ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
36 DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
37 cnst1_m, cnst2_m, cnst3_m, in7, in0, \
40 SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
42 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
43 SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
45 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
47 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
48 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
50 DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
51 cnst1_m, cnst2_m, cnst3_m, in5, in2, \
53 BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
57 SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \
58 cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
60 ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
61 cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
64 ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
65 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
66 DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
67 cnst2_m, cnst3_m, cnst1_m, out1, out6, \
70 SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
71 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
73 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
74 ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
75 out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
76 out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
77 out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
78 out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
/*
 * VP9_SET_COSPI_PAIR(c0_h, c1_h)
 *
 * Expression-style macro (it opens a GCC statement expression and is used
 * on the right-hand side of assignments elsewhere in this header).
 *
 * It broadcasts c0_h into every halfword lane of r0_m and c1_h into every
 * lane of r1_m with __msa_fill_h, then combines them with
 * __msa_ilvev_h(r1_m, r0_m) into out0_m. The result is presumably a v8i16
 * with c0_h in the even lanes and c1_h in the odd lanes, i.e. a constant
 * pair laid out for a halfword dot product -- confirm against the MSA
 * ILVEV.H definition.
 */
85 #define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \
86 v8i16 out0_m, r0_m, r1_m; \
88 r0_m = __msa_fill_h(c0_h); \
89 r1_m = __msa_fill_h(c1_h); \
90 out0_m = __msa_ilvev_h(r1_m, r0_m); \
/*
 * VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)
 *
 * Adds four vectors of 16-bit values (in0..in3) to four rows of bytes read
 * from dst and writes the result back to dst.
 *
 * Visible steps:
 *  - dst is cast to uint8_t * and four rows, dst_stride apart, are loaded
 *    with LD_UB4.
 *  - Each row is interleaved with a zero vector (ILVR_B4_SH) into
 *    res0_m..res3_m, which presumably zero-extends the low 8 bytes of each
 *    row to 16 bits.
 *  - in0..in3 are added (ADD4) and the sums go through CLIP_SH4_0_255
 *    (by its name, a clamp to the 0..255 range).
 *  - PCKEV_B2_SB packs the four results into two byte vectors, which
 *    ST8x4_UB stores back to dst using dst_stride.
 *
 * dst is evaluated once (copied into dst_m); dst_stride is expanded in two
 * places.
 */
95 #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \
96 uint8_t *dst_m = (uint8_t *) (dst); \
97 v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
98 v16i8 tmp0_m, tmp1_m; \
99 v16i8 zero_m = { 0 }; \
100 v8i16 res0_m, res1_m, res2_m, res3_m; \
102 LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
103 ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \
104 zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \
105 ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \
106 res0_m, res1_m, res2_m, res3_m); \
107 CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
108 PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
109 ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
/*
 * VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)
 *
 * Judging by the name, one pass of a 4-point inverse DCT on four v8i16
 * vectors.
 *
 * Visible steps:
 *  - in0/in2 are interleaved (__msa_ilvr_h) and dotted with the pairs
 *    (cospi_16_64, cospi_16_64) and (cospi_16_64, -cospi_16_64), giving
 *    tmp0_m and tmp1_m.
 *  - in1/in3 are interleaved and dotted with (cospi_24_64, -cospi_8_64)
 *    and (cospi_8_64, cospi_24_64), giving tmp2_m and tmp3_m.
 *  - All four 32-bit results are shifted by DCT_CONST_BITS with
 *    SRARI_W4_SW (by its name, a rounding arithmetic right shift), then
 *    packed back to halfwords with PCKEV_H2_SW.
 *  - SLDI_B2_0_SW with a byte count of 8 derives tmp1_m/tmp3_m from
 *    tmp0_m/tmp2_m, and BUTTERFLY_4 writes out0..out3.
 *
 * NOTE(review): only the "right" interleave is used, so presumably only
 * the low four lanes of each input take part -- confirm with callers.
 */
112 #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
113 v8i16 c0_m, c1_m, c2_m, c3_m; \
114 v8i16 step0_m, step1_m; \
115 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
117 c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
118 c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
119 step0_m = __msa_ilvr_h(in2, in0); \
120 DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
122 c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
123 c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
124 step1_m = __msa_ilvr_h(in3, in1); \
125 DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
126 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
128 PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
129 SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
130 BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \
131 (v8i16)tmp2_m, (v8i16)tmp3_m, \
132 out0, out1, out2, out3); \
/*
 * VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)
 *
 * Judging by the name, one pass of a 4-point inverse ADST on four v8i16
 * vectors.
 *
 * Visible structure:
 *  - mask_m is a table of sinpi_1_9..sinpi_4_9 and their negations; lanes
 *    are splatted from it and pair-interleaved to form the dot-product
 *    constants (c0_m, c1_m, k1_m..k4_m).
 *  - Input pairs are interleaved with ILVR_H2_SH / __msa_ilvr_h and dotted
 *    against those constants; the 32-bit partial sums are combined into
 *    the accumulators int0_m..int3_m.
 *  - The accumulators are shifted by DCT_CONST_BITS with SRARI_W4_SW
 *    (by its name, a rounding arithmetic right shift) and packed to
 *    halfwords into out0..out3 with PCKEV_H2_SH.
 *
 * NOTE(review): the statement order matters here -- tmp2_m from the first
 * dot product is reused when forming int3_m, and c0_m/c1_m are rebuilt
 * between stages -- so do not reorder without re-deriving the math.
 */
135 #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
136 v8i16 res0_m, res1_m, c0_m, c1_m; \
137 v8i16 k1_m, k2_m, k3_m, k4_m; \
138 v8i16 zero_m = { 0 }; \
139 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
140 v4i32 int0_m, int1_m, int2_m, int3_m; \
141 v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \
142 sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \
145 SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
146 ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
147 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
148 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
149 int0_m = tmp2_m + tmp1_m; \
151 SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
152 ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
153 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
154 int1_m = tmp0_m + tmp1_m; \
156 c0_m = __msa_splati_h(mask_m, 6); \
157 ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
158 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
159 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
160 int2_m = tmp0_m + tmp1_m; \
162 c0_m = __msa_splati_h(mask_m, 6); \
163 c0_m = __msa_ilvev_h(c0_m, k1_m); \
165 res0_m = __msa_ilvr_h((in1), (in3)); \
166 tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
167 int3_m = tmp2_m + tmp0_m; \
169 res0_m = __msa_ilvr_h((in2), (in3)); \
170 c1_m = __msa_ilvev_h(k4_m, k3_m); \
172 tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
173 res1_m = __msa_ilvr_h((in0), (in2)); \
174 c1_m = __msa_ilvev_h(k1_m, zero_m); \
176 tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
180 SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
181 PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
182 PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
/*
 * VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)
 *
 * Expression-style macro (it opens a GCC statement expression and is used
 * on the right-hand side of assignments elsewhere in this header).
 *
 * It splats lane idx1_h of mask_h into c0_m and lane idx2_h into c1_m
 * (SPLATI_H2_SH), then merges them with __msa_ilvev_h(c1_m, c0_m). The
 * result is presumably a v8i16 alternating mask_h[idx1_h] (even lanes) and
 * mask_h[idx2_h] (odd lanes) -- confirm against the MSA ILVEV.H
 * definition. It is the table-driven counterpart of VP9_SET_COSPI_PAIR.
 */
185 #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \
188 SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
189 c0_m = __msa_ilvev_h(c1_m, c0_m); \
194 /* multiply and add macro */
/*
 * VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,
 *          out0, out1, out2, out3)
 *
 * Visible steps:
 *  - inp0/inp1 are interleaved (ILVRL_H2_SH) into madd_s1_m/madd_s0_m and
 *    inp2/inp3 into madd_s3_m/madd_s2_m.
 *  - The (inp0,inp1) pair is dotted with cst0 and cst1 (DOTP_SH4_SW),
 *    shifted by DCT_CONST_BITS with SRARI_W4_SW (by its name, a rounding
 *    arithmetic right shift) and packed to halfwords: out0 uses cst0,
 *    out1 uses cst1.
 *  - The (inp2,inp3) pair is processed the same way with cst2 and cst3,
 *    giving out2 and out3.
 *
 * Both interleaves happen before any output is written, so the outputs may
 * name the same variables as the inputs (VP9_IDCT8x8_1D in this header
 * calls it that way).
 */
195 #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
196 out0, out1, out2, out3) { \
197 v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
198 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
200 ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
201 ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
202 DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \
203 cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
204 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
205 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
206 DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \
207 cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
208 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
209 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
/*
 * VP9_IDCT8x8_1D(in0..in7, out0..out7)
 *
 * Judging by the name, one 1-D pass of an 8-point inverse DCT over eight
 * v8i16 vectors.
 *
 * Visible structure:
 *  - mask_m is a table of cospi constants; VP9_SET_CONST_PAIR picks lane
 *    pairs from it to build k0_m..k3_m.
 *  - Odd-indexed inputs: VP9_MADD is applied to in1, in7, in3, in5 and its
 *    results are written back into in1, in7, in3, in5. Their differences
 *    (SUB2 -> res0_m, res1_m) are then interleaved, dotted with constants
 *    built from lanes 4 and 7 of mask_m, shifted by DCT_CONST_BITS with
 *    SRARI_W4_SW and packed into tp5_m and tp6_m.
 *  - Even-indexed inputs: VP9_MADD is applied to in0, in4, in2, in6 (again
 *    written back in place), followed by BUTTERFLY_4 into tp0_m..tp3_m.
 *  - BUTTERFLY_8 combines tp0_m..tp7_m into out0..out7.
 *
 * The in* arguments are used as scratch and are overwritten, so callers
 * must pass modifiable lvalues and must not reuse the inputs afterwards.
 */
213 #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
214 out0, out1, out2, out3, out4, out5, out6, out7) { \
215 v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
216 v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
217 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
218 v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
219 cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
221 k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
222 k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
223 k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
224 k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
225 VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
226 SUB2(in1, in3, in7, in5, res0_m, res1_m); \
227 k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
228 k1_m = __msa_splati_h(mask_m, 4); \
230 ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
231 DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
232 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
233 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
235 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
237 k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
238 k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
239 VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \
240 in0, in4, in2, in6); \
241 BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
242 BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \
243 out0, out1, out2, out3, out4, out5, out6, out7); \
/*
 * VP9_IADST8x8_1D(in0..in7, out0..out7)
 *
 * Judging by the name, one 1-D pass of an 8-point inverse ADST over eight
 * v8i16 vectors.
 *
 * Visible structure:
 *  - mask1_m, mask2_m and mask3_m are tables of cospi constants;
 *    VP9_SET_CONST_PAIR picks lane pairs from them to build k0_m/k1_m
 *    before each dot-product group.
 *  - Stage 1: the pairs (in0,in1), (in4,in5), (in2,in3) and (in6,in7) are
 *    interleaved and dotted into 32-bit vectors r0_m..r7_m. Sums and
 *    differences (ADD4 / SUB4) are shifted by DCT_CONST_BITS with
 *    SRARI_W4_SW and packed to halfwords (res0_m..res3_m, t0_m/t1_m and
 *    r2_m/r3_m).
 *  - BUTTERFLY_4 on res0_m..res3_m writes out0 and reuses in7, in4, in3 as
 *    temporaries.
 *  - Stage 2: further dot products on (t0_m,t1_m) and the re-interleaved
 *    r2_m/r3_m produce in1, out6, in2 and in5.
 *  - Stage 3: constants built from cospi_16_64 / -cospi_16_64 (lanes 2 and
 *    3 of mask3_m) are applied to (in3,in4) and (in2,in5), producing in3,
 *    out4, out2 and in5.
 *
 * The in* arguments double as scratch storage and are overwritten, so
 * callers must pass modifiable lvalues. The statements visible here assign
 * out0, out2, out4 and out6 directly; the statement order is significant
 * because r*_m, m*_m and the in* temporaries are reused between stages.
 */
246 #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
247 out0, out1, out2, out3, out4, out5, out6, out7) { \
248 v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
249 v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
250 v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
251 v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \
252 cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
253 v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \
254 cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
255 v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \
256 -cospi_16_64, 0, 0, 0, 0 }; \
258 k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
259 k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
260 ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
261 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
262 r0_m, r1_m, r2_m, r3_m); \
263 k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
264 k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
265 ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
266 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
267 r4_m, r5_m, r6_m, r7_m); \
268 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
269 m0_m, m1_m, m2_m, m3_m); \
270 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
271 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
272 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
273 m0_m, m1_m, m2_m, m3_m); \
274 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
275 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
276 k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
277 k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
278 ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
279 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
280 r0_m, r1_m, r2_m, r3_m); \
281 k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
282 k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
283 ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
284 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
285 r4_m, r5_m, r6_m, r7_m); \
286 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
287 m0_m, m1_m, m2_m, m3_m); \
288 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
289 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
290 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
291 m0_m, m1_m, m2_m, m3_m); \
292 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
293 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
294 ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
295 BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
296 k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
297 k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
298 ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
299 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
300 r0_m, r1_m, r2_m, r3_m); \
301 k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
302 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
303 r4_m, r5_m, r6_m, r7_m); \
304 ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
305 m0_m, m1_m, m2_m, m3_m); \
306 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
307 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
308 SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
309 m0_m, m1_m, m2_m, m3_m); \
310 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
311 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
312 k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
313 k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
314 ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
315 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
316 m0_m, m1_m, m2_m, m3_m); \
317 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
318 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
319 ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
320 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
321 m0_m, m1_m, m2_m, m3_m); \
322 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
323 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
/*
 * VP9_IADST8x16_1D(r0..r15, out0..out15)
 *
 * Judging by the name, one 1-D pass of a 16-point inverse ADST over sixteen
 * v8i16 vectors.
 *
 * Visible structure:
 *  - Stage 1: four MADD_BF calls on the input quadruples
 *    (r15,r0,r7,r8), (r13,r2,r5,r10), (r11,r4,r3,r12) and (r9,r6,r1,r14),
 *    each with constant pairs built from odd-indexed cospi values
 *    (cospi_1_64 .. cospi_31_64), fill g0_m..g15_m.
 *  - Stage 2: two MADD_BF calls on the odd-numbered g vectors, with
 *    cospi_4/28 and cospi_12/20 pairs, fill h0_m..h7_m. BUTTERFLY_4 then
 *    writes out8..out11, and BUTTERFLY_8 on the even-numbered g vectors
 *    fills h8_m..h11_m and overwrites h6_m, h4_m, h2_m, h0_m.
 *  - Stage 3: BUTTERFLY_4 writes out0/out1 (and rewrites h11_m/h10_m); two
 *    MADD_BF calls with cospi_8/24 pairs write out4..out7 and
 *    out12..out15.
 *  - Stage 4: MADD_SHORT with +/-cospi_16_64 pairs writes out2/out3 from
 *    h10_m/h11_m and updates out6/out7, out10/out11 and out14/out15 in
 *    place.
 *
 * Several h*_m temporaries are deliberately overwritten and reused between
 * stages, so the statement order is significant. MADD_BF, MADD_SHORT and
 * the BUTTERFLY_* helpers are defined outside this header.
 */
331 #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \
332 r9, r10, r11, r12, r13, r14, r15, \
333 out0, out1, out2, out3, out4, out5, \
334 out6, out7, out8, out9, out10, out11, \
335 out12, out13, out14, out15) { \
336 v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
337 v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
338 v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
339 v8i16 h8_m, h9_m, h10_m, h11_m; \
340 v8i16 k0_m, k1_m, k2_m, k3_m; \
343 k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
344 k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
345 k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
346 k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
347 MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
348 g0_m, g1_m, g2_m, g3_m); \
349 k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
350 k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
351 k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
352 k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
353 MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
354 g4_m, g5_m, g6_m, g7_m); \
355 k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
356 k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
357 k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
358 k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
359 MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
360 g8_m, g9_m, g10_m, g11_m); \
361 k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
362 k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
363 k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
364 k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
365 MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
366 g12_m, g13_m, g14_m, g15_m); \
369 k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
370 k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
371 k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
372 MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
373 h0_m, h1_m, h2_m, h3_m); \
374 k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
375 k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
376 k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
377 MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
378 h4_m, h5_m, h6_m, h7_m); \
379 BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
380 BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
381 h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
384 BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
385 k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
386 k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
387 k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
388 MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
389 out4, out6, out5, out7); \
390 MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
391 out12, out14, out13, out15); \
394 k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
395 k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
396 k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
397 k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
398 MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
399 MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
400 MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
401 MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
/*
 * Prototypes for the 16-point 1-D inverse transform helpers (IDCT and
 * IADST variants); the definitions live outside this header.
 *
 * The *_rows_msa functions take a const int16_t input buffer and write to a
 * separate int16_t output buffer. The *_columns_addblk_msa functions take a
 * non-const int16_t input and a uint8_t destination; going by the "addblk"
 * name they presumably add the transformed result to the pixels at dst --
 * confirm against the definitions.
 */
404 void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
406 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
407 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
409 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
410 #endif // VPX_DSP_MIPS_INV_TXFM_MSA_H_