2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
// Reconstruct one 4-pixel row: load 4 bytes from |dest|, widen to 16 bits,
// add the residual row |in_x|, saturate back to 8 bits, and store the 4
// result bytes to |dest|.
// NOTE(review): expects an all-zero __m128i named |zero| in scope at the
// expansion site; braces / any |dest| advance fall on lines not visible in
// this excerpt -- confirm against the full macro definition.
#define RECON_AND_STORE4X4(dest, in_x) \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); /* load 4 pixels */ \
  d0 = _mm_unpacklo_epi8(d0, zero); /* bytes -> 16-bit lanes */ \
  d0 = _mm_add_epi16(in_x, d0); /* add residual */ \
  d0 = _mm_packus_epi16(d0, d0); /* saturate to 8-bit */ \
  *(int *)dest = _mm_cvtsi128_si32(d0); /* store 4 pixels */ \
// Full inverse 4x4 DCT (up to 16 non-zero coefficients) followed by
// reconstruction: the residual is added to the 4x4 pixel block at |dest|
// (row pitch |stride|) with unsigned 8-bit saturation.
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);  // rounding bias for the final >> 4
  // Cosine constants interleaved so that one pmaddwd produces one butterfly
  // output per 32-bit lane.
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows: two 4-coefficient rows per 128-bit load.
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  // Duplicate each coefficient pair so it can be multiplied by both halves
  // of |cst|.
  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Pass 1 butterflies: dot products against the interleaved constants.
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  // Fixed-point rounding ...
  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  // ... and descale back by DCT_CONST_BITS.
  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Narrow the 32-bit results back to 16 bits (with saturation).
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose so pass 2 operates on columns.
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column2, column 3, and then, we got:
  // input2: column1, column 0; input3: column2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Pass 2 butterflies (same constant layout as pass 1).
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Note the crossed pairing (0,2)/(1,3) versus pass 1's (0,1)/(2,3).
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose back to row order.
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column2, column 3, and then, we got:
  // input2: column1, column 0; input3: column2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  // Gather the four 4-byte destination rows into two registers.
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
  __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
  d0 = _mm_unpacklo_epi32(d0,
                          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
  // d2 is assembled with row 3 in the LOW half (operands reversed) ...
  d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
      *(const int *) (dest + stride * 3)), d2);
  d0 = _mm_unpacklo_epi8(d0, zero);
  d2 = _mm_unpacklo_epi8(d2, zero);
  d0 = _mm_add_epi16(d0, input2);
  d2 = _mm_add_epi16(d2, input3);
  d0 = _mm_packus_epi16(d0, d2);
  // ... which is why the stores below run in row order 0, 1, 3, 2.
  *(int *)dest = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
// DC-only inverse 4x4 transform + reconstruction: only input[0] contributes,
// so the whole residual block is a single constant value.
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // NOTE(review): the declarations of |a| and |dc_value| fall on lines not
  // visible in this excerpt.
  // Both 1-D passes collapse to a multiply by cospi_16_64 for the DC term.
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);  // final rounding >> 4, matching the full idct

  dc_value = _mm_set1_epi16(a);

  // One macro expansion per row; presumably RECON_AND_STORE4X4 advances
  // |dest| by |stride| between rows -- TODO(review): confirm against the
  // full macro definition.
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
// In-place partial transpose of a 4x4 block of 16-bit values held as two
// registers (res[0] = rows 0-1, res[1] = rows 2-3), done with two 16-bit
// interleave stages.
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
// One 1-D 4-point inverse DCT pass over the two-register 4x4 block |in|
// (in[0] = rows 0-1, in[1] = rows 2-3); used by the 4x4 hybrid transform.
static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // NOTE(review): declarations of the u[]/v[] scratch arrays (and a
  // transpose call, if any) fall on lines not visible in this excerpt.

  // Interleave rows and compute the even/odd butterflies via pmaddwd.
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  // Fixed-point round ...
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  // ... and descale.
  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  // Pack to 16 bits; note v[3]/v[2] are swapped so the subsequent shuffle
  // lines the halves up for the add/sub stage.
  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // Output butterfly.
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
// One 1-D 4-point inverse ADST pass over the two-register 4x4 block |in|,
// using the sinpi_*_9 constants; used by the 4x4 hybrid transform.
static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  // in7 = x0 + x2 - x3 (per 16-bit lane), built from the rows in in[0]/in[1].
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  // Combine partial sums into the four outputs.
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);

  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);  // = u[3] - 3*v[5]

  // Fixed-point round and descale the four 32-bit lanes per output.
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  // Back to 16-bit rows: in[0] = rows 0-1, in[1] = rows 2-3.
  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
// 4x4 hybrid inverse transform (DCT/ADST combinations) + reconstruction.
// NOTE(review): the tx_type parameter continuation line and the dispatch to
// idct4_sse2/iadst4_sse2 fall on lines not visible in this excerpt.
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);  // rounding bias for the final >> 4

  // Unaligned loads here (vs. _mm_load_si128 in vp9_idct4x4_16_add_sse2).
  in[0]= _mm_loadu_si128((const __m128i *)(input));
  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  // Gather the four 4-byte destination rows (here in natural row order,
  // unlike the idct4x4 path), add the residual, and saturate to 8 bits.
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
  __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
  d0 = _mm_unpacklo_epi32(d0,
                          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
  d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
      *(const int *) (dest + stride * 3)));
  d0 = _mm_unpacklo_epi8(d0, zero);
  d2 = _mm_unpacklo_epi8(d2, zero);
  d0 = _mm_add_epi16(d0, in[0]);
  d2 = _mm_add_epi16(d2, in[1]);
  d0 = _mm_packus_epi16(d0, d2);
  // Store rows 0..3 in order, shifting the next row into the low 4 bytes.
  *(int *)dest = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);

  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
// Transposes an 8x8 block of 16-bit values (rows in0..in7 -> columns
// out0..out7) via three interleave stages: 16-bit, 32-bit, then 64-bit.
// Callers may pass the same registers for in* and out* (see uses below).
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  /* stage 1: interleave adjacent rows at 16-bit granularity */ \
  const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
  const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
  const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
  const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
  const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
  const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
  const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
  /* stage 2: interleave at 32-bit granularity */ \
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
  /* stage 3: combine 64-bit halves into the transposed rows */ \
  out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
  out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
  out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
  out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
  out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
  out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
  out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
// Transpose variant for sparse (few non-zero coefficient) cases: produces
// four output rows from tmp0..tmp3. NOTE(review): the operand order in
// tr0_1/tr0_5 is deliberately reversed relative to TRANSPOSE_8X8 --
// presumably to pre-arrange lanes for the following butterfly stage;
// confirm against the call sites before changing.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); /* swapped */ \
  const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
  const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); /* swapped */ \
  /* 32-bit interleave stage */ \
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
  /* 64-bit combine into the four transposed rows */ \
  out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
  out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
  out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
  out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
// Reduced transpose for the case where only the first four 16-bit lanes of
// in0..in3 are significant: two 16-bit interleaves followed by one 32-bit
// interleave yield the transposed data in out0/out1.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
  out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
  out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
386 // Define Macro for multiplying elements by constants and adding them together.
387 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
388 cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
390 tmp0 = _mm_madd_epi16(lo_0, cst0); \
391 tmp1 = _mm_madd_epi16(hi_0, cst0); \
392 tmp2 = _mm_madd_epi16(lo_0, cst1); \
393 tmp3 = _mm_madd_epi16(hi_0, cst1); \
394 tmp4 = _mm_madd_epi16(lo_1, cst2); \
395 tmp5 = _mm_madd_epi16(hi_1, cst2); \
396 tmp6 = _mm_madd_epi16(lo_1, cst3); \
397 tmp7 = _mm_madd_epi16(hi_1, cst3); \
399 tmp0 = _mm_add_epi32(tmp0, rounding); \
400 tmp1 = _mm_add_epi32(tmp1, rounding); \
401 tmp2 = _mm_add_epi32(tmp2, rounding); \
402 tmp3 = _mm_add_epi32(tmp3, rounding); \
403 tmp4 = _mm_add_epi32(tmp4, rounding); \
404 tmp5 = _mm_add_epi32(tmp5, rounding); \
405 tmp6 = _mm_add_epi32(tmp6, rounding); \
406 tmp7 = _mm_add_epi32(tmp7, rounding); \
408 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
409 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
410 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
411 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
412 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
413 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
414 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
415 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
417 res0 = _mm_packs_epi32(tmp0, tmp1); \
418 res1 = _mm_packs_epi32(tmp2, tmp3); \
419 res2 = _mm_packs_epi32(tmp4, tmp5); \
420 res3 = _mm_packs_epi32(tmp6, tmp7); \
// Two-output variant of MULTIPLICATION_AND_ADD: one input pair, two constant
// vectors, two rounded/descaled/packed results. Uses tmp0..tmp3 and
// |rounding| from the expansion site.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  tmp0 = _mm_madd_epi16(lo_0, cst0); \
  tmp1 = _mm_madd_epi16(hi_0, cst0); \
  tmp2 = _mm_madd_epi16(lo_0, cst1); \
  tmp3 = _mm_madd_epi16(hi_0, cst1); \
  /* fixed-point rounding */ \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  /* descale */ \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  /* pack back to 16 bits */ \
  res0 = _mm_packs_epi32(tmp0, tmp1); \
  res1 = _mm_packs_epi32(tmp2, tmp3); \
// 4-stage 8-point 1-D inverse DCT on the columns of in0..in7, writing
// out0..out7. NOTE(review): relies on stg1_0..stg1_3, stg2_0..stg2_3,
// |rounding|, tmp0..tmp3 and the stp1_*/stp2_* scratch registers being
// declared at the expansion site; some enclosing braces are on lines not
// visible in this excerpt.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  /* stage 1: odd-coefficient butterflies (rows 1,7,3,5) */ \
  const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
  const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
  const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
  const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
  \
  MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                         stg1_1, stg1_2, stg1_3, stp1_4, \
                         stp1_7, stp1_5, stp1_6) \
  \
  /* stage 2: even-coefficient butterflies (rows 0,4,2,6) */ \
  const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
  const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
  const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
  const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
  \
  MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                         stg2_1, stg2_2, stg2_3, stp2_0, \
                         stp2_1, stp2_2, stp2_3) \
  /* odd-path add/sub with 16-bit saturation */ \
  stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  \
  /* stage 3: even add/sub plus the (stp2_6, stp2_5) rotation */ \
  const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  \
  stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
  tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
  tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
  tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  /* stage 4: final output butterflies */ \
  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
// Full inverse 8x8 DCT (up to 64 non-zero coefficients) + reconstruction:
// two 1-D passes (rows then columns, each preceded by a transpose), then
// final rounding and an add-to-destination store.
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);  // bias for final >> 5
  // Stage constants consumed by the IDCT8 macro expansion below.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  // NOTE(review): the declaration of loop counter |i| falls on a line not
  // visible in this excerpt.

  // Load all eight 8-coefficient rows.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // Two passes: rows, then (after the in-loop transpose) columns.
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  // NOTE(review): the loop's closing brace falls on a line not visible in
  // this excerpt.

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  // Add residual to destination rows; presumably RECON_AND_STORE advances
  // |dest| between rows -- TODO(review): confirm against its definition.
  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
// DC-only inverse 8x8 transform + reconstruction: only input[0] contributes.
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // NOTE(review): the declarations of |a| and |dc_value| fall on lines not
  // visible in this excerpt.
  // Both 1-D passes collapse to a multiply by cospi_16_64 for the DC term;
  // the final shift is 5 for 8x8 (vs. 4 for 4x4).
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  // One expansion per row of the 8x8 block.
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
// One 1-D 8-point inverse DCT pass over the 8x8 block in[0..7]: transposes,
// then applies the 4-stage IDCT8 macro, writing results back into in[].
// Used by the 8x8 hybrid transform.
static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Stage constants consumed by the IDCT8 macro expansion below.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
// One 1-D 8-point inverse ADST pass over the 8x8 block in[0..7], written
// back into in[]. Structure: transpose, first butterfly stage against the
// odd cospi constants, second stage against +-cospi_8/24, a final
// half-butterfly via cospi_16, then sign fix-up of the odd outputs.
static void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  // NOTE(review): the assignments of in0..in7 from in[] fall on lines not
  // visible in this excerpt.

  // column transformation

  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  // stage 1: dot products against the odd cospi constant pairs
  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition / subtraction butterfly in 32-bit precision
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2: first four terms need only add/sub; last four need +-cospi_8/24
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3: rotate (s2,s3) and (s6,s7) by cospi_16
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // Negate the odd-index outputs (0 - x). NOTE(review): the even-index
  // assignments in[0]/in[2]/in[4]/in[6] fall on lines not visible in this
  // excerpt.
  in[1] = _mm_sub_epi16(k__const_0, s4);

  in[3] = _mm_sub_epi16(k__const_0, s2);

  in[5] = _mm_sub_epi16(k__const_0, s7);

  in[7] = _mm_sub_epi16(k__const_0, s1);
// 8x8 hybrid inverse transform (DCT/ADST combinations) + reconstruction.
// NOTE(review): the tx_type parameter continuation line, the declaration of
// in[], and the dispatch to idct8_sse2/iadst8_sse2 fall on lines not
// visible in this excerpt.
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1<<4);  // bias for final >> 5

  // Load all eight 8-coefficient rows.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  // Add residual to destination rows; presumably RECON_AND_STORE advances
  // |dest| between rows -- TODO(review): confirm against its definition.
  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
// 8x8 inverse DCT fast path for sparse blocks: only the low-frequency
// coefficients (stored in the top-left 4x4 of the 8x8 block) are non-zero,
// so the row pass is computed on 4 rows only, then a full IDCT8 column
// pass reconstructs into dest.
// NOTE(review): numbered extract with holes (e.g. orig 940, 945, 951-952,
// 954-955, 975-978, 1005-1008, 1026-1028, 1033-1035, 1065); code lines
// are kept byte-identical.
927 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
928 const __m128i zero = _mm_setzero_si128();
929 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
// 1<<4 bias implements round-to-nearest for the final >>5.
930 const __m128i final_rounding = _mm_set1_epi16(1<<4);
// Stage-1 butterfly constants (odd inputs 1/7 and 5/3 paths).
931 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
932 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
933 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
934 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
// Stage-2 butterfly constants (even inputs 0/4 and 2/6 paths).
935 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
936 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
937 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
938 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
939 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
941 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
942 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
943 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
944 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
946 // Rows. Load 4-row input data.
947 in0 = _mm_load_si128((const __m128i *)input);
948 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
949 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
950 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// Transpose the 4x8 top half; after this in0/in1 pack the transposed
// coefficient columns (see TRANSPOSE_8X8_10 defined elsewhere).
953 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
// Stage 1 (odd half). The partner coefficients are known-zero, hence
// unpacking against `zero` instead of a second register.
956 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
957 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
959 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
960 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
961 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
962 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
// dct_const_round_shift in vector form: +rounding, >>DCT_CONST_BITS.
964 tmp0 = _mm_add_epi32(tmp0, rounding);
965 tmp2 = _mm_add_epi32(tmp2, rounding);
966 tmp4 = _mm_add_epi32(tmp4, rounding);
967 tmp6 = _mm_add_epi32(tmp6, rounding);
968 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
969 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
970 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
971 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
// Pack 32-bit results back to saturated 16-bit stage-1 outputs.
973 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
974 stp1_5 = _mm_packs_epi32(tmp4, tmp6);
// Stage 2 (even half), again with zero partners.
979 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
980 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
982 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
983 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
984 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
985 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
987 tmp0 = _mm_add_epi32(tmp0, rounding);
988 tmp2 = _mm_add_epi32(tmp2, rounding);
989 tmp4 = _mm_add_epi32(tmp4, rounding);
990 tmp6 = _mm_add_epi32(tmp6, rounding);
991 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
992 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
993 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
994 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
996 stp2_0 = _mm_packs_epi32(tmp0, tmp2);
997 stp2_2 = _mm_packs_epi32(tmp6, tmp4);
// Odd-half add/sub butterfly; results split across 64-bit halves below.
999 tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
1000 tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
1003 stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
1004 stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
// Stage 3: even butterfly plus cospi_16 rotation of the (5,6) pair.
1009 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1011 tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
1012 tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
1014 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
1015 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
1017 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1018 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
1020 tmp0 = _mm_add_epi32(tmp0, rounding);
1021 tmp2 = _mm_add_epi32(tmp2, rounding);
1022 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1023 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1025 stp1_5 = _mm_packs_epi32(tmp0, tmp2);
// Stage 4: combine even and odd halves into the 4 row-pass outputs.
1029 tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
1030 tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
1031 tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
1032 tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
// Columns: transpose back, then run a full IDCT8 with the upper four
// inputs zero (only 4 rows were significant).
1034 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
1036 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
1037 in0, in1, in2, in3, in4, in5, in6, in7);
1038 // Final rounding and shift
// (x + 16) >> 5 to undo the transform's scaling.
1039 in0 = _mm_adds_epi16(in0, final_rounding);
1040 in1 = _mm_adds_epi16(in1, final_rounding);
1041 in2 = _mm_adds_epi16(in2, final_rounding);
1042 in3 = _mm_adds_epi16(in3, final_rounding);
1043 in4 = _mm_adds_epi16(in4, final_rounding);
1044 in5 = _mm_adds_epi16(in5, final_rounding);
1045 in6 = _mm_adds_epi16(in6, final_rounding);
1046 in7 = _mm_adds_epi16(in7, final_rounding);
1048 in0 = _mm_srai_epi16(in0, 5);
1049 in1 = _mm_srai_epi16(in1, 5);
1050 in2 = _mm_srai_epi16(in2, 5);
1051 in3 = _mm_srai_epi16(in3, 5);
1052 in4 = _mm_srai_epi16(in4, 5);
1053 in5 = _mm_srai_epi16(in5, 5);
1054 in6 = _mm_srai_epi16(in6, 5);
1055 in7 = _mm_srai_epi16(in7, 5);
// Add residual to prediction and store, one 8-pixel row per call.
1057 RECON_AND_STORE(dest, in0);
1058 RECON_AND_STORE(dest, in1);
1059 RECON_AND_STORE(dest, in2);
1060 RECON_AND_STORE(dest, in3);
1061 RECON_AND_STORE(dest, in4);
1062 RECON_AND_STORE(dest, in5);
1063 RECON_AND_STORE(dest, in6);
1064 RECON_AND_STORE(dest, in7);
/* Interior of the multi-line IDCT16 macro: the #define header and the   */ \
/* brace lines separating its stages fall in gaps of this numbered       */ \
/* extract. All code lines are kept byte-identical; every added comment  */ \
/* ends with a backslash so the macro continuation chain is preserved.   */ \
/* Stage 2: odd inputs (1,15 / 9,7 / 5,11 / 13,3) through butterflies.   */ \
1070     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
1071     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
1072     const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
1073     const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
1074     const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
1075     const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
1076     const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
1077     const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
1079     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1080                            stg2_0, stg2_1, stg2_2, stg2_3, \
1081                            stp2_8, stp2_15, stp2_9, stp2_14) \
1083     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1084                            stg2_4, stg2_5, stg2_6, stg2_7, \
1085                            stp2_10, stp2_13, stp2_11, stp2_12) \
/* Stage 3: even inputs 2,14 / 10,6 rotated; odd add/sub butterflies.    */ \
1090     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
1091     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
1092     const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
1093     const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
1095     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1096                            stg3_0, stg3_1, stg3_2, stg3_3, \
1097                            stp1_4, stp1_7, stp1_5, stp1_6) \
1099     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
1100     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
1101     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1102     stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1104     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1105     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1106     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1107     stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
/* Stage 4: DC/4-12 rotations, even butterflies, 9-14/10-13 rotations.   */ \
1112     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
1113     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
1114     const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
1115     const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
1117     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1118     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1119     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1120     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1122     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1123                            stg4_0, stg4_1, stg4_2, stg4_3, \
1124                            stp2_0, stp2_1, stp2_2, stp2_3) \
1126     stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1127     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1128     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1129     stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1131     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1132                            stg4_4, stg4_5, stg4_6, stg4_7, \
1133                            stp2_9, stp2_14, stp2_10, stp2_13) \
/* Stage 5: even butterflies; cospi_16 rotation of (5,6) done inline.    */ \
1138     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1139     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1141     stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1142     stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1143     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1144     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1146     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1147     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1148     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1149     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
/* dct_const_round_shift in vector form: +rounding, >>DCT_CONST_BITS.    */ \
1151     tmp0 = _mm_add_epi32(tmp0, rounding); \
1152     tmp1 = _mm_add_epi32(tmp1, rounding); \
1153     tmp2 = _mm_add_epi32(tmp2, rounding); \
1154     tmp3 = _mm_add_epi32(tmp3, rounding); \
1156     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1157     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1158     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1159     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1161     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1162     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1164     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1165     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1166     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1167     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1169     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1170     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1171     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1172     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
/* Stage 6: final even/odd combination; 10-13 and 11-12 pairs rotated.   */ \
1177     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1178     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1179     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1180     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1182     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1183     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1184     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1185     stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1186     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1187     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1188     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1189     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1191     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1192                            stg6_0, stg4_0, stg6_0, stg4_0, \
1193                            stp2_10, stp2_13, stp2_11, stp2_12) \
/* Interior of the IDCT16_10 macro: the reduced 16-point IDCT for blocks */ \
/* whose inputs beyond index 3 are zero -- partner operands are unpacked */ \
/* against `zero` and several stage outputs collapse to copies. The      */ \
/* #define header is in a gap of this extract; code kept byte-identical, */ \
/* comments end with a backslash to preserve macro continuation.         */ \
/* Stage 2: only in[1] and in[3] contribute to the odd half.             */ \
1199     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1200     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1201     const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1202     const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1204     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1205                            stg2_0, stg2_1, stg2_6, stg2_7, \
1206                            stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
/* Stage 3: only in[2] contributes; odd butterflies reduce to copies.    */ \
1211     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1212     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1214     MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1218     stp1_9 = stp1_8_0; \
1219     stp1_10 = stp1_11;  \
1221     stp1_13 = stp1_12_0; \
1222     stp1_14 = stp1_15;   \
/* Stage 4: DC path from in[0]; 9-14 / 10-13 rotations as in IDCT16.     */ \
1227     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1228     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1230     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1231     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1232     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1233     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1235     MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1241     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1242                            stg4_4, stg4_5, stg4_6, stg4_7, \
1243                            stp2_9, stp2_14, stp2_10, stp2_13) \
/* Stage 5: cospi_16 rotation of (5,6) and the 8..15 butterflies.        */ \
1248     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1249     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1254     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1255     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1256     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1257     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
/* dct_const_round_shift in vector form: +rounding, >>DCT_CONST_BITS.    */ \
1259     tmp0 = _mm_add_epi32(tmp0, rounding); \
1260     tmp1 = _mm_add_epi32(tmp1, rounding); \
1261     tmp2 = _mm_add_epi32(tmp2, rounding); \
1262     tmp3 = _mm_add_epi32(tmp3, rounding); \
1264     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1265     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1266     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1267     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1269     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1270     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1272     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1273     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1274     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1275     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1277     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1278     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1279     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1280     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
/* Stage 6: identical to the full IDCT16's final stage.                  */ \
1285     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1286     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1287     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1288     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1290     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1291     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1292     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1293     stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1294     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1295     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1296     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1297     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1299     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1300                            stg6_0, stg4_0, stg6_0, stg4_0, \
1301                            stp2_10, stp2_13, stp2_11, stp2_12) \
// Full 16x16 inverse DCT (up to 256 non-zero coefficients) with
// reconstruction, SSE2. Two passes of the IDCT16 macro: a row pass over
// two 8-row halves buffered into l[]/r[], then a column pass over two
// 8-column halves written to dest.
// NOTE(review): numbered extract with holes -- among others, orig lines
// 1305 (rest of the signature: `int stride) {` presumably), 1342-1348,
// 1368-1371 and 1396-1399 (the IDCT16 invocations and curr1/input setup)
// and the function's closing braces are not visible. Code lines below are
// kept byte-identical.
1304 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1306 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
// 1<<5 bias implements round-to-nearest for the final >>6.
1307 const __m128i final_rounding = _mm_set1_epi16(1<<5);
1308 const __m128i zero = _mm_setzero_si128();
// Stage-2 rotation constants (odd inputs 1,15 / 9,7 / 5,11 / 13,3).
1310 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1311 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1312 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1313 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1314 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1315 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1316 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1317 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
// Stage-3 rotation constants (inputs 2,14 / 10,6).
1319 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1320 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1321 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1322 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
// Stage-4 constants (even DC/AC paths and the 9-14 / 10-13 rotations).
1324 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1325 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1326 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1327 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1328 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1329 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1330 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1331 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1333 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
// l[]/r[] buffer the row-pass results for the left/right 8-column halves;
// curr1 points at whichever of them the current row-pass half writes.
1335 __m128i in[16], l[16], r[16], *curr1;
1336 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1337         stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1338         stp1_8_0, stp1_12_0;
1339 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1340         stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1341 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Row pass: two iterations, one per 8-row half.
1345 for (i = 0; i < 2; i++) {
// Deinterleave: even source rows go to in[0..7], odd rows to in[8..15],
// so each group can be transposed as an 8x8 tile below.
1349 in[0] = _mm_load_si128((const __m128i *)input);
1350 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1351 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1352 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1353 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1354 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1355 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1356 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1357 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1358 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1359 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1360 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1361 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1362 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1363 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1364 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1366 array_transpose_8x8(in, in);
1367 array_transpose_8x8(in+8, in+8);
// (the IDCT16 invocation that fills stp1_*/stp2_* is in a gap here)
// Stage 7: final butterfly, storing the row-pass result to l[] or r[].
1372 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1373 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1374 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1375 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1376 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1377 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1378 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1379 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1380 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1381 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1382 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1383 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1384 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1385 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1386 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1387 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
// Column pass: two iterations, one per 8-column half.
1392 for (i = 0; i < 2; i++) {
// Gather this half's columns from the buffered row-pass results.
1394 array_transpose_8x8(l+i*8, in);
1395 array_transpose_8x8(r+i*8, in+8);
// (the column-pass IDCT16 invocation is in a gap here)
// Stage 7 again, this time into in[] for reconstruction.
1400 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1401 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1402 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1403 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1404 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1405 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1406 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1407 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1408 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1409 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1410 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1411 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1412 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1413 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1414 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1415 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1417 // Final rounding and shift
// (x + 32) >> 6 undoes the 16x16 transform's scaling.
1418 in[0] = _mm_adds_epi16(in[0], final_rounding);
1419 in[1] = _mm_adds_epi16(in[1], final_rounding);
1420 in[2] = _mm_adds_epi16(in[2], final_rounding);
1421 in[3] = _mm_adds_epi16(in[3], final_rounding);
1422 in[4] = _mm_adds_epi16(in[4], final_rounding);
1423 in[5] = _mm_adds_epi16(in[5], final_rounding);
1424 in[6] = _mm_adds_epi16(in[6], final_rounding);
1425 in[7] = _mm_adds_epi16(in[7], final_rounding);
1426 in[8] = _mm_adds_epi16(in[8], final_rounding);
1427 in[9] = _mm_adds_epi16(in[9], final_rounding);
1428 in[10] = _mm_adds_epi16(in[10], final_rounding);
1429 in[11] = _mm_adds_epi16(in[11], final_rounding);
1430 in[12] = _mm_adds_epi16(in[12], final_rounding);
1431 in[13] = _mm_adds_epi16(in[13], final_rounding);
1432 in[14] = _mm_adds_epi16(in[14], final_rounding);
1433 in[15] = _mm_adds_epi16(in[15], final_rounding);
1435 in[0] = _mm_srai_epi16(in[0], 6);
1436 in[1] = _mm_srai_epi16(in[1], 6);
1437 in[2] = _mm_srai_epi16(in[2], 6);
1438 in[3] = _mm_srai_epi16(in[3], 6);
1439 in[4] = _mm_srai_epi16(in[4], 6);
1440 in[5] = _mm_srai_epi16(in[5], 6);
1441 in[6] = _mm_srai_epi16(in[6], 6);
1442 in[7] = _mm_srai_epi16(in[7], 6);
1443 in[8] = _mm_srai_epi16(in[8], 6);
1444 in[9] = _mm_srai_epi16(in[9], 6);
1445 in[10] = _mm_srai_epi16(in[10], 6);
1446 in[11] = _mm_srai_epi16(in[11], 6);
1447 in[12] = _mm_srai_epi16(in[12], 6);
1448 in[13] = _mm_srai_epi16(in[13], 6);
1449 in[14] = _mm_srai_epi16(in[14], 6);
1450 in[15] = _mm_srai_epi16(in[15], 6);
// Reconstruct 16 rows of this 8-column strip.
1452 RECON_AND_STORE(dest, in[0]);
1453 RECON_AND_STORE(dest, in[1]);
1454 RECON_AND_STORE(dest, in[2]);
1455 RECON_AND_STORE(dest, in[3]);
1456 RECON_AND_STORE(dest, in[4]);
1457 RECON_AND_STORE(dest, in[5]);
1458 RECON_AND_STORE(dest, in[6]);
1459 RECON_AND_STORE(dest, in[7]);
1460 RECON_AND_STORE(dest, in[8]);
1461 RECON_AND_STORE(dest, in[9]);
1462 RECON_AND_STORE(dest, in[10]);
1463 RECON_AND_STORE(dest, in[11]);
1464 RECON_AND_STORE(dest, in[12]);
1465 RECON_AND_STORE(dest, in[13]);
1466 RECON_AND_STORE(dest, in[14]);
1467 RECON_AND_STORE(dest, in[15]);
// Undo the 16 stride advances performed by RECON_AND_STORE and move
// right by 8 pixels to the top of the next column strip.
1469 dest += 8 - (stride * 16);
// 16x16 inverse DCT fast path for DC-only blocks: every output pixel gets
// the same residual, so compute it once in scalar code and splat it.
// NOTE(review): the declarations of `a`, `dc_value` and `i` fall in gaps
// of this numbered extract (orig 1474, 1476-1477, 1481, 1483), as do the
// closing braces; code lines below are kept byte-identical.
1473 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1475 const __m128i zero = _mm_setzero_si128();
// Both 1-D passes collapse to a multiply by cospi_16_64 with the usual
// dct_const_round_shift; then (a + 32) >> 6 matches the pipeline's final
// rounding/shift stage.
1478 a = dct_const_round_shift(input[0] * cospi_16_64);
1479 a = dct_const_round_shift(a * cospi_16_64);
1480 a = ROUND_POWER_OF_TWO(a, 6);
// Broadcast the DC residual to all eight 16-bit lanes.
1482 dc_value = _mm_set1_epi16(a);
// Two 8-pixel-wide column strips; 16 rows reconstructed per strip.
1484 for (i = 0; i < 2; ++i) {
1485 RECON_AND_STORE(dest, dc_value);
1486 RECON_AND_STORE(dest, dc_value);
1487 RECON_AND_STORE(dest, dc_value);
1488 RECON_AND_STORE(dest, dc_value);
1489 RECON_AND_STORE(dest, dc_value);
1490 RECON_AND_STORE(dest, dc_value);
1491 RECON_AND_STORE(dest, dc_value);
1492 RECON_AND_STORE(dest, dc_value);
1493 RECON_AND_STORE(dest, dc_value);
1494 RECON_AND_STORE(dest, dc_value);
1495 RECON_AND_STORE(dest, dc_value);
1496 RECON_AND_STORE(dest, dc_value);
1497 RECON_AND_STORE(dest, dc_value);
1498 RECON_AND_STORE(dest, dc_value);
1499 RECON_AND_STORE(dest, dc_value);
1500 RECON_AND_STORE(dest, dc_value);
// Rewind the 16 stride advances and step right to the second strip.
1501 dest += 8 - (stride * 16);
1505 static void iadst16_8col(__m128i *in) {
1506 // perform 16x16 1-D ADST for 8 columns
1507 __m128i s[16], x[16], u[32], v[32];
1508 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1509 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1510 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1511 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1512 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1513 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1514 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1515 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1516 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1517 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1518 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1519 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1520 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1521 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1522 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1523 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1524 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1525 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1526 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1527 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1528 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1529 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1530 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1531 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1532 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1533 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1534 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1535 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1536 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1537 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1538 const __m128i kZero = _mm_set1_epi16(0);
1540 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1541 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1542 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1543 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1544 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1545 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1546 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1547 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1548 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1549 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1550 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1551 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1552 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1553 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1554 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1555 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1557 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1558 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1559 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1560 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1561 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1562 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1563 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1564 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1565 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1566 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1567 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1568 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1569 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1570 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1571 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1572 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1573 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1574 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1575 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1576 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1577 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1578 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1579 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1580 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1581 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1582 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1583 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1584 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1585 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1586 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1587 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1588 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1590 u[0] = _mm_add_epi32(v[0], v[16]);
1591 u[1] = _mm_add_epi32(v[1], v[17]);
1592 u[2] = _mm_add_epi32(v[2], v[18]);
1593 u[3] = _mm_add_epi32(v[3], v[19]);
1594 u[4] = _mm_add_epi32(v[4], v[20]);
1595 u[5] = _mm_add_epi32(v[5], v[21]);
1596 u[6] = _mm_add_epi32(v[6], v[22]);
1597 u[7] = _mm_add_epi32(v[7], v[23]);
1598 u[8] = _mm_add_epi32(v[8], v[24]);
1599 u[9] = _mm_add_epi32(v[9], v[25]);
1600 u[10] = _mm_add_epi32(v[10], v[26]);
1601 u[11] = _mm_add_epi32(v[11], v[27]);
1602 u[12] = _mm_add_epi32(v[12], v[28]);
1603 u[13] = _mm_add_epi32(v[13], v[29]);
1604 u[14] = _mm_add_epi32(v[14], v[30]);
1605 u[15] = _mm_add_epi32(v[15], v[31]);
1606 u[16] = _mm_sub_epi32(v[0], v[16]);
1607 u[17] = _mm_sub_epi32(v[1], v[17]);
1608 u[18] = _mm_sub_epi32(v[2], v[18]);
1609 u[19] = _mm_sub_epi32(v[3], v[19]);
1610 u[20] = _mm_sub_epi32(v[4], v[20]);
1611 u[21] = _mm_sub_epi32(v[5], v[21]);
1612 u[22] = _mm_sub_epi32(v[6], v[22]);
1613 u[23] = _mm_sub_epi32(v[7], v[23]);
1614 u[24] = _mm_sub_epi32(v[8], v[24]);
1615 u[25] = _mm_sub_epi32(v[9], v[25]);
1616 u[26] = _mm_sub_epi32(v[10], v[26]);
1617 u[27] = _mm_sub_epi32(v[11], v[27]);
1618 u[28] = _mm_sub_epi32(v[12], v[28]);
1619 u[29] = _mm_sub_epi32(v[13], v[29]);
1620 u[30] = _mm_sub_epi32(v[14], v[30]);
1621 u[31] = _mm_sub_epi32(v[15], v[31]);
1623 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1624 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1625 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1626 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1627 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1628 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1629 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1630 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1631 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1632 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1633 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1634 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1635 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1636 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1637 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1638 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1639 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1640 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1641 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1642 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1643 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1644 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1645 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1646 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1647 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1648 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1649 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1650 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1651 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1652 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1653 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1654 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1656 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1657 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1658 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1659 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1660 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1661 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1662 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1663 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1664 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1665 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1666 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1667 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1668 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1669 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1670 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1671 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1672 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1673 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1674 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1675 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1676 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1677 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1678 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1679 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1680 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1681 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1682 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1683 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1684 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1685 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1686 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1687 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1689 s[0] = _mm_packs_epi32(u[0], u[1]);
1690 s[1] = _mm_packs_epi32(u[2], u[3]);
1691 s[2] = _mm_packs_epi32(u[4], u[5]);
1692 s[3] = _mm_packs_epi32(u[6], u[7]);
1693 s[4] = _mm_packs_epi32(u[8], u[9]);
1694 s[5] = _mm_packs_epi32(u[10], u[11]);
1695 s[6] = _mm_packs_epi32(u[12], u[13]);
1696 s[7] = _mm_packs_epi32(u[14], u[15]);
1697 s[8] = _mm_packs_epi32(u[16], u[17]);
1698 s[9] = _mm_packs_epi32(u[18], u[19]);
1699 s[10] = _mm_packs_epi32(u[20], u[21]);
1700 s[11] = _mm_packs_epi32(u[22], u[23]);
1701 s[12] = _mm_packs_epi32(u[24], u[25]);
1702 s[13] = _mm_packs_epi32(u[26], u[27]);
1703 s[14] = _mm_packs_epi32(u[28], u[29]);
1704 s[15] = _mm_packs_epi32(u[30], u[31]);
1707 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1708 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1709 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1710 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1711 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1712 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1713 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1714 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1716 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1717 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1718 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1719 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1720 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1721 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1722 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1723 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1724 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1725 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1726 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1727 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1728 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1729 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1730 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1731 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1733 u[0] = _mm_add_epi32(v[0], v[8]);
1734 u[1] = _mm_add_epi32(v[1], v[9]);
1735 u[2] = _mm_add_epi32(v[2], v[10]);
1736 u[3] = _mm_add_epi32(v[3], v[11]);
1737 u[4] = _mm_add_epi32(v[4], v[12]);
1738 u[5] = _mm_add_epi32(v[5], v[13]);
1739 u[6] = _mm_add_epi32(v[6], v[14]);
1740 u[7] = _mm_add_epi32(v[7], v[15]);
1741 u[8] = _mm_sub_epi32(v[0], v[8]);
1742 u[9] = _mm_sub_epi32(v[1], v[9]);
1743 u[10] = _mm_sub_epi32(v[2], v[10]);
1744 u[11] = _mm_sub_epi32(v[3], v[11]);
1745 u[12] = _mm_sub_epi32(v[4], v[12]);
1746 u[13] = _mm_sub_epi32(v[5], v[13]);
1747 u[14] = _mm_sub_epi32(v[6], v[14]);
1748 u[15] = _mm_sub_epi32(v[7], v[15]);
1750 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1751 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1752 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1753 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1754 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1755 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1756 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1757 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1758 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1759 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1760 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1761 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1762 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1763 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1764 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1765 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1767 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1768 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1769 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1770 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1771 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1772 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1773 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1774 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1775 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1776 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1777 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1778 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1779 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1780 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1781 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1782 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1784 x[0] = _mm_add_epi16(s[0], s[4]);
1785 x[1] = _mm_add_epi16(s[1], s[5]);
1786 x[2] = _mm_add_epi16(s[2], s[6]);
1787 x[3] = _mm_add_epi16(s[3], s[7]);
1788 x[4] = _mm_sub_epi16(s[0], s[4]);
1789 x[5] = _mm_sub_epi16(s[1], s[5]);
1790 x[6] = _mm_sub_epi16(s[2], s[6]);
1791 x[7] = _mm_sub_epi16(s[3], s[7]);
1792 x[8] = _mm_packs_epi32(u[0], u[1]);
1793 x[9] = _mm_packs_epi32(u[2], u[3]);
1794 x[10] = _mm_packs_epi32(u[4], u[5]);
1795 x[11] = _mm_packs_epi32(u[6], u[7]);
1796 x[12] = _mm_packs_epi32(u[8], u[9]);
1797 x[13] = _mm_packs_epi32(u[10], u[11]);
1798 x[14] = _mm_packs_epi32(u[12], u[13]);
1799 x[15] = _mm_packs_epi32(u[14], u[15]);
1802 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1803 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1804 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1805 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1806 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1807 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1808 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1809 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1811 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1812 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1813 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1814 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1815 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1816 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1817 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1818 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1819 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1820 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1821 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1822 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1823 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1824 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1825 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1826 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1828 u[0] = _mm_add_epi32(v[0], v[4]);
1829 u[1] = _mm_add_epi32(v[1], v[5]);
1830 u[2] = _mm_add_epi32(v[2], v[6]);
1831 u[3] = _mm_add_epi32(v[3], v[7]);
1832 u[4] = _mm_sub_epi32(v[0], v[4]);
1833 u[5] = _mm_sub_epi32(v[1], v[5]);
1834 u[6] = _mm_sub_epi32(v[2], v[6]);
1835 u[7] = _mm_sub_epi32(v[3], v[7]);
1836 u[8] = _mm_add_epi32(v[8], v[12]);
1837 u[9] = _mm_add_epi32(v[9], v[13]);
1838 u[10] = _mm_add_epi32(v[10], v[14]);
1839 u[11] = _mm_add_epi32(v[11], v[15]);
1840 u[12] = _mm_sub_epi32(v[8], v[12]);
1841 u[13] = _mm_sub_epi32(v[9], v[13]);
1842 u[14] = _mm_sub_epi32(v[10], v[14]);
1843 u[15] = _mm_sub_epi32(v[11], v[15]);
1845 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1846 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1847 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1848 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1849 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1850 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1851 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1852 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1853 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1854 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1855 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1856 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1857 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1858 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1859 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1860 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1862 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1863 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1864 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1865 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1866 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1867 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1868 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1869 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1870 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1871 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1872 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1873 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1874 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1875 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1876 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1877 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1879 s[0] = _mm_add_epi16(x[0], x[2]);
1880 s[1] = _mm_add_epi16(x[1], x[3]);
1881 s[2] = _mm_sub_epi16(x[0], x[2]);
1882 s[3] = _mm_sub_epi16(x[1], x[3]);
1883 s[4] = _mm_packs_epi32(v[0], v[1]);
1884 s[5] = _mm_packs_epi32(v[2], v[3]);
1885 s[6] = _mm_packs_epi32(v[4], v[5]);
1886 s[7] = _mm_packs_epi32(v[6], v[7]);
1887 s[8] = _mm_add_epi16(x[8], x[10]);
1888 s[9] = _mm_add_epi16(x[9], x[11]);
1889 s[10] = _mm_sub_epi16(x[8], x[10]);
1890 s[11] = _mm_sub_epi16(x[9], x[11]);
1891 s[12] = _mm_packs_epi32(v[8], v[9]);
1892 s[13] = _mm_packs_epi32(v[10], v[11]);
1893 s[14] = _mm_packs_epi32(v[12], v[13]);
1894 s[15] = _mm_packs_epi32(v[14], v[15]);
1897 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1898 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1899 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1900 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1901 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1902 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1903 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1904 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1906 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1907 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1908 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1909 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1910 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1911 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1912 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1913 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1914 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1915 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1916 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1917 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1918 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1919 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1920 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1921 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1923 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1924 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1925 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1926 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1927 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1928 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1929 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1930 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1931 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1932 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1933 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1934 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1935 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1936 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1937 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1938 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1940 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1941 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1942 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1943 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1944 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1945 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1946 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1947 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1948 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1949 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1950 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1951 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1952 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1953 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1954 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1955 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1958 in[1] = _mm_sub_epi16(kZero, s[8]);
1960 in[3] = _mm_sub_epi16(kZero, s[4]);
1961 in[4] = _mm_packs_epi32(v[4], v[5]);
1962 in[5] = _mm_packs_epi32(v[12], v[13]);
1963 in[6] = _mm_packs_epi32(v[8], v[9]);
1964 in[7] = _mm_packs_epi32(v[0], v[1]);
1965 in[8] = _mm_packs_epi32(v[2], v[3]);
1966 in[9] = _mm_packs_epi32(v[10], v[11]);
1967 in[10] = _mm_packs_epi32(v[14], v[15]);
1968 in[11] = _mm_packs_epi32(v[6], v[7]);
1970 in[13] = _mm_sub_epi16(kZero, s[13]);
1972 in[15] = _mm_sub_epi16(kZero, s[1]);
// 16-point inverse DCT applied to 8 columns held in in[0..15] (one __m128i
// of eight int16 lanes per row), computed in place.  Follows the standard
// multi-stage butterfly structure: 16-bit pairs are interleaved with
// _mm_unpack{lo,hi}_epi16, rotated with _mm_madd_epi16 against packed
// (cos, sin) constant pairs, rounded (DCT_CONST_ROUNDING), shifted back by
// DCT_CONST_BITS and re-packed to int16 with _mm_packs_epi32 (saturating).
// NOTE(review): this chunk is missing several interior lines of the original
// file (e.g. the code that initializes s[8..15] and t[0..3] from in[], and
// the closing brace) — comments below describe only the visible statements.
1975 static void idct16_8col(__m128i *in) {
1976   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1977   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1978   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1979   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1980   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1981   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1982   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1983   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1984   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1985   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1986   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1987   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1988   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1989   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1990   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1991   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1992   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1993   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1994   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1995   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1996   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Working registers: v = 32-bit products, u = rounded/shifted 32-bit
  // intermediates, s and t = 16-bit stage outputs.
1997   __m128i v[16], u[16], s[16], t[16];

  // Rotations of the odd-half pairs (s8,s15) (s9,s14) (s10,s13) (s11,s12)
  // by the cospi_{2,18,10,26}/cospi_{30,14,22,6} angle pairs.
  // NOTE(review): the lines that load s[8..15] from in[] are not visible in
  // this chunk — presumably the odd-indexed inputs; confirm against the
  // original file.
2018   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
2019   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
2020   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
2021   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
2022   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
2023   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
2024   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
2025   u[7] = _mm_unpackhi_epi16(s[11], s[12]);

2027   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
2028   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
2029   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
2030   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
2031   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
2032   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
2033   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
2034   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
2035   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
2036   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
2037   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
2038   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
2039   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
2040   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
2041   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
2042   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  // Round and shift the 32-bit products back to transform precision.
2044   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2045   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2046   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2047   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2048   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2049   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2050   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2051   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2052   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2053   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2054   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2055   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2056   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2057   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2058   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2059   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

2061   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2062   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2063   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2064   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2065   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2066   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2067   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2068   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2069   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2070   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2071   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2072   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2073   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2074   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2075   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2076   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Saturating-pack back to int16; note the interleaved destination order
  // (8,15,9,14,10,13,11,12) pairs each value with its butterfly partner.
2078   s[8] = _mm_packs_epi32(u[0], u[1]);
2079   s[15] = _mm_packs_epi32(u[2], u[3]);
2080   s[9] = _mm_packs_epi32(u[4], u[5]);
2081   s[14] = _mm_packs_epi32(u[6], u[7]);
2082   s[10] = _mm_packs_epi32(u[8], u[9]);
2083   s[13] = _mm_packs_epi32(u[10], u[11]);
2084   s[11] = _mm_packs_epi32(u[12], u[13]);
2085   s[12] = _mm_packs_epi32(u[14], u[15]);

  // Next stage: rotate (s4,s7) and (s5,s6) by the cospi_4/28 and
  // cospi_20/12 angle pairs.
2092   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
2093   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
2094   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
2095   u[3] = _mm_unpackhi_epi16(s[5], s[6]);

2097   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2098   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2099   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2100   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2101   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2102   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2103   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2104   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

2106   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2107   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2108   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2109   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2110   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2111   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2112   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2113   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

2115   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2116   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2117   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2118   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2119   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2120   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2121   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2122   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  // t[4..7] from the rotation above; t[8..15] are plain add/sub
  // butterflies of the s[8..15] values computed earlier.
2124   t[4] = _mm_packs_epi32(u[0], u[1]);
2125   t[7] = _mm_packs_epi32(u[2], u[3]);
2126   t[5] = _mm_packs_epi32(u[4], u[5]);
2127   t[6] = _mm_packs_epi32(u[6], u[7]);
2128   t[8] = _mm_add_epi16(s[8], s[9]);
2129   t[9] = _mm_sub_epi16(s[8], s[9]);
2130   t[10] = _mm_sub_epi16(s[11], s[10]);
2131   t[11] = _mm_add_epi16(s[10], s[11]);
2132   t[12] = _mm_add_epi16(s[12], s[13]);
2133   t[13] = _mm_sub_epi16(s[12], s[13]);
2134   t[14] = _mm_sub_epi16(s[15], s[14]);
2135   t[15] = _mm_add_epi16(s[14], s[15]);

  // Rotate (t0,t1) by cospi_16 (even half) and (t2,t3) by cospi_24/8;
  // (t9,t14) and (t10,t13) get the +/-cospi_8/24 rotations.
  // NOTE(review): the lines initializing t[0..3] are not visible in this
  // chunk — confirm against the original file.
2138   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
2139   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
2140   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
2141   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
2142   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
2143   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
2144   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
2145   u[7] = _mm_unpackhi_epi16(t[10], t[13]);

2147   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2148   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2149   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2150   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2151   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
2152   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
2153   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2154   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2155   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
2156   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
2157   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
2158   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
2159   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
2160   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
2161   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
2162   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

2164   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2165   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2166   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2167   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2168   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2169   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2170   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2171   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2172   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2173   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2174   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2175   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2176   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2177   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2178   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2179   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

2181   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2182   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2183   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2184   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2185   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2186   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2187   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2188   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2189   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2190   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2191   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2192   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2193   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2194   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2195   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2196   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Pack rotation results and finish the stage with add/sub butterflies.
2198   s[0] = _mm_packs_epi32(u[0], u[1]);
2199   s[1] = _mm_packs_epi32(u[2], u[3]);
2200   s[2] = _mm_packs_epi32(u[4], u[5]);
2201   s[3] = _mm_packs_epi32(u[6], u[7]);
2202   s[4] = _mm_add_epi16(t[4], t[5]);
2203   s[5] = _mm_sub_epi16(t[4], t[5]);
2204   s[6] = _mm_sub_epi16(t[7], t[6]);
2205   s[7] = _mm_add_epi16(t[6], t[7]);

2208   s[9] = _mm_packs_epi32(u[8], u[9]);
2209   s[14] = _mm_packs_epi32(u[10], u[11]);
2210   s[10] = _mm_packs_epi32(u[12], u[13]);
2211   s[13] = _mm_packs_epi32(u[14], u[15]);

  // Next stage: even-half butterflies ...
2216   t[0] = _mm_add_epi16(s[0], s[3]);
2217   t[1] = _mm_add_epi16(s[1], s[2]);
2218   t[2] = _mm_sub_epi16(s[1], s[2]);
2219   t[3] = _mm_sub_epi16(s[0], s[3]);

  // ... and the half-butterfly on (s5,s6): both outputs scaled by
  // cospi_16 (sum and difference), producing t[5] and t[6].
2223   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2224   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2225   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2226   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2227   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2228   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2229   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2230   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2231   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2232   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2233   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2234   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2235   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2236   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2237   t[5] = _mm_packs_epi32(u[0], u[1]);
2238   t[6] = _mm_packs_epi32(u[2], u[3]);

  // Odd-half add/sub butterflies for this stage.
2240   t[8] = _mm_add_epi16(s[8], s[11]);
2241   t[9] = _mm_add_epi16(s[9], s[10]);
2242   t[10] = _mm_sub_epi16(s[9], s[10]);
2243   t[11] = _mm_sub_epi16(s[8], s[11]);
2244   t[12] = _mm_sub_epi16(s[15], s[12]);
2245   t[13] = _mm_sub_epi16(s[14], s[13]);
2246   t[14] = _mm_add_epi16(s[13], s[14]);
2247   t[15] = _mm_add_epi16(s[12], s[15]);

  // Penultimate stage: combine even half (t0..t7) into s[0..7] ...
2250   s[0] = _mm_add_epi16(t[0], t[7]);
2251   s[1] = _mm_add_epi16(t[1], t[6]);
2252   s[2] = _mm_add_epi16(t[2], t[5]);
2253   s[3] = _mm_add_epi16(t[3], t[4]);
2254   s[4] = _mm_sub_epi16(t[3], t[4]);
2255   s[5] = _mm_sub_epi16(t[2], t[5]);
2256   s[6] = _mm_sub_epi16(t[1], t[6]);
2257   s[7] = _mm_sub_epi16(t[0], t[7]);

  // ... and apply the cospi_16 half-butterflies to (t10,t13) and (t11,t12).
2261   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2262   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2263   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2264   u[3] = _mm_unpackhi_epi16(t[11], t[12]);

2266   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2267   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2268   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2269   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2270   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2271   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2272   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2273   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

2275   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2276   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2277   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2278   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2279   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2280   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2281   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2282   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

2284   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2285   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2286   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2287   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2288   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2289   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2290   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2291   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

2293   s[10] = _mm_packs_epi32(u[0], u[1]);
2294   s[13] = _mm_packs_epi32(u[2], u[3]);
2295   s[11] = _mm_packs_epi32(u[4], u[5]);
2296   s[12] = _mm_packs_epi32(u[6], u[7]);

  // Final stage: mirror-image butterflies in[i] = s[i] +/- s[15-i],
  // writing the 16 output rows back into in[] in place.
2301   in[0] = _mm_add_epi16(s[0], s[15]);
2302   in[1] = _mm_add_epi16(s[1], s[14]);
2303   in[2] = _mm_add_epi16(s[2], s[13]);
2304   in[3] = _mm_add_epi16(s[3], s[12]);
2305   in[4] = _mm_add_epi16(s[4], s[11]);
2306   in[5] = _mm_add_epi16(s[5], s[10]);
2307   in[6] = _mm_add_epi16(s[6], s[9]);
2308   in[7] = _mm_add_epi16(s[7], s[8]);
2309   in[8] = _mm_sub_epi16(s[7], s[8]);
2310   in[9] = _mm_sub_epi16(s[6], s[9]);
2311   in[10] = _mm_sub_epi16(s[5], s[10]);
2312   in[11] = _mm_sub_epi16(s[4], s[11]);
2313   in[12] = _mm_sub_epi16(s[3], s[12]);
2314   in[13] = _mm_sub_epi16(s[2], s[13]);
2315   in[14] = _mm_sub_epi16(s[1], s[14]);
2316   in[15] = _mm_sub_epi16(s[0], s[15]);
// One full 16x16 inverse-DCT pass over the two 8x16 column halves in0/in1:
// transposes the 16x16 block first so the column transform operates on rows.
// NOTE(review): the rest of the body (presumably calls to idct16_8col on
// each half) is not visible in this chunk — confirm against the original.
2319 static void idct16_sse2(__m128i *in0, __m128i *in1) {
2320   array_transpose_16x16(in0, in1);
// One full 16x16 inverse-ADST pass over the two 8x16 column halves in0/in1:
// transposes the 16x16 block first so the column transform operates on rows.
// NOTE(review): the rest of the body (presumably calls to the iadst16
// column routine on each half) is not visible in this chunk — confirm.
2325 static void iadst16_sse2(__m128i *in0, __m128i *in1) {
2326   array_transpose_16x16(in0, in1);
// 16x16 inverse hybrid transform + reconstruction: loads the left and right
// 8x16 halves of the coefficient block, applies two 1-D transform passes
// (DCT or ADST per dimension, selected by a tx_type switch whose scaffolding
// is not visible in this chunk), then adds the result to the predictor in
// dest and stores it.
// NOTE(review): the parameter list continuation (tx_type), the switch
// statement, case labels 0-2, and the input/dest pointer advances between
// the paired load/write calls are missing from this chunk — the visible
// call pairs (idct/idct, idct/iadst, iadst/idct, iadst/iadst) presumably
// correspond to DCT_DCT, ADST_DCT, DCT_ADST and ADST_ADST; confirm against
// the original file.
2331 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2333   __m128i in0[16], in1[16];

2335   load_buffer_8x16(input, in0);
2337   load_buffer_8x16(input, in1);

2341       idct16_sse2(in0, in1);
2342       idct16_sse2(in0, in1);
2345       idct16_sse2(in0, in1);
2346       iadst16_sse2(in0, in1);
2349       iadst16_sse2(in0, in1);
2350       idct16_sse2(in0, in1);
2352     case 3:  // ADST_ADST
2353       iadst16_sse2(in0, in1);
2354       iadst16_sse2(in0, in1);

2361   write_buffer_8x16(dest, in0, stride);
2363   write_buffer_8x16(dest, in1, stride);
2366 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2368 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2369 const __m128i final_rounding = _mm_set1_epi16(1<<5);
2370 const __m128i zero = _mm_setzero_si128();
2372 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2373 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2374 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2375 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2377 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2378 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2380 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2381 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2382 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2383 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2384 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2385 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2387 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2388 __m128i in[16], l[16];
2389 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2390 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2391 stp1_8_0, stp1_12_0;
2392 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2393 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2394 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2396 // First 1-D inverse DCT
2398 in[0] = _mm_load_si128((const __m128i *)input);
2399 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2400 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2401 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2403 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2407 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2408 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2410 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2411 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2412 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2413 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2415 tmp0 = _mm_add_epi32(tmp0, rounding);
2416 tmp2 = _mm_add_epi32(tmp2, rounding);
2417 tmp5 = _mm_add_epi32(tmp5, rounding);
2418 tmp7 = _mm_add_epi32(tmp7, rounding);
2420 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2421 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2422 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2423 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2425 stp2_8 = _mm_packs_epi32(tmp0, tmp2);
2426 stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2431 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2433 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2434 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2436 tmp0 = _mm_add_epi32(tmp0, rounding);
2437 tmp2 = _mm_add_epi32(tmp2, rounding);
2438 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2439 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2441 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2442 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2444 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2449 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2450 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2451 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2453 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2454 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2455 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2456 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2457 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2458 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2460 tmp0 = _mm_add_epi32(tmp0, rounding);
2461 tmp2 = _mm_add_epi32(tmp2, rounding);
2462 tmp1 = _mm_add_epi32(tmp1, rounding);
2463 tmp3 = _mm_add_epi32(tmp3, rounding);
2464 tmp5 = _mm_add_epi32(tmp5, rounding);
2465 tmp7 = _mm_add_epi32(tmp7, rounding);
2467 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2468 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2469 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2470 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2471 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2472 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2474 stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2475 stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2476 stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2477 stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2479 stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2482 // Stage5 and Stage6
2484 tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2485 tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2486 tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2487 tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2489 stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2490 stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2491 stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2492 stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2494 stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2495 stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2496 stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2497 stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2502 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2503 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2504 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2506 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2507 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2508 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2509 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2510 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2511 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2513 tmp1 = _mm_add_epi32(tmp1, rounding);
2514 tmp3 = _mm_add_epi32(tmp3, rounding);
2515 tmp0 = _mm_add_epi32(tmp0, rounding);
2516 tmp2 = _mm_add_epi32(tmp2, rounding);
2517 tmp4 = _mm_add_epi32(tmp4, rounding);
2518 tmp6 = _mm_add_epi32(tmp6, rounding);
2520 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2521 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2522 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2523 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2524 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2525 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2527 stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2529 stp2_10 = _mm_packs_epi32(tmp0, zero);
2530 stp2_13 = _mm_packs_epi32(tmp2, zero);
2531 stp2_11 = _mm_packs_epi32(tmp4, zero);
2532 stp2_12 = _mm_packs_epi32(tmp6, zero);
2534 tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2535 tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2536 tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2537 tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2539 stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2540 stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2541 stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2542 stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2543 stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2544 stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2545 stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2546 stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2549 // Stage7. Left 8x16 only.
2550 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2551 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2552 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2553 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2554 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2555 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2556 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2557 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2558 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2559 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2560 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2561 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2562 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2563 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2564 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2565 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2567 // Second 1-D inverse transform, performed per 8x16 block
2568 for (i = 0; i < 2; i++) {
2569 array_transpose_4X8(l + 8*i, in);
2574 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2575 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2576 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2577 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2578 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2579 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2580 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2581 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2582 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2583 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2584 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2585 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2586 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2587 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2588 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2589 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2591 // Final rounding and shift
2592 in[0] = _mm_adds_epi16(in[0], final_rounding);
2593 in[1] = _mm_adds_epi16(in[1], final_rounding);
2594 in[2] = _mm_adds_epi16(in[2], final_rounding);
2595 in[3] = _mm_adds_epi16(in[3], final_rounding);
2596 in[4] = _mm_adds_epi16(in[4], final_rounding);
2597 in[5] = _mm_adds_epi16(in[5], final_rounding);
2598 in[6] = _mm_adds_epi16(in[6], final_rounding);
2599 in[7] = _mm_adds_epi16(in[7], final_rounding);
2600 in[8] = _mm_adds_epi16(in[8], final_rounding);
2601 in[9] = _mm_adds_epi16(in[9], final_rounding);
2602 in[10] = _mm_adds_epi16(in[10], final_rounding);
2603 in[11] = _mm_adds_epi16(in[11], final_rounding);
2604 in[12] = _mm_adds_epi16(in[12], final_rounding);
2605 in[13] = _mm_adds_epi16(in[13], final_rounding);
2606 in[14] = _mm_adds_epi16(in[14], final_rounding);
2607 in[15] = _mm_adds_epi16(in[15], final_rounding);
2609 in[0] = _mm_srai_epi16(in[0], 6);
2610 in[1] = _mm_srai_epi16(in[1], 6);
2611 in[2] = _mm_srai_epi16(in[2], 6);
2612 in[3] = _mm_srai_epi16(in[3], 6);
2613 in[4] = _mm_srai_epi16(in[4], 6);
2614 in[5] = _mm_srai_epi16(in[5], 6);
2615 in[6] = _mm_srai_epi16(in[6], 6);
2616 in[7] = _mm_srai_epi16(in[7], 6);
2617 in[8] = _mm_srai_epi16(in[8], 6);
2618 in[9] = _mm_srai_epi16(in[9], 6);
2619 in[10] = _mm_srai_epi16(in[10], 6);
2620 in[11] = _mm_srai_epi16(in[11], 6);
2621 in[12] = _mm_srai_epi16(in[12], 6);
2622 in[13] = _mm_srai_epi16(in[13], 6);
2623 in[14] = _mm_srai_epi16(in[14], 6);
2624 in[15] = _mm_srai_epi16(in[15], 6);
2626 RECON_AND_STORE(dest, in[0]);
2627 RECON_AND_STORE(dest, in[1]);
2628 RECON_AND_STORE(dest, in[2]);
2629 RECON_AND_STORE(dest, in[3]);
2630 RECON_AND_STORE(dest, in[4]);
2631 RECON_AND_STORE(dest, in[5]);
2632 RECON_AND_STORE(dest, in[6]);
2633 RECON_AND_STORE(dest, in[7]);
2634 RECON_AND_STORE(dest, in[8]);
2635 RECON_AND_STORE(dest, in[9]);
2636 RECON_AND_STORE(dest, in[10]);
2637 RECON_AND_STORE(dest, in[11]);
2638 RECON_AND_STORE(dest, in[12]);
2639 RECON_AND_STORE(dest, in[13]);
2640 RECON_AND_STORE(dest, in[14]);
2641 RECON_AND_STORE(dest, in[15]);
2643 dest += 8 - (stride * 16);
2647 #define LOAD_DQCOEFF(reg, input) \
2649 reg = _mm_load_si128((const __m128i *) input); \
2656 const __m128i zero = _mm_setzero_si128();\
2657 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2658 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2660 const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2661 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2663 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2664 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2666 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2667 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2669 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2670 stg1_1, stp1_16, stp1_31); \
2671 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2672 stg1_7, stp1_19, stp1_28); \
2673 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2674 stg1_9, stp1_20, stp1_27); \
2675 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2676 stg1_15, stp1_23, stp1_24); \
2681 const __m128i zero = _mm_setzero_si128();\
2682 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2683 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2685 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2686 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2688 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2689 stg2_1, stp2_8, stp2_15); \
2690 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2691 stg2_7, stp2_11, stp2_12); \
2693 stp2_16 = stp1_16; \
2694 stp2_19 = stp1_19; \
2696 stp2_20 = stp1_20; \
2697 stp2_23 = stp1_23; \
2699 stp2_24 = stp1_24; \
2700 stp2_27 = stp1_27; \
2702 stp2_28 = stp1_28; \
2703 stp2_31 = stp1_31; \
2708 const __m128i zero = _mm_setzero_si128();\
2709 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2710 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2712 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2713 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2714 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2715 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2717 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2718 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2719 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2720 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); /* was stp2_24; stage2 sets stp2_24 = stp1_24, so value is identical — use stp1_24 to match lo_22_25 and the full IDCT32 macro */ \
2722 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2723 stg3_1, stp1_4, stp1_7); \
2726 stp1_11 = stp2_11; \
2727 stp1_12 = stp2_12; \
2728 stp1_15 = stp2_15; \
2730 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2731 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2733 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2734 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2737 stp1_16 = stp2_16; \
2738 stp1_31 = stp2_31; \
2739 stp1_19 = stp2_19; \
2740 stp1_20 = stp2_20; \
2741 stp1_23 = stp2_23; \
2742 stp1_24 = stp2_24; \
2743 stp1_27 = stp2_27; \
2744 stp1_28 = stp2_28; \
2749 const __m128i zero = _mm_setzero_si128();\
2750 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2751 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2753 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2754 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2755 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2756 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2758 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2759 stg4_1, stp2_0, stp2_1); \
2766 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2767 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2771 stp2_15 = stp1_15; \
2772 stp2_11 = stp1_11; \
2773 stp2_12 = stp1_12; \
2775 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2776 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2777 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2778 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2779 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2780 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2781 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2782 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2784 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2785 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2786 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2787 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2788 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2789 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2790 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2791 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2796 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2797 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2798 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2799 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2801 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2802 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2803 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2804 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2806 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2807 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2814 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2815 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2816 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2817 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2819 tmp0 = _mm_add_epi32(tmp0, rounding); \
2820 tmp1 = _mm_add_epi32(tmp1, rounding); \
2821 tmp2 = _mm_add_epi32(tmp2, rounding); \
2822 tmp3 = _mm_add_epi32(tmp3, rounding); \
2824 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2825 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2826 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2827 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2829 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2830 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2835 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2836 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2837 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2838 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2839 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2840 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2841 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2842 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2844 stp1_16 = stp2_16; \
2845 stp1_17 = stp2_17; \
2847 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2848 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2850 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2851 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2854 stp1_22 = stp2_22; \
2855 stp1_23 = stp2_23; \
2856 stp1_24 = stp2_24; \
2857 stp1_25 = stp2_25; \
2858 stp1_30 = stp2_30; \
2859 stp1_31 = stp2_31; \
2864 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2865 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2866 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2867 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2869 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2870 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2871 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2872 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2873 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2874 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2875 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2876 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2880 stp2_14 = stp1_14; \
2881 stp2_15 = stp1_15; \
2883 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2884 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2885 stp2_13, stp2_11, stp2_12) \
2887 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2888 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2889 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2890 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2891 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2892 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2893 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2894 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2896 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2897 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2898 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2899 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2900 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2901 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2902 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2903 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2908 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2909 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2910 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2911 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2913 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2914 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2915 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2916 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2918 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2919 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2920 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2921 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2922 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2923 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2924 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2925 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2926 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2927 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2928 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2929 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2930 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2931 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2932 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2933 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2935 stp1_16 = stp2_16; \
2936 stp1_17 = stp2_17; \
2937 stp1_18 = stp2_18; \
2938 stp1_19 = stp2_19; \
2940 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2941 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2943 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2944 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2947 stp1_28 = stp2_28; \
2948 stp1_29 = stp2_29; \
2949 stp1_30 = stp2_30; \
2950 stp1_31 = stp2_31; \
2957 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2958 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2959 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2960 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2962 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2963 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2964 const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
2965 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2967 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2968 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2969 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2970 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2972 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2973 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2974 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2975 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2977 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2978 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2980 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2981 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2983 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2984 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2986 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2987 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2993 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2994 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2995 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2996 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2998 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2999 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
3000 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
3001 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
3003 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
3004 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
3006 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
3007 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
3010 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
3011 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
3012 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
3013 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
3015 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
3016 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
3017 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
3018 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
3020 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
3021 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
3022 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
3023 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
3025 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
3026 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
3027 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
3028 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
3033 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
3034 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
3035 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
3036 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
3038 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
3039 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
3040 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3041 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3043 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3044 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3045 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3046 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3048 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
3049 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
3052 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
3053 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
3054 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
3055 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
3056 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
3057 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
3058 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
3059 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
3061 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
3062 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
3064 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
3065 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
3068 stp1_16 = stp2_16; \
3069 stp1_31 = stp2_31; \
3070 stp1_19 = stp2_19; \
3071 stp1_20 = stp2_20; \
3072 stp1_23 = stp2_23; \
3073 stp1_24 = stp2_24; \
3074 stp1_27 = stp2_27; \
3075 stp1_28 = stp2_28; \
3080 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
3081 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
3082 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
3083 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
3085 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
3086 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
3087 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3088 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3090 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
3091 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
3094 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
3095 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
3096 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
3097 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
3099 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
3100 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
3104 stp2_15 = stp1_15; \
3105 stp2_11 = stp1_11; \
3106 stp2_12 = stp1_12; \
3108 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
3109 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
3110 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
3111 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
3112 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
3113 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
3114 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
3115 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
3117 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
3118 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
3119 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
3120 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
3121 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
3122 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
3123 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
3124 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
3129 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
3130 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
3131 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3132 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3134 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
3135 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
3136 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3137 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3139 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3140 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3142 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
3143 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
3144 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
3145 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
3147 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
3148 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
3149 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
3150 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
3152 tmp0 = _mm_add_epi32(tmp0, rounding); \
3153 tmp1 = _mm_add_epi32(tmp1, rounding); \
3154 tmp2 = _mm_add_epi32(tmp2, rounding); \
3155 tmp3 = _mm_add_epi32(tmp3, rounding); \
3157 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
3158 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
3159 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
3160 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
3162 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
3163 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
3168 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3169 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3170 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3171 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3172 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3173 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3174 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3175 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3177 stp1_16 = stp2_16; \
3178 stp1_17 = stp2_17; \
3180 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3181 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3183 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3184 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3187 stp1_22 = stp2_22; \
3188 stp1_23 = stp2_23; \
3189 stp1_24 = stp2_24; \
3190 stp1_25 = stp2_25; \
3191 stp1_30 = stp2_30; \
3192 stp1_31 = stp2_31; \
3197 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3198 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3199 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3200 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3202 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3203 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3204 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3205 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3206 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3207 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3208 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3209 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3213 stp2_14 = stp1_14; \
3214 stp2_15 = stp1_15; \
3216 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3217 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3218 stp2_13, stp2_11, stp2_12) \
3220 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3221 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3222 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3223 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3224 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3225 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3226 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3227 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3229 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3230 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3231 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3232 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3233 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3234 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3235 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3236 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3241 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3242 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3243 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3244 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3246 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3247 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3248 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3249 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3251 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3252 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3253 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3254 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3255 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3256 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3257 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3258 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3259 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3260 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3261 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3262 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3263 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3264 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3265 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3266 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3268 stp1_16 = stp2_16; \
3269 stp1_17 = stp2_17; \
3270 stp1_18 = stp2_18; \
3271 stp1_19 = stp2_19; \
3273 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3274 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3276 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3277 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3280 stp1_28 = stp2_28; \
3281 stp1_29 = stp2_29; \
3282 stp1_30 = stp2_30; \
3283 stp1_31 = stp2_31; \
3286 // Only upper-left 8x8 has non-zero coeff
3287 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3289 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3290 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3292 // idct constants for each stage
3293 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3294 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3295 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3296 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3297 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3298 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3299 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3300 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3301 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3302 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3303 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3304 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3305 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3306 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3307 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3308 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3310 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3311 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3312 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3313 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3314 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3315 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3316 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3317 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3319 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3320 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3321 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3322 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3323 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3324 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3325 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3326 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3327 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3328 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3330 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3331 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3332 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3333 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3334 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3335 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3336 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3338 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3340 __m128i in[32], col[32];
3341 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3342 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3343 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3344 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3346 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3347 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3348 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3349 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3351 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3354 LOAD_DQCOEFF(in[0], input);
3355 LOAD_DQCOEFF(in[8], input);
3356 LOAD_DQCOEFF(in[16], input);
3357 LOAD_DQCOEFF(in[24], input);
3358 LOAD_DQCOEFF(in[1], input);
3359 LOAD_DQCOEFF(in[9], input);
3360 LOAD_DQCOEFF(in[17], input);
3361 LOAD_DQCOEFF(in[25], input);
3362 LOAD_DQCOEFF(in[2], input);
3363 LOAD_DQCOEFF(in[10], input);
3364 LOAD_DQCOEFF(in[18], input);
3365 LOAD_DQCOEFF(in[26], input);
3366 LOAD_DQCOEFF(in[3], input);
3367 LOAD_DQCOEFF(in[11], input);
3368 LOAD_DQCOEFF(in[19], input);
3369 LOAD_DQCOEFF(in[27], input);
3371 LOAD_DQCOEFF(in[4], input);
3372 LOAD_DQCOEFF(in[12], input);
3373 LOAD_DQCOEFF(in[20], input);
3374 LOAD_DQCOEFF(in[28], input);
3375 LOAD_DQCOEFF(in[5], input);
3376 LOAD_DQCOEFF(in[13], input);
3377 LOAD_DQCOEFF(in[21], input);
3378 LOAD_DQCOEFF(in[29], input);
3379 LOAD_DQCOEFF(in[6], input);
3380 LOAD_DQCOEFF(in[14], input);
3381 LOAD_DQCOEFF(in[22], input);
3382 LOAD_DQCOEFF(in[30], input);
3383 LOAD_DQCOEFF(in[7], input);
3384 LOAD_DQCOEFF(in[15], input);
3385 LOAD_DQCOEFF(in[23], input);
3386 LOAD_DQCOEFF(in[31], input);
3388 array_transpose_8x8(in, in);
3389 array_transpose_8x8(in+8, in+8);
3390 array_transpose_8x8(in+16, in+16);
3391 array_transpose_8x8(in+24, in+24);
3395 // 1_D: Store 32 intermediate results for each 8x32 block.
3396 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3397 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3398 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3399 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3400 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3401 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3402 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3403 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3404 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3405 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3406 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3407 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3408 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3409 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3410 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3411 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3412 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3413 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3414 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3415 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3416 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3417 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3418 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3419 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3420 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3421 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3422 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3423 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3424 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3425 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3426 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3427 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3428 for (i = 0; i < 4; i++) {
3429 const __m128i zero = _mm_setzero_si128();
3430 // Transpose 32x8 block to 8x32 block
3431 array_transpose_8x8(col+i*8, in);
3434 // 2_D: Calculate the results and store them to destination.
3435 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3436 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3437 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3438 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3439 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3440 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3441 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3442 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3443 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3444 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3445 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3446 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3447 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3448 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3449 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3450 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3451 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3452 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3453 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3454 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3455 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3456 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3457 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3458 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3459 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3460 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3461 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3462 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3463 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3464 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3465 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3466 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3468 // Final rounding and shift
3469 in[0] = _mm_adds_epi16(in[0], final_rounding);
3470 in[1] = _mm_adds_epi16(in[1], final_rounding);
3471 in[2] = _mm_adds_epi16(in[2], final_rounding);
3472 in[3] = _mm_adds_epi16(in[3], final_rounding);
3473 in[4] = _mm_adds_epi16(in[4], final_rounding);
3474 in[5] = _mm_adds_epi16(in[5], final_rounding);
3475 in[6] = _mm_adds_epi16(in[6], final_rounding);
3476 in[7] = _mm_adds_epi16(in[7], final_rounding);
3477 in[8] = _mm_adds_epi16(in[8], final_rounding);
3478 in[9] = _mm_adds_epi16(in[9], final_rounding);
3479 in[10] = _mm_adds_epi16(in[10], final_rounding);
3480 in[11] = _mm_adds_epi16(in[11], final_rounding);
3481 in[12] = _mm_adds_epi16(in[12], final_rounding);
3482 in[13] = _mm_adds_epi16(in[13], final_rounding);
3483 in[14] = _mm_adds_epi16(in[14], final_rounding);
3484 in[15] = _mm_adds_epi16(in[15], final_rounding);
3485 in[16] = _mm_adds_epi16(in[16], final_rounding);
3486 in[17] = _mm_adds_epi16(in[17], final_rounding);
3487 in[18] = _mm_adds_epi16(in[18], final_rounding);
3488 in[19] = _mm_adds_epi16(in[19], final_rounding);
3489 in[20] = _mm_adds_epi16(in[20], final_rounding);
3490 in[21] = _mm_adds_epi16(in[21], final_rounding);
3491 in[22] = _mm_adds_epi16(in[22], final_rounding);
3492 in[23] = _mm_adds_epi16(in[23], final_rounding);
3493 in[24] = _mm_adds_epi16(in[24], final_rounding);
3494 in[25] = _mm_adds_epi16(in[25], final_rounding);
3495 in[26] = _mm_adds_epi16(in[26], final_rounding);
3496 in[27] = _mm_adds_epi16(in[27], final_rounding);
3497 in[28] = _mm_adds_epi16(in[28], final_rounding);
3498 in[29] = _mm_adds_epi16(in[29], final_rounding);
3499 in[30] = _mm_adds_epi16(in[30], final_rounding);
3500 in[31] = _mm_adds_epi16(in[31], final_rounding);
3502 in[0] = _mm_srai_epi16(in[0], 6);
3503 in[1] = _mm_srai_epi16(in[1], 6);
3504 in[2] = _mm_srai_epi16(in[2], 6);
3505 in[3] = _mm_srai_epi16(in[3], 6);
3506 in[4] = _mm_srai_epi16(in[4], 6);
3507 in[5] = _mm_srai_epi16(in[5], 6);
3508 in[6] = _mm_srai_epi16(in[6], 6);
3509 in[7] = _mm_srai_epi16(in[7], 6);
3510 in[8] = _mm_srai_epi16(in[8], 6);
3511 in[9] = _mm_srai_epi16(in[9], 6);
3512 in[10] = _mm_srai_epi16(in[10], 6);
3513 in[11] = _mm_srai_epi16(in[11], 6);
3514 in[12] = _mm_srai_epi16(in[12], 6);
3515 in[13] = _mm_srai_epi16(in[13], 6);
3516 in[14] = _mm_srai_epi16(in[14], 6);
3517 in[15] = _mm_srai_epi16(in[15], 6);
3518 in[16] = _mm_srai_epi16(in[16], 6);
3519 in[17] = _mm_srai_epi16(in[17], 6);
3520 in[18] = _mm_srai_epi16(in[18], 6);
3521 in[19] = _mm_srai_epi16(in[19], 6);
3522 in[20] = _mm_srai_epi16(in[20], 6);
3523 in[21] = _mm_srai_epi16(in[21], 6);
3524 in[22] = _mm_srai_epi16(in[22], 6);
3525 in[23] = _mm_srai_epi16(in[23], 6);
3526 in[24] = _mm_srai_epi16(in[24], 6);
3527 in[25] = _mm_srai_epi16(in[25], 6);
3528 in[26] = _mm_srai_epi16(in[26], 6);
3529 in[27] = _mm_srai_epi16(in[27], 6);
3530 in[28] = _mm_srai_epi16(in[28], 6);
3531 in[29] = _mm_srai_epi16(in[29], 6);
3532 in[30] = _mm_srai_epi16(in[30], 6);
3533 in[31] = _mm_srai_epi16(in[31], 6);
3535 RECON_AND_STORE(dest, in[0]);
3536 RECON_AND_STORE(dest, in[1]);
3537 RECON_AND_STORE(dest, in[2]);
3538 RECON_AND_STORE(dest, in[3]);
3539 RECON_AND_STORE(dest, in[4]);
3540 RECON_AND_STORE(dest, in[5]);
3541 RECON_AND_STORE(dest, in[6]);
3542 RECON_AND_STORE(dest, in[7]);
3543 RECON_AND_STORE(dest, in[8]);
3544 RECON_AND_STORE(dest, in[9]);
3545 RECON_AND_STORE(dest, in[10]);
3546 RECON_AND_STORE(dest, in[11]);
3547 RECON_AND_STORE(dest, in[12]);
3548 RECON_AND_STORE(dest, in[13]);
3549 RECON_AND_STORE(dest, in[14]);
3550 RECON_AND_STORE(dest, in[15]);
3551 RECON_AND_STORE(dest, in[16]);
3552 RECON_AND_STORE(dest, in[17]);
3553 RECON_AND_STORE(dest, in[18]);
3554 RECON_AND_STORE(dest, in[19]);
3555 RECON_AND_STORE(dest, in[20]);
3556 RECON_AND_STORE(dest, in[21]);
3557 RECON_AND_STORE(dest, in[22]);
3558 RECON_AND_STORE(dest, in[23]);
3559 RECON_AND_STORE(dest, in[24]);
3560 RECON_AND_STORE(dest, in[25]);
3561 RECON_AND_STORE(dest, in[26]);
3562 RECON_AND_STORE(dest, in[27]);
3563 RECON_AND_STORE(dest, in[28]);
3564 RECON_AND_STORE(dest, in[29]);
3565 RECON_AND_STORE(dest, in[30]);
3566 RECON_AND_STORE(dest, in[31]);
3568 dest += 8 - (stride * 32);
// Full (1024-coefficient) 32x32 inverse DCT, SSE2 path: reconstructs a 32x32
// block from dequantized coefficients in `input` and adds the result into the
// prediction in `dest` (row pitch `stride`), saturating to 8-bit pixels.
// The transform is done as a 1-D column pass over four 8x32 strips (results
// buffered in col[128]), then a 1-D row pass with final round/shift and
// reconstruction.
// NOTE(review): this listing appears to have lines elided (the `int stride) {`
// parameter completion, the IDCT32 stage-butterfly macro invocations, the
// declarations of i/j/i32 and stp*_30/31, and closing braces) — the comments
// below describe only what is visible here.
3572 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3574 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
// Bias of 1<<5 pairs with the arithmetic shift by 6 after the second pass.
3575 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3576 const __m128i zero = _mm_setzero_si128();
// idct constants for each stage: each register packs a (cos, sin)-style pair
// for _mm_madd_epi16 butterflies, built from the cospi_*_64 tables.
3578 // idct constants for each stage
3579 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3580 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3581 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3582 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3583 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3584 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3585 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3586 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3587 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3588 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3589 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3590 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3591 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3592 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3593 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3594 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3596 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3597 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3598 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3599 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3600 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3601 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3602 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3603 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3605 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3606 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3607 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3608 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3609 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3610 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3611 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3612 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3613 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3614 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3616 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3617 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3618 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3619 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3620 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3621 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3622 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3624 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
// in[32]: one 8x32 strip; col[128]: all four strips of 1-D results;
// zero_idx[16]: scratch for the all-zero OR-reduction below.
3626 __m128i in[32], col[128], zero_idx[16];
3627 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3628 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3629 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3630 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3632 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3633 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3634 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3635 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3637 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// First (column) pass: process the coefficients in four 8-wide strips.
3640 for (i = 0; i < 4; i++) {
// Load 32 rows of 8 coefficients each.  The interleaved index order
// (0,8,16,24, 1,9,17,25, ...) gathers one row from each of the four
// 8-row groups per set of four loads; LOAD_DQCOEFF advances `input`.
3644 LOAD_DQCOEFF(in[0], input);
3645 LOAD_DQCOEFF(in[8], input);
3646 LOAD_DQCOEFF(in[16], input);
3647 LOAD_DQCOEFF(in[24], input);
3648 LOAD_DQCOEFF(in[1], input);
3649 LOAD_DQCOEFF(in[9], input);
3650 LOAD_DQCOEFF(in[17], input);
3651 LOAD_DQCOEFF(in[25], input);
3652 LOAD_DQCOEFF(in[2], input);
3653 LOAD_DQCOEFF(in[10], input);
3654 LOAD_DQCOEFF(in[18], input);
3655 LOAD_DQCOEFF(in[26], input);
3656 LOAD_DQCOEFF(in[3], input);
3657 LOAD_DQCOEFF(in[11], input);
3658 LOAD_DQCOEFF(in[19], input);
3659 LOAD_DQCOEFF(in[27], input);
3661 LOAD_DQCOEFF(in[4], input);
3662 LOAD_DQCOEFF(in[12], input);
3663 LOAD_DQCOEFF(in[20], input);
3664 LOAD_DQCOEFF(in[28], input);
3665 LOAD_DQCOEFF(in[5], input);
3666 LOAD_DQCOEFF(in[13], input);
3667 LOAD_DQCOEFF(in[21], input);
3668 LOAD_DQCOEFF(in[29], input);
3669 LOAD_DQCOEFF(in[6], input);
3670 LOAD_DQCOEFF(in[14], input);
3671 LOAD_DQCOEFF(in[22], input);
3672 LOAD_DQCOEFF(in[30], input);
3673 LOAD_DQCOEFF(in[7], input);
3674 LOAD_DQCOEFF(in[15], input);
3675 LOAD_DQCOEFF(in[23], input);
3676 LOAD_DQCOEFF(in[31], input);
// OR-reduce all 32 vectors in a tree so a single compare can detect an
// all-zero strip and skip the transform work for it.
3678 // checking if all entries are zero
3679 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3680 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3681 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3682 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3683 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3684 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3685 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3686 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3687 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3688 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3689 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3690 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3691 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3692 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3693 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3694 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3696 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3697 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3698 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3699 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3700 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3701 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3702 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3703 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3705 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3706 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3707 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3708 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3709 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3710 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3711 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
// zero_idx[14] now holds the OR of every coefficient in this strip;
// movemask == 0xFFFF means all 16 bytes compared equal to zero, so the
// 1-D result for this strip is zero — store zeros and move on.
3713 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3714 col[i32 + 0] = _mm_setzero_si128();
3715 col[i32 + 1] = _mm_setzero_si128();
3716 col[i32 + 2] = _mm_setzero_si128();
3717 col[i32 + 3] = _mm_setzero_si128();
3718 col[i32 + 4] = _mm_setzero_si128();
3719 col[i32 + 5] = _mm_setzero_si128();
3720 col[i32 + 6] = _mm_setzero_si128();
3721 col[i32 + 7] = _mm_setzero_si128();
3722 col[i32 + 8] = _mm_setzero_si128();
3723 col[i32 + 9] = _mm_setzero_si128();
3724 col[i32 + 10] = _mm_setzero_si128();
3725 col[i32 + 11] = _mm_setzero_si128();
3726 col[i32 + 12] = _mm_setzero_si128();
3727 col[i32 + 13] = _mm_setzero_si128();
3728 col[i32 + 14] = _mm_setzero_si128();
3729 col[i32 + 15] = _mm_setzero_si128();
3730 col[i32 + 16] = _mm_setzero_si128();
3731 col[i32 + 17] = _mm_setzero_si128();
3732 col[i32 + 18] = _mm_setzero_si128();
3733 col[i32 + 19] = _mm_setzero_si128();
3734 col[i32 + 20] = _mm_setzero_si128();
3735 col[i32 + 21] = _mm_setzero_si128();
3736 col[i32 + 22] = _mm_setzero_si128();
3737 col[i32 + 23] = _mm_setzero_si128();
3738 col[i32 + 24] = _mm_setzero_si128();
3739 col[i32 + 25] = _mm_setzero_si128();
3740 col[i32 + 26] = _mm_setzero_si128();
3741 col[i32 + 27] = _mm_setzero_si128();
3742 col[i32 + 28] = _mm_setzero_si128();
3743 col[i32 + 29] = _mm_setzero_si128();
3744 col[i32 + 30] = _mm_setzero_si128();
3745 col[i32 + 31] = _mm_setzero_si128();
3749 // Transpose 32x8 block to 8x32 block
3750 array_transpose_8x8(in, in);
3751 array_transpose_8x8(in+8, in+8);
3752 array_transpose_8x8(in+16, in+16);
3753 array_transpose_8x8(in+24, in+24);
// NOTE(review): the IDCT32 stage computation that produces stp1_0..stp1_31
// appears to be elided from this listing here.
// Last butterfly stage: combine stp1_k with its mirror stp1_(31-k) —
// sums give outputs 0..15, differences give outputs 16..31.
3757 // 1_D: Store 32 intermediate results for each 8x32 block.
3758 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3759 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3760 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3761 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3762 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3763 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3764 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3765 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3766 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3767 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3768 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3769 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3770 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3771 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3772 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3773 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3774 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3775 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3776 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3777 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3778 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3779 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3780 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3781 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3782 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3783 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3784 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3785 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3786 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3787 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3788 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3789 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
// Second (row) pass: for each 8-row band, gather one 8x8 tile from each
// of the four buffered strips (col+j, +32, +64, +96) to rebuild 8 full
// 32-wide rows, run the 1-D transform again, then round, shift, and add
// into the prediction.
3791 for (i = 0; i < 4; i++) {
3795 // Transpose 32x8 block to 8x32 block
3796 array_transpose_8x8(col+j, in);
3797 array_transpose_8x8(col+j+32, in+8);
3798 array_transpose_8x8(col+j+64, in+16);
3799 array_transpose_8x8(col+j+96, in+24);
// NOTE(review): the second IDCT32 stage computation appears to be elided
// from this listing here as well.
3803 // 2_D: Calculate the results and store them to destination.
3804 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3805 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3806 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3807 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3808 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3809 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3810 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3811 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3812 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3813 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3814 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3815 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3816 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3817 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3818 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3819 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3820 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3821 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3822 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3823 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3824 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3825 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3826 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3827 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3828 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3829 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3830 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3831 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3832 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3833 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3834 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3835 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
// Saturating add of 1<<5, then arithmetic shift right by 6:
// rounds to nearest while scaling down by 64.
3837 // Final rounding and shift
3838 in[0] = _mm_adds_epi16(in[0], final_rounding);
3839 in[1] = _mm_adds_epi16(in[1], final_rounding);
3840 in[2] = _mm_adds_epi16(in[2], final_rounding);
3841 in[3] = _mm_adds_epi16(in[3], final_rounding);
3842 in[4] = _mm_adds_epi16(in[4], final_rounding);
3843 in[5] = _mm_adds_epi16(in[5], final_rounding);
3844 in[6] = _mm_adds_epi16(in[6], final_rounding);
3845 in[7] = _mm_adds_epi16(in[7], final_rounding);
3846 in[8] = _mm_adds_epi16(in[8], final_rounding);
3847 in[9] = _mm_adds_epi16(in[9], final_rounding);
3848 in[10] = _mm_adds_epi16(in[10], final_rounding);
3849 in[11] = _mm_adds_epi16(in[11], final_rounding);
3850 in[12] = _mm_adds_epi16(in[12], final_rounding);
3851 in[13] = _mm_adds_epi16(in[13], final_rounding);
3852 in[14] = _mm_adds_epi16(in[14], final_rounding);
3853 in[15] = _mm_adds_epi16(in[15], final_rounding);
3854 in[16] = _mm_adds_epi16(in[16], final_rounding);
3855 in[17] = _mm_adds_epi16(in[17], final_rounding);
3856 in[18] = _mm_adds_epi16(in[18], final_rounding);
3857 in[19] = _mm_adds_epi16(in[19], final_rounding);
3858 in[20] = _mm_adds_epi16(in[20], final_rounding);
3859 in[21] = _mm_adds_epi16(in[21], final_rounding);
3860 in[22] = _mm_adds_epi16(in[22], final_rounding);
3861 in[23] = _mm_adds_epi16(in[23], final_rounding);
3862 in[24] = _mm_adds_epi16(in[24], final_rounding);
3863 in[25] = _mm_adds_epi16(in[25], final_rounding);
3864 in[26] = _mm_adds_epi16(in[26], final_rounding);
3865 in[27] = _mm_adds_epi16(in[27], final_rounding);
3866 in[28] = _mm_adds_epi16(in[28], final_rounding);
3867 in[29] = _mm_adds_epi16(in[29], final_rounding);
3868 in[30] = _mm_adds_epi16(in[30], final_rounding);
3869 in[31] = _mm_adds_epi16(in[31], final_rounding);
3871 in[0] = _mm_srai_epi16(in[0], 6);
3872 in[1] = _mm_srai_epi16(in[1], 6);
3873 in[2] = _mm_srai_epi16(in[2], 6);
3874 in[3] = _mm_srai_epi16(in[3], 6);
3875 in[4] = _mm_srai_epi16(in[4], 6);
3876 in[5] = _mm_srai_epi16(in[5], 6);
3877 in[6] = _mm_srai_epi16(in[6], 6);
3878 in[7] = _mm_srai_epi16(in[7], 6);
3879 in[8] = _mm_srai_epi16(in[8], 6);
3880 in[9] = _mm_srai_epi16(in[9], 6);
3881 in[10] = _mm_srai_epi16(in[10], 6);
3882 in[11] = _mm_srai_epi16(in[11], 6);
3883 in[12] = _mm_srai_epi16(in[12], 6);
3884 in[13] = _mm_srai_epi16(in[13], 6);
3885 in[14] = _mm_srai_epi16(in[14], 6);
3886 in[15] = _mm_srai_epi16(in[15], 6);
3887 in[16] = _mm_srai_epi16(in[16], 6);
3888 in[17] = _mm_srai_epi16(in[17], 6);
3889 in[18] = _mm_srai_epi16(in[18], 6);
3890 in[19] = _mm_srai_epi16(in[19], 6);
3891 in[20] = _mm_srai_epi16(in[20], 6);
3892 in[21] = _mm_srai_epi16(in[21], 6);
3893 in[22] = _mm_srai_epi16(in[22], 6);
3894 in[23] = _mm_srai_epi16(in[23], 6);
3895 in[24] = _mm_srai_epi16(in[24], 6);
3896 in[25] = _mm_srai_epi16(in[25], 6);
3897 in[26] = _mm_srai_epi16(in[26], 6);
3898 in[27] = _mm_srai_epi16(in[27], 6);
3899 in[28] = _mm_srai_epi16(in[28], 6);
3900 in[29] = _mm_srai_epi16(in[29], 6);
3901 in[30] = _mm_srai_epi16(in[30], 6);
3902 in[31] = _mm_srai_epi16(in[31], 6);
// Add each residual row into the prediction at `dest` and store the
// saturated 8-bit result; RECON_AND_STORE presumably advances `dest`
// by `stride` per call (matches the rewind arithmetic below) —
// NOTE(review): macro body not visible here, confirm in the header.
3904 RECON_AND_STORE(dest, in[0]);
3905 RECON_AND_STORE(dest, in[1]);
3906 RECON_AND_STORE(dest, in[2]);
3907 RECON_AND_STORE(dest, in[3]);
3908 RECON_AND_STORE(dest, in[4]);
3909 RECON_AND_STORE(dest, in[5]);
3910 RECON_AND_STORE(dest, in[6]);
3911 RECON_AND_STORE(dest, in[7]);
3912 RECON_AND_STORE(dest, in[8]);
3913 RECON_AND_STORE(dest, in[9]);
3914 RECON_AND_STORE(dest, in[10]);
3915 RECON_AND_STORE(dest, in[11]);
3916 RECON_AND_STORE(dest, in[12]);
3917 RECON_AND_STORE(dest, in[13]);
3918 RECON_AND_STORE(dest, in[14]);
3919 RECON_AND_STORE(dest, in[15]);
3920 RECON_AND_STORE(dest, in[16]);
3921 RECON_AND_STORE(dest, in[17]);
3922 RECON_AND_STORE(dest, in[18]);
3923 RECON_AND_STORE(dest, in[19]);
3924 RECON_AND_STORE(dest, in[20]);
3925 RECON_AND_STORE(dest, in[21]);
3926 RECON_AND_STORE(dest, in[22]);
3927 RECON_AND_STORE(dest, in[23]);
3928 RECON_AND_STORE(dest, in[24]);
3929 RECON_AND_STORE(dest, in[25]);
3930 RECON_AND_STORE(dest, in[26]);
3931 RECON_AND_STORE(dest, in[27]);
3932 RECON_AND_STORE(dest, in[28]);
3933 RECON_AND_STORE(dest, in[29]);
3934 RECON_AND_STORE(dest, in[30]);
3935 RECON_AND_STORE(dest, in[31]);
// Rewind 32 rows and step 8 pixels right to the next column band.
3937 dest += 8 - (stride * 32);
3941 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3943 const __m128i zero = _mm_setzero_si128();
3946 a = dct_const_round_shift(input[0] * cospi_16_64);
3947 a = dct_const_round_shift(a * cospi_16_64);
3948 a = ROUND_POWER_OF_TWO(a, 6);
3950 dc_value = _mm_set1_epi16(a);
3952 for (i = 0; i < 4; ++i) {
3953 RECON_AND_STORE(dest, dc_value);
3954 RECON_AND_STORE(dest, dc_value);
3955 RECON_AND_STORE(dest, dc_value);
3956 RECON_AND_STORE(dest, dc_value);
3957 RECON_AND_STORE(dest, dc_value);
3958 RECON_AND_STORE(dest, dc_value);
3959 RECON_AND_STORE(dest, dc_value);
3960 RECON_AND_STORE(dest, dc_value);
3961 RECON_AND_STORE(dest, dc_value);
3962 RECON_AND_STORE(dest, dc_value);
3963 RECON_AND_STORE(dest, dc_value);
3964 RECON_AND_STORE(dest, dc_value);
3965 RECON_AND_STORE(dest, dc_value);
3966 RECON_AND_STORE(dest, dc_value);
3967 RECON_AND_STORE(dest, dc_value);
3968 RECON_AND_STORE(dest, dc_value);
3969 RECON_AND_STORE(dest, dc_value);
3970 RECON_AND_STORE(dest, dc_value);
3971 RECON_AND_STORE(dest, dc_value);
3972 RECON_AND_STORE(dest, dc_value);
3973 RECON_AND_STORE(dest, dc_value);
3974 RECON_AND_STORE(dest, dc_value);
3975 RECON_AND_STORE(dest, dc_value);
3976 RECON_AND_STORE(dest, dc_value);
3977 RECON_AND_STORE(dest, dc_value);
3978 RECON_AND_STORE(dest, dc_value);
3979 RECON_AND_STORE(dest, dc_value);
3980 RECON_AND_STORE(dest, dc_value);
3981 RECON_AND_STORE(dest, dc_value);
3982 RECON_AND_STORE(dest, dc_value);
3983 RECON_AND_STORE(dest, dc_value);
3984 RECON_AND_STORE(dest, dc_value);
3985 dest += 8 - (stride * 32);