/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
// Adds one 4-pixel-wide row of 16-bit residual (in_x, low 4 lanes used) to
// the 4 bytes at `dest`, saturates back to 8 bits, stores, and advances
// `dest` by one row.  Requires `zero` (an all-zero __m128i) and `stride`
// to be in scope at the call site.
#define RECON_AND_STORE4X4(dest, in_x) \
  {                                                     \
      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
      d0 = _mm_unpacklo_epi8(d0, zero); \
      d0 = _mm_add_epi16(in_x, d0); \
      d0 = _mm_packus_epi16(d0, d0); \
      *(int *)dest = _mm_cvtsi128_si32(d0); \
      dest += stride; \
  }
28 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29 const __m128i zero = _mm_setzero_si128();
30 const __m128i eight = _mm_set1_epi16(8);
31 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32 (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33 (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36 __m128i input0, input1, input2, input3;
39 input0 = _mm_load_si128((const __m128i *)input);
40 input2 = _mm_load_si128((const __m128i *)(input + 8));
42 // Construct i3, i1, i3, i1, i2, i0, i2, i0
43 input0 = _mm_shufflelo_epi16(input0, 0xd8);
44 input0 = _mm_shufflehi_epi16(input0, 0xd8);
45 input2 = _mm_shufflelo_epi16(input2, 0xd8);
46 input2 = _mm_shufflehi_epi16(input2, 0xd8);
48 input1 = _mm_unpackhi_epi32(input0, input0);
49 input0 = _mm_unpacklo_epi32(input0, input0);
50 input3 = _mm_unpackhi_epi32(input2, input2);
51 input2 = _mm_unpacklo_epi32(input2, input2);
54 input0 = _mm_madd_epi16(input0, cst);
55 input1 = _mm_madd_epi16(input1, cst);
56 input2 = _mm_madd_epi16(input2, cst);
57 input3 = _mm_madd_epi16(input3, cst);
59 input0 = _mm_add_epi32(input0, rounding);
60 input1 = _mm_add_epi32(input1, rounding);
61 input2 = _mm_add_epi32(input2, rounding);
62 input3 = _mm_add_epi32(input3, rounding);
64 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
70 input0 = _mm_packs_epi32(input0, input1);
71 input1 = _mm_packs_epi32(input2, input3);
74 input2 = _mm_unpacklo_epi16(input0, input1);
75 input3 = _mm_unpackhi_epi16(input0, input1);
76 input0 = _mm_unpacklo_epi32(input2, input3);
77 input1 = _mm_unpackhi_epi32(input2, input3);
79 // Switch column2, column 3, and then, we got:
80 // input2: column1, column 0; input3: column2, column 3.
81 input1 = _mm_shuffle_epi32(input1, 0x4e);
82 input2 = _mm_add_epi16(input0, input1);
83 input3 = _mm_sub_epi16(input0, input1);
86 // Construct i3, i1, i3, i1, i2, i0, i2, i0
87 input0 = _mm_unpacklo_epi32(input2, input2);
88 input1 = _mm_unpackhi_epi32(input2, input2);
89 input2 = _mm_unpackhi_epi32(input3, input3);
90 input3 = _mm_unpacklo_epi32(input3, input3);
93 input0 = _mm_madd_epi16(input0, cst);
94 input1 = _mm_madd_epi16(input1, cst);
95 input2 = _mm_madd_epi16(input2, cst);
96 input3 = _mm_madd_epi16(input3, cst);
98 input0 = _mm_add_epi32(input0, rounding);
99 input1 = _mm_add_epi32(input1, rounding);
100 input2 = _mm_add_epi32(input2, rounding);
101 input3 = _mm_add_epi32(input3, rounding);
103 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
109 input0 = _mm_packs_epi32(input0, input2);
110 input1 = _mm_packs_epi32(input1, input3);
113 input2 = _mm_unpacklo_epi16(input0, input1);
114 input3 = _mm_unpackhi_epi16(input0, input1);
115 input0 = _mm_unpacklo_epi32(input2, input3);
116 input1 = _mm_unpackhi_epi32(input2, input3);
118 // Switch column2, column 3, and then, we got:
119 // input2: column1, column 0; input3: column2, column 3.
120 input1 = _mm_shuffle_epi32(input1, 0x4e);
121 input2 = _mm_add_epi16(input0, input1);
122 input3 = _mm_sub_epi16(input0, input1);
124 // Final round and shift
125 input2 = _mm_add_epi16(input2, eight);
126 input3 = _mm_add_epi16(input3, eight);
128 input2 = _mm_srai_epi16(input2, 4);
129 input3 = _mm_srai_epi16(input3, 4);
131 // Reconstruction and Store
133 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
134 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
135 d0 = _mm_unpacklo_epi32(d0,
136 _mm_cvtsi32_si128(*(const int *) (dest + stride)));
137 d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
138 *(const int *) (dest + stride * 3)), d2);
139 d0 = _mm_unpacklo_epi8(d0, zero);
140 d2 = _mm_unpacklo_epi8(d2, zero);
141 d0 = _mm_add_epi16(d0, input2);
142 d2 = _mm_add_epi16(d2, input3);
143 d0 = _mm_packus_epi16(d0, d2);
145 *(int *)dest = _mm_cvtsi128_si32(d0);
147 d0 = _mm_srli_si128(d0, 4);
148 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
150 d0 = _mm_srli_si128(d0, 4);
151 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
153 d0 = _mm_srli_si128(d0, 4);
154 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
158 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
160 const __m128i zero = _mm_setzero_si128();
163 a = dct_const_round_shift(input[0] * cospi_16_64);
164 a = dct_const_round_shift(a * cospi_16_64);
165 a = ROUND_POWER_OF_TWO(a, 4);
167 dc_value = _mm_set1_epi16(a);
169 RECON_AND_STORE4X4(dest, dc_value);
170 RECON_AND_STORE4X4(dest, dc_value);
171 RECON_AND_STORE4X4(dest, dc_value);
172 RECON_AND_STORE4X4(dest, dc_value);
175 static INLINE void transpose_4x4(__m128i *res) {
176 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
177 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
179 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
180 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
183 static void idct4_1d_sse2(__m128i *in) {
184 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
185 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
186 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
187 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
188 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
193 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
194 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
195 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
196 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
197 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
198 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
200 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
201 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
202 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
203 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
205 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
206 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
207 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
208 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
210 u[0] = _mm_packs_epi32(v[0], v[1]);
211 u[1] = _mm_packs_epi32(v[3], v[2]);
214 in[0] = _mm_add_epi16(u[0], u[1]);
215 in[1] = _mm_sub_epi16(u[0], u[1]);
216 in[1] = _mm_shuffle_epi32(in[1], 0x4E);
219 static void iadst4_1d_sse2(__m128i *in) {
220 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
221 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
222 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
223 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
224 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
225 const __m128i kZero = _mm_set1_epi16(0);
226 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
227 __m128i u[8], v[8], in7;
230 in7 = _mm_srli_si128(in[1], 8);
231 in7 = _mm_add_epi16(in7, in[0]);
232 in7 = _mm_sub_epi16(in7, in[1]);
234 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
235 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
236 u[2] = _mm_unpacklo_epi16(in7, kZero);
237 u[3] = _mm_unpackhi_epi16(in[0], kZero);
239 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
240 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
241 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
242 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
243 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
244 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
246 u[0] = _mm_add_epi32(v[0], v[1]);
247 u[1] = _mm_add_epi32(v[3], v[4]);
249 u[3] = _mm_add_epi32(u[0], u[1]);
250 u[4] = _mm_slli_epi32(v[5], 2);
251 u[5] = _mm_add_epi32(u[3], v[5]);
252 u[6] = _mm_sub_epi32(u[5], u[4]);
254 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
255 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
256 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
257 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
259 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
260 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
261 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
262 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
264 in[0] = _mm_packs_epi32(u[0], u[1]);
265 in[1] = _mm_packs_epi32(u[2], u[3]);
268 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
271 const __m128i zero = _mm_setzero_si128();
272 const __m128i eight = _mm_set1_epi16(8);
274 in[0]= _mm_loadu_si128((const __m128i *)(input));
275 in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
299 // Final round and shift
300 in[0] = _mm_add_epi16(in[0], eight);
301 in[1] = _mm_add_epi16(in[1], eight);
303 in[0] = _mm_srai_epi16(in[0], 4);
304 in[1] = _mm_srai_epi16(in[1], 4);
306 // Reconstruction and Store
308 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
309 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
310 d0 = _mm_unpacklo_epi32(d0,
311 _mm_cvtsi32_si128(*(const int *) (dest + stride)));
312 d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
313 *(const int *) (dest + stride * 3)));
314 d0 = _mm_unpacklo_epi8(d0, zero);
315 d2 = _mm_unpacklo_epi8(d2, zero);
316 d0 = _mm_add_epi16(d0, in[0]);
317 d2 = _mm_add_epi16(d2, in[1]);
318 d0 = _mm_packus_epi16(d0, d2);
320 *(int *)dest = _mm_cvtsi128_si32(d0);
322 d0 = _mm_srli_si128(d0, 4);
323 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
325 d0 = _mm_srli_si128(d0, 4);
326 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
328 d0 = _mm_srli_si128(d0, 4);
329 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
// Transposes an 8x8 int16 matrix held one row per register (in0..in7)
// into out0..out7.  in and out names may alias.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                        \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }
// Transposes the first 4 lanes of eight row registers (a 4-wide, 8-tall
// matrix) into out0..out3; out4..out7 are cleared.  Requires `zero` in
// scope at the call site.
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
                                                        \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }
// Transposes an 8-wide, 4-tall int16 matrix in place: in0..in3 are the
// four row registers on entry; on exit each holds two transposed columns.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                        \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }
// Define Macro for multiplying elements by constants and adding them together.
// Performs two full butterfly pairs: res0/res1 from (lo_0,hi_0) x cst0/cst1,
// res2/res3 from (lo_1,hi_1) x cst2/cst3, each rounded and shifted by
// DCT_CONST_BITS.  Requires tmp0..tmp7 and `rounding` in scope.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
// Single-butterfly variant of MULTIPLICATION_AND_ADD: res0/res1 from
// (lo_0, hi_0) multiplied by cst0/cst1, rounded and shifted.
// Requires tmp0..tmp3 and `rounding` in scope.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
// 4-stage 1-D 8-point idct over row registers in0..in7.  Expects the
// stg1_*/stg2_* constants, rounding, tmp0..tmp7 and stp1_*/stp2_*
// temporaries to be declared in the enclosing scope.
#define IDCT8_1D \
  /* Stage1 */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4,     \
                           stp1_7, stp1_5, stp1_6)             \
  } \
    \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
    \
  /* Stage4  */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);
// Adds one 8-pixel row of 16-bit residual (in_x) to the 8 bytes at `dest`,
// saturates to 8 bits, stores, and advances `dest` by one row.  Requires
// `zero` and `stride` in scope at the call site.
#define RECON_AND_STORE(dest, in_x) \
  {                                                     \
     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
     d0 = _mm_unpacklo_epi8(d0, zero); \
     d0 = _mm_add_epi16(in_x, d0); \
     d0 = _mm_packus_epi16(d0, d0); \
     _mm_storel_epi64((__m128i *)(dest), d0); \
     dest += stride; \
  }
534 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
535 const __m128i zero = _mm_setzero_si128();
536 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
537 const __m128i final_rounding = _mm_set1_epi16(1<<4);
538 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
539 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
540 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
541 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
542 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
543 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
544 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
545 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
547 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
548 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
549 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
550 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
554 in0 = _mm_load_si128((const __m128i *)input);
555 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
556 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
557 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
558 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
559 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
560 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
561 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
564 for (i = 0; i < 2; i++) {
565 // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
566 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
569 // 4-stage 1D idct8x8
573 // Final rounding and shift
574 in0 = _mm_adds_epi16(in0, final_rounding);
575 in1 = _mm_adds_epi16(in1, final_rounding);
576 in2 = _mm_adds_epi16(in2, final_rounding);
577 in3 = _mm_adds_epi16(in3, final_rounding);
578 in4 = _mm_adds_epi16(in4, final_rounding);
579 in5 = _mm_adds_epi16(in5, final_rounding);
580 in6 = _mm_adds_epi16(in6, final_rounding);
581 in7 = _mm_adds_epi16(in7, final_rounding);
583 in0 = _mm_srai_epi16(in0, 5);
584 in1 = _mm_srai_epi16(in1, 5);
585 in2 = _mm_srai_epi16(in2, 5);
586 in3 = _mm_srai_epi16(in3, 5);
587 in4 = _mm_srai_epi16(in4, 5);
588 in5 = _mm_srai_epi16(in5, 5);
589 in6 = _mm_srai_epi16(in6, 5);
590 in7 = _mm_srai_epi16(in7, 5);
592 RECON_AND_STORE(dest, in0);
593 RECON_AND_STORE(dest, in1);
594 RECON_AND_STORE(dest, in2);
595 RECON_AND_STORE(dest, in3);
596 RECON_AND_STORE(dest, in4);
597 RECON_AND_STORE(dest, in5);
598 RECON_AND_STORE(dest, in6);
599 RECON_AND_STORE(dest, in7);
602 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
604 const __m128i zero = _mm_setzero_si128();
607 a = dct_const_round_shift(input[0] * cospi_16_64);
608 a = dct_const_round_shift(a * cospi_16_64);
609 a = ROUND_POWER_OF_TWO(a, 5);
611 dc_value = _mm_set1_epi16(a);
613 RECON_AND_STORE(dest, dc_value);
614 RECON_AND_STORE(dest, dc_value);
615 RECON_AND_STORE(dest, dc_value);
616 RECON_AND_STORE(dest, dc_value);
617 RECON_AND_STORE(dest, dc_value);
618 RECON_AND_STORE(dest, dc_value);
619 RECON_AND_STORE(dest, dc_value);
620 RECON_AND_STORE(dest, dc_value);
623 // perform 8x8 transpose
624 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
625 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
626 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
627 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
628 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
629 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
630 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
631 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
632 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
634 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
635 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
636 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
637 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
638 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
639 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
640 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
641 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
643 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
644 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
645 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
646 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
647 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
648 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
649 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
650 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
653 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
654 const __m128i zero = _mm_setzero_si128();
655 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
656 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
657 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
658 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
660 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
661 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
662 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
663 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
665 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
666 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
667 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
668 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
669 out[4] = out[5] = out[6] = out[7] = zero;
672 static void idct8_1d_sse2(__m128i *in) {
673 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
674 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
675 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
676 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
677 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
678 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
679 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
680 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
681 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
683 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
684 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
685 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
686 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
697 // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
698 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
701 // 4-stage 1D idct8x8
713 static void iadst8_1d_sse2(__m128i *in) {
714 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
715 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
716 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
717 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
718 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
719 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
720 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
721 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
722 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
723 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
724 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
725 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
726 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
727 const __m128i k__const_0 = _mm_set1_epi16(0);
728 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
730 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
731 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
732 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
733 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
734 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
737 array_transpose_8x8(in, in);
739 // properly aligned for butterfly input
749 // column transformation
751 // interleave and multiply/add into 32-bit integer
752 s0 = _mm_unpacklo_epi16(in0, in1);
753 s1 = _mm_unpackhi_epi16(in0, in1);
754 s2 = _mm_unpacklo_epi16(in2, in3);
755 s3 = _mm_unpackhi_epi16(in2, in3);
756 s4 = _mm_unpacklo_epi16(in4, in5);
757 s5 = _mm_unpackhi_epi16(in4, in5);
758 s6 = _mm_unpacklo_epi16(in6, in7);
759 s7 = _mm_unpackhi_epi16(in6, in7);
761 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
762 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
763 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
764 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
765 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
766 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
767 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
768 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
769 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
770 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
771 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
772 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
773 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
774 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
775 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
776 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
779 w0 = _mm_add_epi32(u0, u8);
780 w1 = _mm_add_epi32(u1, u9);
781 w2 = _mm_add_epi32(u2, u10);
782 w3 = _mm_add_epi32(u3, u11);
783 w4 = _mm_add_epi32(u4, u12);
784 w5 = _mm_add_epi32(u5, u13);
785 w6 = _mm_add_epi32(u6, u14);
786 w7 = _mm_add_epi32(u7, u15);
787 w8 = _mm_sub_epi32(u0, u8);
788 w9 = _mm_sub_epi32(u1, u9);
789 w10 = _mm_sub_epi32(u2, u10);
790 w11 = _mm_sub_epi32(u3, u11);
791 w12 = _mm_sub_epi32(u4, u12);
792 w13 = _mm_sub_epi32(u5, u13);
793 w14 = _mm_sub_epi32(u6, u14);
794 w15 = _mm_sub_epi32(u7, u15);
796 // shift and rounding
797 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
798 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
799 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
800 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
801 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
802 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
803 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
804 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
805 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
806 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
807 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
808 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
809 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
810 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
811 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
812 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
814 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
815 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
816 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
817 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
818 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
819 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
820 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
821 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
822 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
823 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
824 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
825 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
826 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
827 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
828 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
829 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
831 // back to 16-bit and pack 8 integers into __m128i
832 in[0] = _mm_packs_epi32(u0, u1);
833 in[1] = _mm_packs_epi32(u2, u3);
834 in[2] = _mm_packs_epi32(u4, u5);
835 in[3] = _mm_packs_epi32(u6, u7);
836 in[4] = _mm_packs_epi32(u8, u9);
837 in[5] = _mm_packs_epi32(u10, u11);
838 in[6] = _mm_packs_epi32(u12, u13);
839 in[7] = _mm_packs_epi32(u14, u15);
842 s0 = _mm_add_epi16(in[0], in[2]);
843 s1 = _mm_add_epi16(in[1], in[3]);
844 s2 = _mm_sub_epi16(in[0], in[2]);
845 s3 = _mm_sub_epi16(in[1], in[3]);
846 u0 = _mm_unpacklo_epi16(in[4], in[5]);
847 u1 = _mm_unpackhi_epi16(in[4], in[5]);
848 u2 = _mm_unpacklo_epi16(in[6], in[7]);
849 u3 = _mm_unpackhi_epi16(in[6], in[7]);
851 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
852 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
853 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
854 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
855 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
856 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
857 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
858 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
860 w0 = _mm_add_epi32(v0, v4);
861 w1 = _mm_add_epi32(v1, v5);
862 w2 = _mm_add_epi32(v2, v6);
863 w3 = _mm_add_epi32(v3, v7);
864 w4 = _mm_sub_epi32(v0, v4);
865 w5 = _mm_sub_epi32(v1, v5);
866 w6 = _mm_sub_epi32(v2, v6);
867 w7 = _mm_sub_epi32(v3, v7);
869 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
870 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
871 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
872 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
873 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
874 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
875 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
876 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
878 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
879 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
880 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
881 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
882 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
883 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
884 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
885 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
887 // back to 16-bit intergers
888 s4 = _mm_packs_epi32(u0, u1);
889 s5 = _mm_packs_epi32(u2, u3);
890 s6 = _mm_packs_epi32(u4, u5);
891 s7 = _mm_packs_epi32(u6, u7);
894 u0 = _mm_unpacklo_epi16(s2, s3);
895 u1 = _mm_unpackhi_epi16(s2, s3);
896 u2 = _mm_unpacklo_epi16(s6, s7);
897 u3 = _mm_unpackhi_epi16(s6, s7);
899 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
900 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
901 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
902 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
903 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
904 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
905 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
906 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
908 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
909 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
910 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
911 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
912 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
913 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
914 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
915 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
917 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
918 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
919 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
920 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
921 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
922 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
923 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
924 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
926 s2 = _mm_packs_epi32(v0, v1);
927 s3 = _mm_packs_epi32(v2, v3);
928 s6 = _mm_packs_epi32(v4, v5);
929 s7 = _mm_packs_epi32(v6, v7);
932 in[1] = _mm_sub_epi16(k__const_0, s4);
934 in[3] = _mm_sub_epi16(k__const_0, s2);
936 in[5] = _mm_sub_epi16(k__const_0, s7);
938 in[7] = _mm_sub_epi16(k__const_0, s1);
942 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
945 const __m128i zero = _mm_setzero_si128();
946 const __m128i final_rounding = _mm_set1_epi16(1<<4);
949 in[0] = _mm_load_si128((const __m128i *)input);
950 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
951 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
952 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
953 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
954 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
955 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
956 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
980 // Final rounding and shift
981 in[0] = _mm_adds_epi16(in[0], final_rounding);
982 in[1] = _mm_adds_epi16(in[1], final_rounding);
983 in[2] = _mm_adds_epi16(in[2], final_rounding);
984 in[3] = _mm_adds_epi16(in[3], final_rounding);
985 in[4] = _mm_adds_epi16(in[4], final_rounding);
986 in[5] = _mm_adds_epi16(in[5], final_rounding);
987 in[6] = _mm_adds_epi16(in[6], final_rounding);
988 in[7] = _mm_adds_epi16(in[7], final_rounding);
990 in[0] = _mm_srai_epi16(in[0], 5);
991 in[1] = _mm_srai_epi16(in[1], 5);
992 in[2] = _mm_srai_epi16(in[2], 5);
993 in[3] = _mm_srai_epi16(in[3], 5);
994 in[4] = _mm_srai_epi16(in[4], 5);
995 in[5] = _mm_srai_epi16(in[5], 5);
996 in[6] = _mm_srai_epi16(in[6], 5);
997 in[7] = _mm_srai_epi16(in[7], 5);
999 RECON_AND_STORE(dest, in[0]);
1000 RECON_AND_STORE(dest, in[1]);
1001 RECON_AND_STORE(dest, in[2]);
1002 RECON_AND_STORE(dest, in[3]);
1003 RECON_AND_STORE(dest, in[4]);
1004 RECON_AND_STORE(dest, in[5]);
1005 RECON_AND_STORE(dest, in[6]);
1006 RECON_AND_STORE(dest, in[7]);
1009 void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1010 const __m128i zero = _mm_setzero_si128();
1011 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1012 const __m128i final_rounding = _mm_set1_epi16(1<<4);
1013 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1014 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1015 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1016 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
1017 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1018 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1019 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1020 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1021 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1023 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1024 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
1025 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
1026 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1028 // Rows. Load 4-row input data.
1029 in0 = _mm_load_si128((const __m128i *)input);
1030 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
1031 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
1032 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
1035 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
1039 const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
1040 const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
1042 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
1043 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
1044 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
1045 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
1047 tmp0 = _mm_add_epi32(tmp0, rounding);
1048 tmp2 = _mm_add_epi32(tmp2, rounding);
1049 tmp4 = _mm_add_epi32(tmp4, rounding);
1050 tmp6 = _mm_add_epi32(tmp6, rounding);
1051 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1052 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1053 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1054 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1056 stp1_4 = _mm_packs_epi32(tmp0, zero);
1057 stp1_7 = _mm_packs_epi32(tmp2, zero);
1058 stp1_5 = _mm_packs_epi32(tmp4, zero);
1059 stp1_6 = _mm_packs_epi32(tmp6, zero);
1064 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
1065 const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
1067 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1068 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1069 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1070 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1072 tmp0 = _mm_add_epi32(tmp0, rounding);
1073 tmp2 = _mm_add_epi32(tmp2, rounding);
1074 tmp4 = _mm_add_epi32(tmp4, rounding);
1075 tmp6 = _mm_add_epi32(tmp6, rounding);
1076 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1077 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1078 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1079 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1081 stp2_0 = _mm_packs_epi32(tmp0, zero);
1082 stp2_1 = _mm_packs_epi32(tmp2, zero);
1083 stp2_2 = _mm_packs_epi32(tmp4, zero);
1084 stp2_3 = _mm_packs_epi32(tmp6, zero);
1086 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
1087 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
1088 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
1089 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
1094 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1095 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
1096 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
1097 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
1098 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
1100 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1101 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
1103 tmp0 = _mm_add_epi32(tmp0, rounding);
1104 tmp2 = _mm_add_epi32(tmp2, rounding);
1105 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1106 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1108 stp1_5 = _mm_packs_epi32(tmp0, zero);
1109 stp1_6 = _mm_packs_epi32(tmp2, zero);
1113 in0 = _mm_adds_epi16(stp1_0, stp2_7);
1114 in1 = _mm_adds_epi16(stp1_1, stp1_6);
1115 in2 = _mm_adds_epi16(stp1_2, stp1_5);
1116 in3 = _mm_adds_epi16(stp1_3, stp2_4);
1117 in4 = _mm_subs_epi16(stp1_3, stp2_4);
1118 in5 = _mm_subs_epi16(stp1_2, stp1_5);
1119 in6 = _mm_subs_epi16(stp1_1, stp1_6);
1120 in7 = _mm_subs_epi16(stp1_0, stp2_7);
1122 // Columns. 4x8 Transpose
1123 TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1129 // Final rounding and shift
1130 in0 = _mm_adds_epi16(in0, final_rounding);
1131 in1 = _mm_adds_epi16(in1, final_rounding);
1132 in2 = _mm_adds_epi16(in2, final_rounding);
1133 in3 = _mm_adds_epi16(in3, final_rounding);
1134 in4 = _mm_adds_epi16(in4, final_rounding);
1135 in5 = _mm_adds_epi16(in5, final_rounding);
1136 in6 = _mm_adds_epi16(in6, final_rounding);
1137 in7 = _mm_adds_epi16(in7, final_rounding);
1139 in0 = _mm_srai_epi16(in0, 5);
1140 in1 = _mm_srai_epi16(in1, 5);
1141 in2 = _mm_srai_epi16(in2, 5);
1142 in3 = _mm_srai_epi16(in3, 5);
1143 in4 = _mm_srai_epi16(in4, 5);
1144 in5 = _mm_srai_epi16(in5, 5);
1145 in6 = _mm_srai_epi16(in6, 5);
1146 in7 = _mm_srai_epi16(in7, 5);
1148 RECON_AND_STORE(dest, in0);
1149 RECON_AND_STORE(dest, in1);
1150 RECON_AND_STORE(dest, in2);
1151 RECON_AND_STORE(dest, in3);
1152 RECON_AND_STORE(dest, in4);
1153 RECON_AND_STORE(dest, in5);
1154 RECON_AND_STORE(dest, in6);
1155 RECON_AND_STORE(dest, in7);
1161 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
1162 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
1163 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
1164 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
1165 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
1166 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
1167 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
1168 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
1170 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1171 stg2_0, stg2_1, stg2_2, stg2_3, \
1172 stp2_8, stp2_15, stp2_9, stp2_14) \
1174 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1175 stg2_4, stg2_5, stg2_6, stg2_7, \
1176 stp2_10, stp2_13, stp2_11, stp2_12) \
1181 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
1182 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
1183 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
1184 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
1186 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1187 stg3_0, stg3_1, stg3_2, stg3_3, \
1188 stp1_4, stp1_7, stp1_5, stp1_6) \
1190 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
1191 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
1192 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1193 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1195 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1196 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1197 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1198 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
1203 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
1204 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
1205 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
1206 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
1208 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1209 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1210 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1211 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1213 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1214 stg4_0, stg4_1, stg4_2, stg4_3, \
1215 stp2_0, stp2_1, stp2_2, stp2_3) \
1217 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1218 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1219 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1220 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1222 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1223 stg4_4, stg4_5, stg4_6, stg4_7, \
1224 stp2_9, stp2_14, stp2_10, stp2_13) \
1229 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1230 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1232 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1233 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1234 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1235 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1237 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1238 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1239 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1240 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1242 tmp0 = _mm_add_epi32(tmp0, rounding); \
1243 tmp1 = _mm_add_epi32(tmp1, rounding); \
1244 tmp2 = _mm_add_epi32(tmp2, rounding); \
1245 tmp3 = _mm_add_epi32(tmp3, rounding); \
1247 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1248 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1249 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1250 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1252 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1253 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1255 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
1256 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
1257 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
1258 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1260 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1261 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
1262 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
1263 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1268 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1269 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1270 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1271 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1273 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1274 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1275 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1276 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1277 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1278 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1279 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1280 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1282 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1283 stg6_0, stg4_0, stg6_0, stg4_0, \
1284 stp2_10, stp2_13, stp2_11, stp2_12) \
1287 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1289 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1290 const __m128i final_rounding = _mm_set1_epi16(1<<5);
1291 const __m128i zero = _mm_setzero_si128();
1293 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1294 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1295 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1296 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1297 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1298 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1299 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1300 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1302 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1303 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1304 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1305 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1307 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1308 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1309 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1310 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1311 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1312 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1313 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1314 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1316 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1318 __m128i in[16], l[16], r[16], *curr1;
1319 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1320 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1321 stp1_8_0, stp1_12_0;
1322 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1323 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1324 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1328 for (i = 0; i < 2; i++) {
1332 in[0] = _mm_load_si128((const __m128i *)input);
1333 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1334 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1335 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1336 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1337 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1338 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1339 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1340 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1341 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1342 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1343 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1344 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1345 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1346 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1347 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1349 array_transpose_8x8(in, in);
1350 array_transpose_8x8(in+8, in+8);
1355 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1356 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1357 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1358 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1359 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1360 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1361 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1362 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1363 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1364 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1365 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1366 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1367 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1368 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1369 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1370 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1375 for (i = 0; i < 2; i++) {
1377 array_transpose_8x8(l+i*8, in);
1378 array_transpose_8x8(r+i*8, in+8);
1383 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1384 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1385 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1386 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1387 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1388 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1389 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1390 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1391 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1392 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1393 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1394 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1395 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1396 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1397 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1398 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1400 // Final rounding and shift
1401 in[0] = _mm_adds_epi16(in[0], final_rounding);
1402 in[1] = _mm_adds_epi16(in[1], final_rounding);
1403 in[2] = _mm_adds_epi16(in[2], final_rounding);
1404 in[3] = _mm_adds_epi16(in[3], final_rounding);
1405 in[4] = _mm_adds_epi16(in[4], final_rounding);
1406 in[5] = _mm_adds_epi16(in[5], final_rounding);
1407 in[6] = _mm_adds_epi16(in[6], final_rounding);
1408 in[7] = _mm_adds_epi16(in[7], final_rounding);
1409 in[8] = _mm_adds_epi16(in[8], final_rounding);
1410 in[9] = _mm_adds_epi16(in[9], final_rounding);
1411 in[10] = _mm_adds_epi16(in[10], final_rounding);
1412 in[11] = _mm_adds_epi16(in[11], final_rounding);
1413 in[12] = _mm_adds_epi16(in[12], final_rounding);
1414 in[13] = _mm_adds_epi16(in[13], final_rounding);
1415 in[14] = _mm_adds_epi16(in[14], final_rounding);
1416 in[15] = _mm_adds_epi16(in[15], final_rounding);
1418 in[0] = _mm_srai_epi16(in[0], 6);
1419 in[1] = _mm_srai_epi16(in[1], 6);
1420 in[2] = _mm_srai_epi16(in[2], 6);
1421 in[3] = _mm_srai_epi16(in[3], 6);
1422 in[4] = _mm_srai_epi16(in[4], 6);
1423 in[5] = _mm_srai_epi16(in[5], 6);
1424 in[6] = _mm_srai_epi16(in[6], 6);
1425 in[7] = _mm_srai_epi16(in[7], 6);
1426 in[8] = _mm_srai_epi16(in[8], 6);
1427 in[9] = _mm_srai_epi16(in[9], 6);
1428 in[10] = _mm_srai_epi16(in[10], 6);
1429 in[11] = _mm_srai_epi16(in[11], 6);
1430 in[12] = _mm_srai_epi16(in[12], 6);
1431 in[13] = _mm_srai_epi16(in[13], 6);
1432 in[14] = _mm_srai_epi16(in[14], 6);
1433 in[15] = _mm_srai_epi16(in[15], 6);
1435 RECON_AND_STORE(dest, in[0]);
1436 RECON_AND_STORE(dest, in[1]);
1437 RECON_AND_STORE(dest, in[2]);
1438 RECON_AND_STORE(dest, in[3]);
1439 RECON_AND_STORE(dest, in[4]);
1440 RECON_AND_STORE(dest, in[5]);
1441 RECON_AND_STORE(dest, in[6]);
1442 RECON_AND_STORE(dest, in[7]);
1443 RECON_AND_STORE(dest, in[8]);
1444 RECON_AND_STORE(dest, in[9]);
1445 RECON_AND_STORE(dest, in[10]);
1446 RECON_AND_STORE(dest, in[11]);
1447 RECON_AND_STORE(dest, in[12]);
1448 RECON_AND_STORE(dest, in[13]);
1449 RECON_AND_STORE(dest, in[14]);
1450 RECON_AND_STORE(dest, in[15]);
1452 dest += 8 - (stride * 16);
1456 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1458 const __m128i zero = _mm_setzero_si128();
1461 a = dct_const_round_shift(input[0] * cospi_16_64);
1462 a = dct_const_round_shift(a * cospi_16_64);
1463 a = ROUND_POWER_OF_TWO(a, 6);
1465 dc_value = _mm_set1_epi16(a);
1467 for (i = 0; i < 2; ++i) {
1468 RECON_AND_STORE(dest, dc_value);
1469 RECON_AND_STORE(dest, dc_value);
1470 RECON_AND_STORE(dest, dc_value);
1471 RECON_AND_STORE(dest, dc_value);
1472 RECON_AND_STORE(dest, dc_value);
1473 RECON_AND_STORE(dest, dc_value);
1474 RECON_AND_STORE(dest, dc_value);
1475 RECON_AND_STORE(dest, dc_value);
1476 RECON_AND_STORE(dest, dc_value);
1477 RECON_AND_STORE(dest, dc_value);
1478 RECON_AND_STORE(dest, dc_value);
1479 RECON_AND_STORE(dest, dc_value);
1480 RECON_AND_STORE(dest, dc_value);
1481 RECON_AND_STORE(dest, dc_value);
1482 RECON_AND_STORE(dest, dc_value);
1483 RECON_AND_STORE(dest, dc_value);
1484 dest += 8 - (stride * 16);
1488 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
1490 array_transpose_8x8(res0, res0);
1491 array_transpose_8x8(res1, tbuf);
1492 array_transpose_8x8(res0 + 8, res1);
1493 array_transpose_8x8(res1 + 8, res1 + 8);
1505 static void iadst16_1d_8col(__m128i *in) {
1506 // perform 16x16 1-D ADST for 8 columns
1507 __m128i s[16], x[16], u[32], v[32];
1508 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1509 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1510 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1511 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1512 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1513 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1514 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1515 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1516 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1517 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1518 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1519 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1520 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1521 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1522 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1523 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1524 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1525 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1526 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1527 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1528 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1529 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1530 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1531 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1532 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1533 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1534 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1535 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1536 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1537 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1538 const __m128i kZero = _mm_set1_epi16(0);
1540 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1541 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1542 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1543 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1544 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1545 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1546 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1547 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1548 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1549 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1550 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1551 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1552 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1553 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1554 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1555 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1557 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1558 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1559 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1560 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1561 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1562 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1563 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1564 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1565 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1566 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1567 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1568 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1569 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1570 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1571 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1572 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1573 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1574 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1575 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1576 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1577 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1578 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1579 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1580 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1581 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1582 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1583 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1584 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1585 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1586 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1587 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1588 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1590 u[0] = _mm_add_epi32(v[0], v[16]);
1591 u[1] = _mm_add_epi32(v[1], v[17]);
1592 u[2] = _mm_add_epi32(v[2], v[18]);
1593 u[3] = _mm_add_epi32(v[3], v[19]);
1594 u[4] = _mm_add_epi32(v[4], v[20]);
1595 u[5] = _mm_add_epi32(v[5], v[21]);
1596 u[6] = _mm_add_epi32(v[6], v[22]);
1597 u[7] = _mm_add_epi32(v[7], v[23]);
1598 u[8] = _mm_add_epi32(v[8], v[24]);
1599 u[9] = _mm_add_epi32(v[9], v[25]);
1600 u[10] = _mm_add_epi32(v[10], v[26]);
1601 u[11] = _mm_add_epi32(v[11], v[27]);
1602 u[12] = _mm_add_epi32(v[12], v[28]);
1603 u[13] = _mm_add_epi32(v[13], v[29]);
1604 u[14] = _mm_add_epi32(v[14], v[30]);
1605 u[15] = _mm_add_epi32(v[15], v[31]);
1606 u[16] = _mm_sub_epi32(v[0], v[16]);
1607 u[17] = _mm_sub_epi32(v[1], v[17]);
1608 u[18] = _mm_sub_epi32(v[2], v[18]);
1609 u[19] = _mm_sub_epi32(v[3], v[19]);
1610 u[20] = _mm_sub_epi32(v[4], v[20]);
1611 u[21] = _mm_sub_epi32(v[5], v[21]);
1612 u[22] = _mm_sub_epi32(v[6], v[22]);
1613 u[23] = _mm_sub_epi32(v[7], v[23]);
1614 u[24] = _mm_sub_epi32(v[8], v[24]);
1615 u[25] = _mm_sub_epi32(v[9], v[25]);
1616 u[26] = _mm_sub_epi32(v[10], v[26]);
1617 u[27] = _mm_sub_epi32(v[11], v[27]);
1618 u[28] = _mm_sub_epi32(v[12], v[28]);
1619 u[29] = _mm_sub_epi32(v[13], v[29]);
1620 u[30] = _mm_sub_epi32(v[14], v[30]);
1621 u[31] = _mm_sub_epi32(v[15], v[31]);
1623 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1624 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1625 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1626 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1627 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1628 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1629 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1630 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1631 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1632 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1633 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1634 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1635 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1636 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1637 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1638 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1639 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1640 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1641 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1642 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1643 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1644 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1645 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1646 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1647 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1648 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1649 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1650 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1651 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1652 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1653 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1654 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1656 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1657 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1658 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1659 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1660 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1661 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1662 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1663 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1664 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1665 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1666 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1667 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1668 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1669 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1670 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1671 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1672 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1673 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1674 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1675 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1676 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1677 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1678 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1679 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1680 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1681 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1682 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1683 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1684 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1685 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1686 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1687 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1689 s[0] = _mm_packs_epi32(u[0], u[1]);
1690 s[1] = _mm_packs_epi32(u[2], u[3]);
1691 s[2] = _mm_packs_epi32(u[4], u[5]);
1692 s[3] = _mm_packs_epi32(u[6], u[7]);
1693 s[4] = _mm_packs_epi32(u[8], u[9]);
1694 s[5] = _mm_packs_epi32(u[10], u[11]);
1695 s[6] = _mm_packs_epi32(u[12], u[13]);
1696 s[7] = _mm_packs_epi32(u[14], u[15]);
1697 s[8] = _mm_packs_epi32(u[16], u[17]);
1698 s[9] = _mm_packs_epi32(u[18], u[19]);
1699 s[10] = _mm_packs_epi32(u[20], u[21]);
1700 s[11] = _mm_packs_epi32(u[22], u[23]);
1701 s[12] = _mm_packs_epi32(u[24], u[25]);
1702 s[13] = _mm_packs_epi32(u[26], u[27]);
1703 s[14] = _mm_packs_epi32(u[28], u[29]);
1704 s[15] = _mm_packs_epi32(u[30], u[31]);
1707 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1708 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1709 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1710 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1711 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1712 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1713 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1714 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1716 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1717 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1718 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1719 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1720 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1721 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1722 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1723 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1724 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1725 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1726 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1727 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1728 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1729 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1730 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1731 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1733 u[0] = _mm_add_epi32(v[0], v[8]);
1734 u[1] = _mm_add_epi32(v[1], v[9]);
1735 u[2] = _mm_add_epi32(v[2], v[10]);
1736 u[3] = _mm_add_epi32(v[3], v[11]);
1737 u[4] = _mm_add_epi32(v[4], v[12]);
1738 u[5] = _mm_add_epi32(v[5], v[13]);
1739 u[6] = _mm_add_epi32(v[6], v[14]);
1740 u[7] = _mm_add_epi32(v[7], v[15]);
1741 u[8] = _mm_sub_epi32(v[0], v[8]);
1742 u[9] = _mm_sub_epi32(v[1], v[9]);
1743 u[10] = _mm_sub_epi32(v[2], v[10]);
1744 u[11] = _mm_sub_epi32(v[3], v[11]);
1745 u[12] = _mm_sub_epi32(v[4], v[12]);
1746 u[13] = _mm_sub_epi32(v[5], v[13]);
1747 u[14] = _mm_sub_epi32(v[6], v[14]);
1748 u[15] = _mm_sub_epi32(v[7], v[15]);
1750 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1751 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1752 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1753 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1754 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1755 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1756 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1757 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1758 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1759 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1760 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1761 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1762 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1763 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1764 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1765 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1767 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1768 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1769 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1770 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1771 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1772 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1773 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1774 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1775 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1776 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1777 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1778 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1779 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1780 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1781 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1782 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1784 x[0] = _mm_add_epi16(s[0], s[4]);
1785 x[1] = _mm_add_epi16(s[1], s[5]);
1786 x[2] = _mm_add_epi16(s[2], s[6]);
1787 x[3] = _mm_add_epi16(s[3], s[7]);
1788 x[4] = _mm_sub_epi16(s[0], s[4]);
1789 x[5] = _mm_sub_epi16(s[1], s[5]);
1790 x[6] = _mm_sub_epi16(s[2], s[6]);
1791 x[7] = _mm_sub_epi16(s[3], s[7]);
1792 x[8] = _mm_packs_epi32(u[0], u[1]);
1793 x[9] = _mm_packs_epi32(u[2], u[3]);
1794 x[10] = _mm_packs_epi32(u[4], u[5]);
1795 x[11] = _mm_packs_epi32(u[6], u[7]);
1796 x[12] = _mm_packs_epi32(u[8], u[9]);
1797 x[13] = _mm_packs_epi32(u[10], u[11]);
1798 x[14] = _mm_packs_epi32(u[12], u[13]);
1799 x[15] = _mm_packs_epi32(u[14], u[15]);
1802 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1803 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1804 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1805 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1806 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1807 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1808 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1809 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1811 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1812 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1813 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1814 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1815 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1816 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1817 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1818 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1819 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1820 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1821 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1822 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1823 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1824 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1825 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1826 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1828 u[0] = _mm_add_epi32(v[0], v[4]);
1829 u[1] = _mm_add_epi32(v[1], v[5]);
1830 u[2] = _mm_add_epi32(v[2], v[6]);
1831 u[3] = _mm_add_epi32(v[3], v[7]);
1832 u[4] = _mm_sub_epi32(v[0], v[4]);
1833 u[5] = _mm_sub_epi32(v[1], v[5]);
1834 u[6] = _mm_sub_epi32(v[2], v[6]);
1835 u[7] = _mm_sub_epi32(v[3], v[7]);
1836 u[8] = _mm_add_epi32(v[8], v[12]);
1837 u[9] = _mm_add_epi32(v[9], v[13]);
1838 u[10] = _mm_add_epi32(v[10], v[14]);
1839 u[11] = _mm_add_epi32(v[11], v[15]);
1840 u[12] = _mm_sub_epi32(v[8], v[12]);
1841 u[13] = _mm_sub_epi32(v[9], v[13]);
1842 u[14] = _mm_sub_epi32(v[10], v[14]);
1843 u[15] = _mm_sub_epi32(v[11], v[15]);
1845 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1846 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1847 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1848 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1849 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1850 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1851 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1852 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1853 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1854 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1855 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1856 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1857 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1858 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1859 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1860 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1862 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1863 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1864 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1865 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1866 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1867 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1868 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1869 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1870 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1871 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1872 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1873 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1874 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1875 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1876 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1877 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1879 s[0] = _mm_add_epi16(x[0], x[2]);
1880 s[1] = _mm_add_epi16(x[1], x[3]);
1881 s[2] = _mm_sub_epi16(x[0], x[2]);
1882 s[3] = _mm_sub_epi16(x[1], x[3]);
1883 s[4] = _mm_packs_epi32(v[0], v[1]);
1884 s[5] = _mm_packs_epi32(v[2], v[3]);
1885 s[6] = _mm_packs_epi32(v[4], v[5]);
1886 s[7] = _mm_packs_epi32(v[6], v[7]);
1887 s[8] = _mm_add_epi16(x[8], x[10]);
1888 s[9] = _mm_add_epi16(x[9], x[11]);
1889 s[10] = _mm_sub_epi16(x[8], x[10]);
1890 s[11] = _mm_sub_epi16(x[9], x[11]);
1891 s[12] = _mm_packs_epi32(v[8], v[9]);
1892 s[13] = _mm_packs_epi32(v[10], v[11]);
1893 s[14] = _mm_packs_epi32(v[12], v[13]);
1894 s[15] = _mm_packs_epi32(v[14], v[15]);
1897 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1898 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1899 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1900 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1901 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1902 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1903 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1904 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1906 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1907 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1908 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1909 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1910 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1911 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1912 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1913 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1914 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1915 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1916 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1917 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1918 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1919 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1920 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1921 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1923 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1924 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1925 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1926 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1927 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1928 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1929 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1930 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1931 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1932 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1933 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1934 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1935 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1936 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1937 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1938 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1940 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1941 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1942 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1943 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1944 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1945 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1946 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1947 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1948 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1949 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1950 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1951 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1952 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1953 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1954 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1955 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1958 in[1] = _mm_sub_epi16(kZero, s[8]);
1960 in[3] = _mm_sub_epi16(kZero, s[4]);
1961 in[4] = _mm_packs_epi32(v[4], v[5]);
1962 in[5] = _mm_packs_epi32(v[12], v[13]);
1963 in[6] = _mm_packs_epi32(v[8], v[9]);
1964 in[7] = _mm_packs_epi32(v[0], v[1]);
1965 in[8] = _mm_packs_epi32(v[2], v[3]);
1966 in[9] = _mm_packs_epi32(v[10], v[11]);
1967 in[10] = _mm_packs_epi32(v[14], v[15]);
1968 in[11] = _mm_packs_epi32(v[6], v[7]);
1970 in[13] = _mm_sub_epi16(kZero, s[13]);
1972 in[15] = _mm_sub_epi16(kZero, s[1]);
1975 static void idct16_1d_8col(__m128i *in) {
1976 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1977 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1978 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1979 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1980 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1981 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1982 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1983 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1984 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1985 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1986 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1987 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1988 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1989 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1990 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1991 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1992 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1993 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1994 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1995 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1996 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1997 __m128i v[16], u[16], s[16], t[16];
2018 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
2019 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
2020 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
2021 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
2022 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
2023 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
2024 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
2025 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
2027 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
2028 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
2029 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
2030 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
2031 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
2032 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
2033 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
2034 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
2035 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
2036 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
2037 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
2038 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
2039 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
2040 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
2041 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
2042 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
2044 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2045 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2046 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2047 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2048 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2049 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2050 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2051 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2052 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2053 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2054 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2055 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2056 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2057 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2058 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2059 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2061 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2062 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2063 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2064 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2065 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2066 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2067 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2068 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2069 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2070 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2071 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2072 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2073 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2074 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2075 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2076 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2078 s[8] = _mm_packs_epi32(u[0], u[1]);
2079 s[15] = _mm_packs_epi32(u[2], u[3]);
2080 s[9] = _mm_packs_epi32(u[4], u[5]);
2081 s[14] = _mm_packs_epi32(u[6], u[7]);
2082 s[10] = _mm_packs_epi32(u[8], u[9]);
2083 s[13] = _mm_packs_epi32(u[10], u[11]);
2084 s[11] = _mm_packs_epi32(u[12], u[13]);
2085 s[12] = _mm_packs_epi32(u[14], u[15]);
2092 u[0] = _mm_unpacklo_epi16(s[4], s[7]);
2093 u[1] = _mm_unpackhi_epi16(s[4], s[7]);
2094 u[2] = _mm_unpacklo_epi16(s[5], s[6]);
2095 u[3] = _mm_unpackhi_epi16(s[5], s[6]);
2097 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2098 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2099 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2100 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2101 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2102 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2103 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2104 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2106 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2107 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2108 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2109 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2110 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2111 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2112 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2113 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2115 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2116 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2117 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2118 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2119 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2120 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2121 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2122 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2124 t[4] = _mm_packs_epi32(u[0], u[1]);
2125 t[7] = _mm_packs_epi32(u[2], u[3]);
2126 t[5] = _mm_packs_epi32(u[4], u[5]);
2127 t[6] = _mm_packs_epi32(u[6], u[7]);
2128 t[8] = _mm_add_epi16(s[8], s[9]);
2129 t[9] = _mm_sub_epi16(s[8], s[9]);
2130 t[10] = _mm_sub_epi16(s[11], s[10]);
2131 t[11] = _mm_add_epi16(s[10], s[11]);
2132 t[12] = _mm_add_epi16(s[12], s[13]);
2133 t[13] = _mm_sub_epi16(s[12], s[13]);
2134 t[14] = _mm_sub_epi16(s[15], s[14]);
2135 t[15] = _mm_add_epi16(s[14], s[15]);
2138 u[0] = _mm_unpacklo_epi16(t[0], t[1]);
2139 u[1] = _mm_unpackhi_epi16(t[0], t[1]);
2140 u[2] = _mm_unpacklo_epi16(t[2], t[3]);
2141 u[3] = _mm_unpackhi_epi16(t[2], t[3]);
2142 u[4] = _mm_unpacklo_epi16(t[9], t[14]);
2143 u[5] = _mm_unpackhi_epi16(t[9], t[14]);
2144 u[6] = _mm_unpacklo_epi16(t[10], t[13]);
2145 u[7] = _mm_unpackhi_epi16(t[10], t[13]);
2147 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2148 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2149 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2150 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2151 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
2152 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
2153 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2154 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2155 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
2156 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
2157 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
2158 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
2159 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
2160 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
2161 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
2162 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
2164 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2165 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2166 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2167 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2168 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2169 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2170 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2171 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2172 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2173 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2174 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2175 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2176 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2177 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2178 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2179 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2181 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2182 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2183 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2184 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2185 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2186 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2187 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2188 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2189 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2190 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2191 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2192 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2193 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2194 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2195 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2196 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2198 s[0] = _mm_packs_epi32(u[0], u[1]);
2199 s[1] = _mm_packs_epi32(u[2], u[3]);
2200 s[2] = _mm_packs_epi32(u[4], u[5]);
2201 s[3] = _mm_packs_epi32(u[6], u[7]);
2202 s[4] = _mm_add_epi16(t[4], t[5]);
2203 s[5] = _mm_sub_epi16(t[4], t[5]);
2204 s[6] = _mm_sub_epi16(t[7], t[6]);
2205 s[7] = _mm_add_epi16(t[6], t[7]);
2208 s[9] = _mm_packs_epi32(u[8], u[9]);
2209 s[14] = _mm_packs_epi32(u[10], u[11]);
2210 s[10] = _mm_packs_epi32(u[12], u[13]);
2211 s[13] = _mm_packs_epi32(u[14], u[15]);
2216 t[0] = _mm_add_epi16(s[0], s[3]);
2217 t[1] = _mm_add_epi16(s[1], s[2]);
2218 t[2] = _mm_sub_epi16(s[1], s[2]);
2219 t[3] = _mm_sub_epi16(s[0], s[3]);
2223 u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2224 u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2225 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2226 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2227 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2228 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2229 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2230 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2231 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2232 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2233 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2234 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2235 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2236 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2237 t[5] = _mm_packs_epi32(u[0], u[1]);
2238 t[6] = _mm_packs_epi32(u[2], u[3]);
2240 t[8] = _mm_add_epi16(s[8], s[11]);
2241 t[9] = _mm_add_epi16(s[9], s[10]);
2242 t[10] = _mm_sub_epi16(s[9], s[10]);
2243 t[11] = _mm_sub_epi16(s[8], s[11]);
2244 t[12] = _mm_sub_epi16(s[15], s[12]);
2245 t[13] = _mm_sub_epi16(s[14], s[13]);
2246 t[14] = _mm_add_epi16(s[13], s[14]);
2247 t[15] = _mm_add_epi16(s[12], s[15]);
2250 s[0] = _mm_add_epi16(t[0], t[7]);
2251 s[1] = _mm_add_epi16(t[1], t[6]);
2252 s[2] = _mm_add_epi16(t[2], t[5]);
2253 s[3] = _mm_add_epi16(t[3], t[4]);
2254 s[4] = _mm_sub_epi16(t[3], t[4]);
2255 s[5] = _mm_sub_epi16(t[2], t[5]);
2256 s[6] = _mm_sub_epi16(t[1], t[6]);
2257 s[7] = _mm_sub_epi16(t[0], t[7]);
2261 u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2262 u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2263 u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2264 u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2266 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2267 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2268 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2269 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2270 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2271 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2272 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2273 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2275 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2276 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2277 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2278 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2279 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2280 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2281 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2282 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2284 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2285 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2286 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2287 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2288 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2289 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2290 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2291 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2293 s[10] = _mm_packs_epi32(u[0], u[1]);
2294 s[13] = _mm_packs_epi32(u[2], u[3]);
2295 s[11] = _mm_packs_epi32(u[4], u[5]);
2296 s[12] = _mm_packs_epi32(u[6], u[7]);
2301 in[0] = _mm_add_epi16(s[0], s[15]);
2302 in[1] = _mm_add_epi16(s[1], s[14]);
2303 in[2] = _mm_add_epi16(s[2], s[13]);
2304 in[3] = _mm_add_epi16(s[3], s[12]);
2305 in[4] = _mm_add_epi16(s[4], s[11]);
2306 in[5] = _mm_add_epi16(s[5], s[10]);
2307 in[6] = _mm_add_epi16(s[6], s[9]);
2308 in[7] = _mm_add_epi16(s[7], s[8]);
2309 in[8] = _mm_sub_epi16(s[7], s[8]);
2310 in[9] = _mm_sub_epi16(s[6], s[9]);
2311 in[10] = _mm_sub_epi16(s[5], s[10]);
2312 in[11] = _mm_sub_epi16(s[4], s[11]);
2313 in[12] = _mm_sub_epi16(s[3], s[12]);
2314 in[13] = _mm_sub_epi16(s[2], s[13]);
2315 in[14] = _mm_sub_epi16(s[1], s[14]);
2316 in[15] = _mm_sub_epi16(s[0], s[15]);
2319 static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
2320 array_transpose_16x16(in0, in1);
2321 idct16_1d_8col(in0);
2322 idct16_1d_8col(in1);
2325 static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
2326 array_transpose_16x16(in0, in1);
2327 iadst16_1d_8col(in0);
2328 iadst16_1d_8col(in1);
2331 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
2332 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
2333 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
2334 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
2335 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
2336 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
2337 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
2338 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
2339 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
2341 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
2342 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
2343 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
2344 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
2345 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
2346 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
2347 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
2348 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
2351 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
2352 const __m128i final_rounding = _mm_set1_epi16(1<<5);
2353 const __m128i zero = _mm_setzero_si128();
2354 // Final rounding and shift
2355 in[0] = _mm_adds_epi16(in[0], final_rounding);
2356 in[1] = _mm_adds_epi16(in[1], final_rounding);
2357 in[2] = _mm_adds_epi16(in[2], final_rounding);
2358 in[3] = _mm_adds_epi16(in[3], final_rounding);
2359 in[4] = _mm_adds_epi16(in[4], final_rounding);
2360 in[5] = _mm_adds_epi16(in[5], final_rounding);
2361 in[6] = _mm_adds_epi16(in[6], final_rounding);
2362 in[7] = _mm_adds_epi16(in[7], final_rounding);
2363 in[8] = _mm_adds_epi16(in[8], final_rounding);
2364 in[9] = _mm_adds_epi16(in[9], final_rounding);
2365 in[10] = _mm_adds_epi16(in[10], final_rounding);
2366 in[11] = _mm_adds_epi16(in[11], final_rounding);
2367 in[12] = _mm_adds_epi16(in[12], final_rounding);
2368 in[13] = _mm_adds_epi16(in[13], final_rounding);
2369 in[14] = _mm_adds_epi16(in[14], final_rounding);
2370 in[15] = _mm_adds_epi16(in[15], final_rounding);
2372 in[0] = _mm_srai_epi16(in[0], 6);
2373 in[1] = _mm_srai_epi16(in[1], 6);
2374 in[2] = _mm_srai_epi16(in[2], 6);
2375 in[3] = _mm_srai_epi16(in[3], 6);
2376 in[4] = _mm_srai_epi16(in[4], 6);
2377 in[5] = _mm_srai_epi16(in[5], 6);
2378 in[6] = _mm_srai_epi16(in[6], 6);
2379 in[7] = _mm_srai_epi16(in[7], 6);
2380 in[8] = _mm_srai_epi16(in[8], 6);
2381 in[9] = _mm_srai_epi16(in[9], 6);
2382 in[10] = _mm_srai_epi16(in[10], 6);
2383 in[11] = _mm_srai_epi16(in[11], 6);
2384 in[12] = _mm_srai_epi16(in[12], 6);
2385 in[13] = _mm_srai_epi16(in[13], 6);
2386 in[14] = _mm_srai_epi16(in[14], 6);
2387 in[15] = _mm_srai_epi16(in[15], 6);
2389 RECON_AND_STORE(dest, in[0]);
2390 RECON_AND_STORE(dest, in[1]);
2391 RECON_AND_STORE(dest, in[2]);
2392 RECON_AND_STORE(dest, in[3]);
2393 RECON_AND_STORE(dest, in[4]);
2394 RECON_AND_STORE(dest, in[5]);
2395 RECON_AND_STORE(dest, in[6]);
2396 RECON_AND_STORE(dest, in[7]);
2397 RECON_AND_STORE(dest, in[8]);
2398 RECON_AND_STORE(dest, in[9]);
2399 RECON_AND_STORE(dest, in[10]);
2400 RECON_AND_STORE(dest, in[11]);
2401 RECON_AND_STORE(dest, in[12]);
2402 RECON_AND_STORE(dest, in[13]);
2403 RECON_AND_STORE(dest, in[14]);
2404 RECON_AND_STORE(dest, in[15]);
2407 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2409 __m128i in0[16], in1[16];
2411 load_buffer_8x16(input, in0);
2413 load_buffer_8x16(input, in1);
2417 idct16_1d_sse2(in0, in1);
2418 idct16_1d_sse2(in0, in1);
2421 idct16_1d_sse2(in0, in1);
2422 iadst16_1d_sse2(in0, in1);
2425 iadst16_1d_sse2(in0, in1);
2426 idct16_1d_sse2(in0, in1);
2428 case 3: // ADST_ADST
2429 iadst16_1d_sse2(in0, in1);
2430 iadst16_1d_sse2(in0, in1);
2437 write_buffer_8x16(dest, in0, stride);
2439 write_buffer_8x16(dest, in1, stride);
// 16x16 inverse DCT + reconstruction for a sparsely-populated coefficient
// block (name suffix "_10" — presumably only the first ~10 coefficients are
// nonzero; TODO confirm the exact eob threshold against the caller).
// Only the top eight rows of "input" are loaded; the remaining rows are
// treated as zero.  The result is rounded (add 32, arithmetic >> 6) and
// added into the prediction in "dest".
2442 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2444 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2445 const __m128i final_rounding = _mm_set1_epi16(1<<5);
2446 const __m128i zero = _mm_setzero_si128();
2448 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2449 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2450 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2451 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2452 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2453 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2454 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2455 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2457 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2458 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2459 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2460 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2462 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2463 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2464 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2465 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2466 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2467 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2468 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2469 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2471 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2472 __m128i in[16], l[16];
2473 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2474 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2475 stp1_8_0, stp1_12_0;
2476 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2477 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2478 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows 4-7 and 12-15 of the 1-D input are known-zero in the sparse case.
2480 in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
2481 // 1-D idct. Load input data.
2482 in[0] = _mm_load_si128((const __m128i *)input);
2483 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
2484 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2485 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
2486 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2487 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
2488 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2489 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
// Transpose the two loaded 8x4 tiles so columns become rows for the 1-D pass.
2491 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
2492 TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
// Stage 2 (odd coefficients): multiply paired rows by the stg2_* cosine pairs.
2496 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
2497 const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
2498 const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
2499 const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
2501 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2502 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2503 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2504 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2505 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2506 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2507 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2508 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
// Round and shift the 32-bit products back to 14-bit precision.
2510 tmp0 = _mm_add_epi32(tmp0, rounding);
2511 tmp2 = _mm_add_epi32(tmp2, rounding);
2512 tmp4 = _mm_add_epi32(tmp4, rounding);
2513 tmp6 = _mm_add_epi32(tmp6, rounding);
2514 tmp1 = _mm_add_epi32(tmp1, rounding);
2515 tmp3 = _mm_add_epi32(tmp3, rounding);
2516 tmp5 = _mm_add_epi32(tmp5, rounding);
2517 tmp7 = _mm_add_epi32(tmp7, rounding);
2519 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2520 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2521 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2522 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2523 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2524 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2525 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2526 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2528 stp2_8 = _mm_packs_epi32(tmp0, zero);
2529 stp2_15 = _mm_packs_epi32(tmp2, zero);
2530 stp2_9 = _mm_packs_epi32(tmp4, zero);
2531 stp2_14 = _mm_packs_epi32(tmp6, zero);
2533 stp2_10 = _mm_packs_epi32(tmp1, zero);
2534 stp2_13 = _mm_packs_epi32(tmp3, zero);
2535 stp2_11 = _mm_packs_epi32(tmp5, zero);
2536 stp2_12 = _mm_packs_epi32(tmp7, zero);
// Stage 3: rotations with stg3_* plus add/sub butterflies on the stage-2 outputs.
2541 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
2542 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
2544 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2545 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2546 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2547 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2549 tmp0 = _mm_add_epi32(tmp0, rounding);
2550 tmp2 = _mm_add_epi32(tmp2, rounding);
2551 tmp4 = _mm_add_epi32(tmp4, rounding);
2552 tmp6 = _mm_add_epi32(tmp6, rounding);
2554 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2555 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2556 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2557 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2559 stp1_4 = _mm_packs_epi32(tmp0, zero);
2560 stp1_7 = _mm_packs_epi32(tmp2, zero);
2561 stp1_5 = _mm_packs_epi32(tmp4, zero);
2562 stp1_6 = _mm_packs_epi32(tmp6, zero);
2564 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
2565 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
2566 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
2567 stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2569 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2570 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2571 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2572 stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
// Stage 4: even-part rotations (stg4_*) and 8..15 half corrections.
2577 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
2578 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
2579 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2580 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2582 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2583 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2584 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2585 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2586 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2587 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2588 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2589 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2591 tmp0 = _mm_add_epi32(tmp0, rounding);
2592 tmp2 = _mm_add_epi32(tmp2, rounding);
2593 tmp4 = _mm_add_epi32(tmp4, rounding);
2594 tmp6 = _mm_add_epi32(tmp6, rounding);
2595 tmp1 = _mm_add_epi32(tmp1, rounding);
2596 tmp3 = _mm_add_epi32(tmp3, rounding);
2597 tmp5 = _mm_add_epi32(tmp5, rounding);
2598 tmp7 = _mm_add_epi32(tmp7, rounding);
2600 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2601 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2602 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2603 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2604 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2605 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2606 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2607 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2609 stp2_0 = _mm_packs_epi32(tmp0, zero);
2610 stp2_1 = _mm_packs_epi32(tmp2, zero);
2611 stp2_2 = _mm_packs_epi32(tmp4, zero);
2612 stp2_3 = _mm_packs_epi32(tmp6, zero);
2613 stp2_9 = _mm_packs_epi32(tmp1, zero);
2614 stp2_14 = _mm_packs_epi32(tmp3, zero);
2615 stp2_10 = _mm_packs_epi32(tmp5, zero);
2616 stp2_13 = _mm_packs_epi32(tmp7, zero);
2618 stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
2619 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
2620 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
2621 stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
2624 // Stage5 and Stage6
2626 stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
2627 stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
2628 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
2629 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
2631 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
2632 stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
2633 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
2634 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
2636 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
2637 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
2638 stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
2639 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
// cospi_16_64 rotations on the 5/6, 10/13 and 11/12 pairs (stg4_0/stg4_1/stg6_0).
2644 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
2645 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2646 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2648 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2649 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2650 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2651 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2652 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2653 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2655 tmp1 = _mm_add_epi32(tmp1, rounding);
2656 tmp3 = _mm_add_epi32(tmp3, rounding);
2657 tmp0 = _mm_add_epi32(tmp0, rounding);
2658 tmp2 = _mm_add_epi32(tmp2, rounding);
2659 tmp4 = _mm_add_epi32(tmp4, rounding);
2660 tmp6 = _mm_add_epi32(tmp6, rounding);
2662 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2663 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2664 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2665 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2666 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2667 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2669 stp1_5 = _mm_packs_epi32(tmp1, zero);
2670 stp1_6 = _mm_packs_epi32(tmp3, zero);
2671 stp2_10 = _mm_packs_epi32(tmp0, zero);
2672 stp2_13 = _mm_packs_epi32(tmp2, zero);
2673 stp2_11 = _mm_packs_epi32(tmp4, zero);
2674 stp2_12 = _mm_packs_epi32(tmp6, zero);
2676 stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
2677 stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2678 stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2679 stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2680 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2681 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2682 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2683 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2686 // Stage7. Left 8x16 only.
2687 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2688 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2689 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2690 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2691 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2692 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2693 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2694 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2695 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2696 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2697 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2698 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2699 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2700 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2701 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2702 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2704 // 2-D idct. We do 2 8x16 blocks.
2705 for (i = 0; i < 2; i++) {
2706 array_transpose_4X8(l + 8*i, in);
2707 in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
// Stage 7 for this half.  NOTE(review): the stp values consumed below are
// produced by the per-half column transform, whose invocation falls in lines
// not visible in this excerpt — confirm against the full file.
2712 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2713 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2714 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2715 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2716 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2717 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2718 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2719 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2720 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2721 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2722 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2723 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2724 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2725 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2726 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2727 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2729 // Final rounding and shift
2730 in[0] = _mm_adds_epi16(in[0], final_rounding);
2731 in[1] = _mm_adds_epi16(in[1], final_rounding);
2732 in[2] = _mm_adds_epi16(in[2], final_rounding);
2733 in[3] = _mm_adds_epi16(in[3], final_rounding);
2734 in[4] = _mm_adds_epi16(in[4], final_rounding);
2735 in[5] = _mm_adds_epi16(in[5], final_rounding);
2736 in[6] = _mm_adds_epi16(in[6], final_rounding);
2737 in[7] = _mm_adds_epi16(in[7], final_rounding);
2738 in[8] = _mm_adds_epi16(in[8], final_rounding);
2739 in[9] = _mm_adds_epi16(in[9], final_rounding);
2740 in[10] = _mm_adds_epi16(in[10], final_rounding);
2741 in[11] = _mm_adds_epi16(in[11], final_rounding);
2742 in[12] = _mm_adds_epi16(in[12], final_rounding);
2743 in[13] = _mm_adds_epi16(in[13], final_rounding);
2744 in[14] = _mm_adds_epi16(in[14], final_rounding);
2745 in[15] = _mm_adds_epi16(in[15], final_rounding);
2747 in[0] = _mm_srai_epi16(in[0], 6);
2748 in[1] = _mm_srai_epi16(in[1], 6);
2749 in[2] = _mm_srai_epi16(in[2], 6);
2750 in[3] = _mm_srai_epi16(in[3], 6);
2751 in[4] = _mm_srai_epi16(in[4], 6);
2752 in[5] = _mm_srai_epi16(in[5], 6);
2753 in[6] = _mm_srai_epi16(in[6], 6);
2754 in[7] = _mm_srai_epi16(in[7], 6);
2755 in[8] = _mm_srai_epi16(in[8], 6);
2756 in[9] = _mm_srai_epi16(in[9], 6);
2757 in[10] = _mm_srai_epi16(in[10], 6);
2758 in[11] = _mm_srai_epi16(in[11], 6);
2759 in[12] = _mm_srai_epi16(in[12], 6);
2760 in[13] = _mm_srai_epi16(in[13], 6);
2761 in[14] = _mm_srai_epi16(in[14], 6);
2762 in[15] = _mm_srai_epi16(in[15], 6);
// Add the residual rows into dest.  RECON_AND_STORE presumably advances
// dest by stride per row — the rewind below subtracts stride * 16; confirm
// against the macro definition.
2764 RECON_AND_STORE(dest, in[0]);
2765 RECON_AND_STORE(dest, in[1]);
2766 RECON_AND_STORE(dest, in[2]);
2767 RECON_AND_STORE(dest, in[3]);
2768 RECON_AND_STORE(dest, in[4]);
2769 RECON_AND_STORE(dest, in[5]);
2770 RECON_AND_STORE(dest, in[6]);
2771 RECON_AND_STORE(dest, in[7]);
2772 RECON_AND_STORE(dest, in[8]);
2773 RECON_AND_STORE(dest, in[9]);
2774 RECON_AND_STORE(dest, in[10]);
2775 RECON_AND_STORE(dest, in[11]);
2776 RECON_AND_STORE(dest, in[12]);
2777 RECON_AND_STORE(dest, in[13]);
2778 RECON_AND_STORE(dest, in[14]);
2779 RECON_AND_STORE(dest, in[15]);
// Step right by 8 pixels to the next 8x16 half, back at the top row.
2781 dest += 8 - (stride * 16);
/* Load one 128-bit vector (8 x int16) of dequantized coefficients from
   "input" into "reg".  NOTE(review): the macro presumably also advances
   "input" in lines not visible in this excerpt — confirm. */
2785 #define LOAD_DQCOEFF(reg, input) \
2787 reg = _mm_load_si128((const __m128i *) input); \
/*
 * One-dimensional 32-point inverse DCT specialized for a sparse input
 * (every stage-1/2/3/4 unpack pairs a live row with "zero", so only the
 * low-index in[] rows contribute).  Mirrors the full IDCT32_1D stages but
 * skips multiplies against known-zero coefficients.
 */
2791 #define IDCT32_1D_34 \
/* Stage 1: odd-index rotations; the second operand of each unpack is zero. */ \
2794 const __m128i zero = _mm_setzero_si128();\
2795 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2796 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2798 const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2804 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2805 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2807 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2808 stg1_1, stp1_16, stp1_31); \
2809 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2810 stg1_7, stp1_19, stp1_28); \
2811 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2812 stg1_9, stp1_20, stp1_27); \
2813 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2814 stg1_15, stp1_23, stp1_24); \
/* Stage 2: even-odd rotations; stage-1 outputs pass straight through. */ \
2819 const __m128i zero = _mm_setzero_si128();\
2820 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2821 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2823 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2824 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2826 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2827 stg2_1, stp2_8, stp2_15); \
2828 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2829 stg2_7, stp2_11, stp2_12); \
2831 stp2_16 = stp1_16; \
2832 stp2_19 = stp1_19; \
2834 stp2_20 = stp1_20; \
2835 stp2_23 = stp1_23; \
2837 stp2_24 = stp1_24; \
2838 stp2_27 = stp1_27; \
2840 stp2_28 = stp1_28; \
2841 stp2_31 = stp1_31; \
/* Stage 3: stg3_* rotations on the 17/30, 18/29, 21/26, 22/25 pairs. */ \
2846 const __m128i zero = _mm_setzero_si128();\
2847 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2848 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2850 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2851 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2852 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2853 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2855 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2856 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2857 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2858 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
/* NOTE(review): hi_22_25 pairs stp1_23 with stp2_24 while lo_22_25 uses \
   stp1_24.  Stage 2 above sets stp2_24 = stp1_24, so the values coincide, \
   but the full IDCT32_1D uses stp2_* for both — confirm the asymmetry is \
   intentional. */ \
2860 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2861 stg3_1, stp1_4, stp1_7); \
2864 stp1_11 = stp2_11; \
2865 stp1_12 = stp2_12; \
2866 stp1_15 = stp2_15; \
2868 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2869 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2871 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2872 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2875 stp1_16 = stp2_16; \
2876 stp1_31 = stp2_31; \
2877 stp1_19 = stp2_19; \
2878 stp1_20 = stp2_20; \
2879 stp1_23 = stp2_23; \
2880 stp1_24 = stp2_24; \
2881 stp1_27 = stp2_27; \
2882 stp1_28 = stp2_28; \
/* Stage 4: DC rotation (stg4_0/stg4_1) plus 9/14, 10/13 corrections and \
   16..31 add/sub butterflies. */ \
2887 const __m128i zero = _mm_setzero_si128();\
2888 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2889 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2891 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2892 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2893 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2894 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2896 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2897 stg4_1, stp2_0, stp2_1); \
2904 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2905 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2909 stp2_15 = stp1_15; \
2910 stp2_11 = stp1_11; \
2911 stp2_12 = stp1_12; \
2913 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2914 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2915 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2916 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2917 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2918 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2919 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2920 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2922 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2923 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2924 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2925 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2926 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2927 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2928 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2929 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
/* Stage 5: cospi_16 rotation of the 5/6 pair, 8..15 butterflies, and \
   stg4_* rotations on the 18/29, 19/28, 20/27, 21/26 pairs. */ \
2934 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2935 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2936 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2937 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2939 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2940 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2941 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2942 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2944 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2945 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2952 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2953 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2954 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2955 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2957 tmp0 = _mm_add_epi32(tmp0, rounding); \
2958 tmp1 = _mm_add_epi32(tmp1, rounding); \
2959 tmp2 = _mm_add_epi32(tmp2, rounding); \
2960 tmp3 = _mm_add_epi32(tmp3, rounding); \
2962 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2963 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2964 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2965 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2967 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2968 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2973 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2974 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2975 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2976 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2977 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2978 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2979 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2980 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2982 stp1_16 = stp2_16; \
2983 stp1_17 = stp2_17; \
2985 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2986 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2988 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2989 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2992 stp1_22 = stp2_22; \
2993 stp1_23 = stp2_23; \
2994 stp1_24 = stp2_24; \
2995 stp1_25 = stp2_25; \
2996 stp1_30 = stp2_30; \
2997 stp1_31 = stp2_31; \
/* Stage 6: 0..7 butterflies, cospi_16 rotation of 10/13 and 11/12, and \
   16..31 add/sub butterflies. */ \
3002 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3003 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3004 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3005 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3007 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3008 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3009 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3010 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3011 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3012 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3013 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3014 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3018 stp2_14 = stp1_14; \
3019 stp2_15 = stp1_15; \
3021 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3022 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3023 stp2_13, stp2_11, stp2_12) \
3025 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3026 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3027 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3028 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3029 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3030 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3031 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3032 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3034 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3035 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3036 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3037 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3038 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3039 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3040 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3041 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
/* Stage 7: 0..15 butterflies and cospi_16 rotations of the 20..27 pairs. */ \
3046 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3047 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3048 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3049 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3051 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3052 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3053 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3054 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3056 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3057 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3058 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3059 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3060 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3061 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3062 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3063 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3064 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3065 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3066 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3067 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3068 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3069 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3070 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3071 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3073 stp1_16 = stp2_16; \
3074 stp1_17 = stp2_17; \
3075 stp1_18 = stp2_18; \
3076 stp1_19 = stp2_19; \
3078 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3079 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3081 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3082 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3085 stp1_28 = stp2_28; \
3086 stp1_29 = stp2_29; \
3087 stp1_30 = stp2_30; \
3088 stp1_31 = stp2_31; \
3095 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
3096 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
3097 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
3098 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
3100 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
3101 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
3102 const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
3103 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
3105 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
3106 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
3107 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
3108 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
3110 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
3111 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
3112 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
3113 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
3115 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
3116 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
3118 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
3119 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
3121 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
3122 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
3124 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
3125 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
3131 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
3132 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
3133 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
3134 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
3136 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
3137 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
3138 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
3139 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
3141 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
3142 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
3144 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
3145 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
3148 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
3149 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
3150 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
3151 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
3153 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
3154 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
3155 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
3156 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
3158 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
3159 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
3160 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
3161 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
3163 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
3164 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
3165 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
3166 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
3171 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
3172 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
3173 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
3174 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
3176 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
3177 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
3178 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3179 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3181 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3182 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3183 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3184 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3186 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
3187 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
3190 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
3191 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
3192 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
3193 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
3194 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
3195 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
3196 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
3197 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
3199 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
3200 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
3202 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
3203 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
3206 stp1_16 = stp2_16; \
3207 stp1_31 = stp2_31; \
3208 stp1_19 = stp2_19; \
3209 stp1_20 = stp2_20; \
3210 stp1_23 = stp2_23; \
3211 stp1_24 = stp2_24; \
3212 stp1_27 = stp2_27; \
3213 stp1_28 = stp2_28; \
3218 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
3219 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
3220 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
3221 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
3223 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
3224 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
3225 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3226 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3228 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
3229 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
3232 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
3233 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
3234 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
3235 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
3237 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
3238 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
3242 stp2_15 = stp1_15; \
3243 stp2_11 = stp1_11; \
3244 stp2_12 = stp1_12; \
3246 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
3247 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
3248 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
3249 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
3250 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
3251 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
3252 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
3253 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
3255 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
3256 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
3257 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
3258 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
3259 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
3260 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
3261 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
3262 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
3267 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
3268 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
3269 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3270 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3272 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
3273 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
3274 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3275 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3277 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3278 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3280 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
3281 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
3282 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
3283 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
3285 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
3286 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
3287 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
3288 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
3290 tmp0 = _mm_add_epi32(tmp0, rounding); \
3291 tmp1 = _mm_add_epi32(tmp1, rounding); \
3292 tmp2 = _mm_add_epi32(tmp2, rounding); \
3293 tmp3 = _mm_add_epi32(tmp3, rounding); \
3295 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
3296 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
3297 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
3298 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
3300 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
3301 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
3306 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3307 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3308 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3309 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3310 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3311 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3312 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3313 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3315 stp1_16 = stp2_16; \
3316 stp1_17 = stp2_17; \
3318 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3319 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3321 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3322 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3325 stp1_22 = stp2_22; \
3326 stp1_23 = stp2_23; \
3327 stp1_24 = stp2_24; \
3328 stp1_25 = stp2_25; \
3329 stp1_30 = stp2_30; \
3330 stp1_31 = stp2_31; \
3335 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3336 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3337 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3338 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3340 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3341 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3342 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3343 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3344 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3345 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3346 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3347 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3351 stp2_14 = stp1_14; \
3352 stp2_15 = stp1_15; \
3354 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3355 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3356 stp2_13, stp2_11, stp2_12) \
3358 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3359 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3360 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3361 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3362 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3363 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3364 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3365 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3367 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3368 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3369 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3370 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3371 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3372 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3373 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3374 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3379 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3380 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3381 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3382 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3384 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3385 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3386 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3387 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3389 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3390 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3391 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3392 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3393 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3394 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3395 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3396 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3397 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3398 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3399 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3400 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3401 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3402 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3403 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3404 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3406 stp1_16 = stp2_16; \
3407 stp1_17 = stp2_17; \
3408 stp1_18 = stp2_18; \
3409 stp1_19 = stp2_19; \
3411 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3412 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3414 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3415 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3418 stp1_28 = stp2_28; \
3419 stp1_29 = stp2_29; \
3420 stp1_30 = stp2_30; \
3421 stp1_31 = stp2_31; \
3424 // Only upper-left 8x8 has non-zero coeff
3425 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3427 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3428 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3430 // idct constants for each stage
3431 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3432 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3433 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3434 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3435 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3436 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3437 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3438 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3439 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3440 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3441 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3442 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3443 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3444 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3445 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3446 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3448 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3449 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3450 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3451 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3452 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3453 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3454 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3455 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3457 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3458 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3459 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3460 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3461 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3462 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3463 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3464 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3465 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3466 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3468 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3469 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3470 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3471 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3472 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3473 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3474 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3476 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3478 __m128i in[32], col[32];
3479 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3480 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3481 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3482 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3484 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3485 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3486 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3487 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3489 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3492 LOAD_DQCOEFF(in[0], input);
3493 LOAD_DQCOEFF(in[8], input);
3494 LOAD_DQCOEFF(in[16], input);
3495 LOAD_DQCOEFF(in[24], input);
3496 LOAD_DQCOEFF(in[1], input);
3497 LOAD_DQCOEFF(in[9], input);
3498 LOAD_DQCOEFF(in[17], input);
3499 LOAD_DQCOEFF(in[25], input);
3500 LOAD_DQCOEFF(in[2], input);
3501 LOAD_DQCOEFF(in[10], input);
3502 LOAD_DQCOEFF(in[18], input);
3503 LOAD_DQCOEFF(in[26], input);
3504 LOAD_DQCOEFF(in[3], input);
3505 LOAD_DQCOEFF(in[11], input);
3506 LOAD_DQCOEFF(in[19], input);
3507 LOAD_DQCOEFF(in[27], input);
3509 LOAD_DQCOEFF(in[4], input);
3510 LOAD_DQCOEFF(in[12], input);
3511 LOAD_DQCOEFF(in[20], input);
3512 LOAD_DQCOEFF(in[28], input);
3513 LOAD_DQCOEFF(in[5], input);
3514 LOAD_DQCOEFF(in[13], input);
3515 LOAD_DQCOEFF(in[21], input);
3516 LOAD_DQCOEFF(in[29], input);
3517 LOAD_DQCOEFF(in[6], input);
3518 LOAD_DQCOEFF(in[14], input);
3519 LOAD_DQCOEFF(in[22], input);
3520 LOAD_DQCOEFF(in[30], input);
3521 LOAD_DQCOEFF(in[7], input);
3522 LOAD_DQCOEFF(in[15], input);
3523 LOAD_DQCOEFF(in[23], input);
3524 LOAD_DQCOEFF(in[31], input);
3526 array_transpose_8x8(in, in);
3527 array_transpose_8x8(in+8, in+8);
3528 array_transpose_8x8(in+16, in+16);
3529 array_transpose_8x8(in+24, in+24);
3533 // 1_D: Store 32 intermediate results for each 8x32 block.
3534 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3535 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3536 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3537 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3538 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3539 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3540 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3541 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3542 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3543 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3544 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3545 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3546 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3547 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3548 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3549 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3550 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3551 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3552 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3553 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3554 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3555 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3556 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3557 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3558 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3559 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3560 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3561 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3562 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3563 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3564 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3565 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3566 for (i = 0; i < 4; i++) {
3567 const __m128i zero = _mm_setzero_si128();
3568 // Transpose 32x8 block to 8x32 block
3569 array_transpose_8x8(col+i*8, in);
3572 // 2_D: Calculate the results and store them to destination.
3573 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3574 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3575 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3576 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3577 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3578 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3579 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3580 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3581 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3582 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3583 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3584 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3585 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3586 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3587 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3588 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3589 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3590 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3591 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3592 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3593 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3594 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3595 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3596 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3597 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3598 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3599 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3600 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3601 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3602 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3603 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3604 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3606 // Final rounding and shift
3607 in[0] = _mm_adds_epi16(in[0], final_rounding);
3608 in[1] = _mm_adds_epi16(in[1], final_rounding);
3609 in[2] = _mm_adds_epi16(in[2], final_rounding);
3610 in[3] = _mm_adds_epi16(in[3], final_rounding);
3611 in[4] = _mm_adds_epi16(in[4], final_rounding);
3612 in[5] = _mm_adds_epi16(in[5], final_rounding);
3613 in[6] = _mm_adds_epi16(in[6], final_rounding);
3614 in[7] = _mm_adds_epi16(in[7], final_rounding);
3615 in[8] = _mm_adds_epi16(in[8], final_rounding);
3616 in[9] = _mm_adds_epi16(in[9], final_rounding);
3617 in[10] = _mm_adds_epi16(in[10], final_rounding);
3618 in[11] = _mm_adds_epi16(in[11], final_rounding);
3619 in[12] = _mm_adds_epi16(in[12], final_rounding);
3620 in[13] = _mm_adds_epi16(in[13], final_rounding);
3621 in[14] = _mm_adds_epi16(in[14], final_rounding);
3622 in[15] = _mm_adds_epi16(in[15], final_rounding);
3623 in[16] = _mm_adds_epi16(in[16], final_rounding);
3624 in[17] = _mm_adds_epi16(in[17], final_rounding);
3625 in[18] = _mm_adds_epi16(in[18], final_rounding);
3626 in[19] = _mm_adds_epi16(in[19], final_rounding);
3627 in[20] = _mm_adds_epi16(in[20], final_rounding);
3628 in[21] = _mm_adds_epi16(in[21], final_rounding);
3629 in[22] = _mm_adds_epi16(in[22], final_rounding);
3630 in[23] = _mm_adds_epi16(in[23], final_rounding);
3631 in[24] = _mm_adds_epi16(in[24], final_rounding);
3632 in[25] = _mm_adds_epi16(in[25], final_rounding);
3633 in[26] = _mm_adds_epi16(in[26], final_rounding);
3634 in[27] = _mm_adds_epi16(in[27], final_rounding);
3635 in[28] = _mm_adds_epi16(in[28], final_rounding);
3636 in[29] = _mm_adds_epi16(in[29], final_rounding);
3637 in[30] = _mm_adds_epi16(in[30], final_rounding);
3638 in[31] = _mm_adds_epi16(in[31], final_rounding);
3640 in[0] = _mm_srai_epi16(in[0], 6);
3641 in[1] = _mm_srai_epi16(in[1], 6);
3642 in[2] = _mm_srai_epi16(in[2], 6);
3643 in[3] = _mm_srai_epi16(in[3], 6);
3644 in[4] = _mm_srai_epi16(in[4], 6);
3645 in[5] = _mm_srai_epi16(in[5], 6);
3646 in[6] = _mm_srai_epi16(in[6], 6);
3647 in[7] = _mm_srai_epi16(in[7], 6);
3648 in[8] = _mm_srai_epi16(in[8], 6);
3649 in[9] = _mm_srai_epi16(in[9], 6);
3650 in[10] = _mm_srai_epi16(in[10], 6);
3651 in[11] = _mm_srai_epi16(in[11], 6);
3652 in[12] = _mm_srai_epi16(in[12], 6);
3653 in[13] = _mm_srai_epi16(in[13], 6);
3654 in[14] = _mm_srai_epi16(in[14], 6);
3655 in[15] = _mm_srai_epi16(in[15], 6);
3656 in[16] = _mm_srai_epi16(in[16], 6);
3657 in[17] = _mm_srai_epi16(in[17], 6);
3658 in[18] = _mm_srai_epi16(in[18], 6);
3659 in[19] = _mm_srai_epi16(in[19], 6);
3660 in[20] = _mm_srai_epi16(in[20], 6);
3661 in[21] = _mm_srai_epi16(in[21], 6);
3662 in[22] = _mm_srai_epi16(in[22], 6);
3663 in[23] = _mm_srai_epi16(in[23], 6);
3664 in[24] = _mm_srai_epi16(in[24], 6);
3665 in[25] = _mm_srai_epi16(in[25], 6);
3666 in[26] = _mm_srai_epi16(in[26], 6);
3667 in[27] = _mm_srai_epi16(in[27], 6);
3668 in[28] = _mm_srai_epi16(in[28], 6);
3669 in[29] = _mm_srai_epi16(in[29], 6);
3670 in[30] = _mm_srai_epi16(in[30], 6);
3671 in[31] = _mm_srai_epi16(in[31], 6);
3673 RECON_AND_STORE(dest, in[0]);
3674 RECON_AND_STORE(dest, in[1]);
3675 RECON_AND_STORE(dest, in[2]);
3676 RECON_AND_STORE(dest, in[3]);
3677 RECON_AND_STORE(dest, in[4]);
3678 RECON_AND_STORE(dest, in[5]);
3679 RECON_AND_STORE(dest, in[6]);
3680 RECON_AND_STORE(dest, in[7]);
3681 RECON_AND_STORE(dest, in[8]);
3682 RECON_AND_STORE(dest, in[9]);
3683 RECON_AND_STORE(dest, in[10]);
3684 RECON_AND_STORE(dest, in[11]);
3685 RECON_AND_STORE(dest, in[12]);
3686 RECON_AND_STORE(dest, in[13]);
3687 RECON_AND_STORE(dest, in[14]);
3688 RECON_AND_STORE(dest, in[15]);
3689 RECON_AND_STORE(dest, in[16]);
3690 RECON_AND_STORE(dest, in[17]);
3691 RECON_AND_STORE(dest, in[18]);
3692 RECON_AND_STORE(dest, in[19]);
3693 RECON_AND_STORE(dest, in[20]);
3694 RECON_AND_STORE(dest, in[21]);
3695 RECON_AND_STORE(dest, in[22]);
3696 RECON_AND_STORE(dest, in[23]);
3697 RECON_AND_STORE(dest, in[24]);
3698 RECON_AND_STORE(dest, in[25]);
3699 RECON_AND_STORE(dest, in[26]);
3700 RECON_AND_STORE(dest, in[27]);
3701 RECON_AND_STORE(dest, in[28]);
3702 RECON_AND_STORE(dest, in[29]);
3703 RECON_AND_STORE(dest, in[30]);
3704 RECON_AND_STORE(dest, in[31]);
3706 dest += 8 - (stride * 32);
/*
 * Full inverse 32x32 DCT (SSE2): all 1024 coefficients may be non-zero.
 * First loop runs the column pass on each of the four 32x8 input strips
 * (with an all-zero early-out per strip); second loop runs the row pass
 * and adds the rounded residual into `dest` (row pitch `stride`).
 *
 * NOTE(review): this chunk was extracted with the original file's line
 * numbers fused into each line and several lines dropped (the second
 * parameter-list line, `int i, j, i32;` / `zero_flag` declarations, the
 * `i32 = i << 5;` style index setup, the IDCT32 macro invocations, the
 * `continue`/closing braces of the zero shortcut, and the function's
 * closing braces).  Annotated as-is; reconcile against the pristine file
 * before editing behavior.
 */
3710 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
/* Fixed-point rounding constants: DCT_CONST_ROUNDING for the butterfly
 * multiplies, 1<<5 for the final >>6 output scaling. */
3712 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3713 const __m128i final_rounding = _mm_set1_epi16(1<<5);
/* Per-stage cosine constant pairs consumed by the 32-point butterfly
 * network via _mm_madd_epi16. */
3715 // idct constants for each stage
3716 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3717 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3718 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3719 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3720 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3721 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3722 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3723 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3724 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3725 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3726 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3727 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3728 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3729 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3730 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3731 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3733 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3734 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3735 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3736 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3737 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3738 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3739 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3740 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3742 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3743 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3744 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3745 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3746 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3747 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3748 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3749 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3750 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3751 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3753 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3754 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3755 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3756 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3757 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3758 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3759 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3761 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
/* in[32]: current 32x8 strip; col[128]: all four strips' column-pass
 * output (4 strips x 32 rows); zero_idx[16]: OR-reduction scratch for
 * the all-zero test. */
3763 __m128i in[32], col[128], zero_idx[16];
3764 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3765 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3766 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3767 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3769 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3770 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3771 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3772 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3774 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
/* Column pass: one iteration per 32x8 strip of the input. */
3778 for (i = 0; i < 4; i++) {
/* Load this strip's dequantized coefficients in the interleaved order
 * (0,8,16,24, 1,9,17,25, ...) expected by the 8x8 transposes below.
 * LOAD_DQCOEFF presumably advances `input` per call -- TODO confirm
 * against the macro definition (not visible in this chunk). */
3782 LOAD_DQCOEFF(in[0], input);
3783 LOAD_DQCOEFF(in[8], input);
3784 LOAD_DQCOEFF(in[16], input);
3785 LOAD_DQCOEFF(in[24], input);
3786 LOAD_DQCOEFF(in[1], input);
3787 LOAD_DQCOEFF(in[9], input);
3788 LOAD_DQCOEFF(in[17], input);
3789 LOAD_DQCOEFF(in[25], input);
3790 LOAD_DQCOEFF(in[2], input);
3791 LOAD_DQCOEFF(in[10], input);
3792 LOAD_DQCOEFF(in[18], input);
3793 LOAD_DQCOEFF(in[26], input);
3794 LOAD_DQCOEFF(in[3], input);
3795 LOAD_DQCOEFF(in[11], input);
3796 LOAD_DQCOEFF(in[19], input);
3797 LOAD_DQCOEFF(in[27], input);
3799 LOAD_DQCOEFF(in[4], input);
3800 LOAD_DQCOEFF(in[12], input);
3801 LOAD_DQCOEFF(in[20], input);
3802 LOAD_DQCOEFF(in[28], input);
3803 LOAD_DQCOEFF(in[5], input);
3804 LOAD_DQCOEFF(in[13], input);
3805 LOAD_DQCOEFF(in[21], input);
3806 LOAD_DQCOEFF(in[29], input);
3807 LOAD_DQCOEFF(in[6], input);
3808 LOAD_DQCOEFF(in[14], input);
3809 LOAD_DQCOEFF(in[22], input);
3810 LOAD_DQCOEFF(in[30], input);
3811 LOAD_DQCOEFF(in[7], input);
3812 LOAD_DQCOEFF(in[15], input);
3813 LOAD_DQCOEFF(in[23], input);
3814 LOAD_DQCOEFF(in[31], input);
/* OR-reduce all 32 registers pairwise (tree reduction) so a single
 * scalar test can detect a completely-zero strip. */
3816 // checking if all entries are zero
3817 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3818 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3819 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3820 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3821 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3822 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3823 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3824 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3825 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3826 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3827 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3828 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3829 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3830 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3831 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3832 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3834 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3835 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3836 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3837 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3838 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3839 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3840 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3841 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3843 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3844 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3845 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3846 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3847 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3848 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3849 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
/* Fold the final 128-bit OR into two 32-bit scalars: high qword OR'd
 * onto low, then the low and high 32 bits of that extracted. */
3851 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3852 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3853 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3854 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3855 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
/* Whole strip is zero: write zero column-pass output for it and skip
 * the butterfly work.  (The `continue` / closing brace lines of this
 * shortcut were dropped by extraction -- confirm against the pristine
 * file.) */
3857 if (!zero_flag[0] && !zero_flag[1]) {
3858 col[i32 + 0] = _mm_setzero_si128();
3859 col[i32 + 1] = _mm_setzero_si128();
3860 col[i32 + 2] = _mm_setzero_si128();
3861 col[i32 + 3] = _mm_setzero_si128();
3862 col[i32 + 4] = _mm_setzero_si128();
3863 col[i32 + 5] = _mm_setzero_si128();
3864 col[i32 + 6] = _mm_setzero_si128();
3865 col[i32 + 7] = _mm_setzero_si128();
3866 col[i32 + 8] = _mm_setzero_si128();
3867 col[i32 + 9] = _mm_setzero_si128();
3868 col[i32 + 10] = _mm_setzero_si128();
3869 col[i32 + 11] = _mm_setzero_si128();
3870 col[i32 + 12] = _mm_setzero_si128();
3871 col[i32 + 13] = _mm_setzero_si128();
3872 col[i32 + 14] = _mm_setzero_si128();
3873 col[i32 + 15] = _mm_setzero_si128();
3874 col[i32 + 16] = _mm_setzero_si128();
3875 col[i32 + 17] = _mm_setzero_si128();
3876 col[i32 + 18] = _mm_setzero_si128();
3877 col[i32 + 19] = _mm_setzero_si128();
3878 col[i32 + 20] = _mm_setzero_si128();
3879 col[i32 + 21] = _mm_setzero_si128();
3880 col[i32 + 22] = _mm_setzero_si128();
3881 col[i32 + 23] = _mm_setzero_si128();
3882 col[i32 + 24] = _mm_setzero_si128();
3883 col[i32 + 25] = _mm_setzero_si128();
3884 col[i32 + 26] = _mm_setzero_si128();
3885 col[i32 + 27] = _mm_setzero_si128();
3886 col[i32 + 28] = _mm_setzero_si128();
3887 col[i32 + 29] = _mm_setzero_si128();
3888 col[i32 + 30] = _mm_setzero_si128();
3889 col[i32 + 31] = _mm_setzero_si128();
/* Non-zero strip: transpose each 8x8 quadrant in place for the column
 * pass. */
3893 // Transpose 32x8 block to 8x32 block
3894 array_transpose_8x8(in, in);
3895 array_transpose_8x8(in+8, in+8);
3896 array_transpose_8x8(in+16, in+16);
3897 array_transpose_8x8(in+24, in+24);
/* (The IDCT32 stage-macro invocation that fills stp1_0..stp1_31 sat
 * here -- lines dropped by extraction.) */
/* Final butterfly of the column pass: output k is stp1_k +/- stp1_(31-k),
 * stored at offset i32 (presumably i * 32 -- declaration not visible). */
3901 // 1_D: Store 32 intermediate results for each 8x32 block.
3902 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3903 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3904 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3905 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3906 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3907 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3908 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3909 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3910 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3911 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3912 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3913 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3914 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3915 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3916 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3917 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3918 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3919 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3920 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3921 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3922 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3923 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3924 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3925 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3926 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3927 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3928 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3929 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3930 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3931 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3932 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3933 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
/* Row pass: process the 32x32 result 8 columns at a time, gathering the
 * matching 8-row slice from each of the four stored strips.  (The `j`
 * index setup line is not visible in this chunk.) */
3935 for (i = 0; i < 4; i++) {
3936 const __m128i zero = _mm_setzero_si128();
3940 // Transpose 32x8 block to 8x32 block
3941 array_transpose_8x8(col+j, in);
3942 array_transpose_8x8(col+j+32, in+8);
3943 array_transpose_8x8(col+j+64, in+16);
3944 array_transpose_8x8(col+j+96, in+24);
/* (Second IDCT32 invocation -- lines dropped by extraction.) */
3948 // 2_D: Calculate the results and store them to destination.
3949 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3950 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3951 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3952 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3953 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3954 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3955 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3956 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3957 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3958 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3959 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3960 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3961 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3962 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3963 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3964 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3965 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3966 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3967 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3968 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3969 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3970 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3971 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3972 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3973 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3974 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3975 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3976 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3977 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3978 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3979 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3980 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
/* Add 32 (saturating adds guard against 16-bit overflow) then >>6 to
 * undo the transform's scaling with round-to-nearest. */
3982 // Final rounding and shift
3983 in[0] = _mm_adds_epi16(in[0], final_rounding);
3984 in[1] = _mm_adds_epi16(in[1], final_rounding);
3985 in[2] = _mm_adds_epi16(in[2], final_rounding);
3986 in[3] = _mm_adds_epi16(in[3], final_rounding);
3987 in[4] = _mm_adds_epi16(in[4], final_rounding);
3988 in[5] = _mm_adds_epi16(in[5], final_rounding);
3989 in[6] = _mm_adds_epi16(in[6], final_rounding);
3990 in[7] = _mm_adds_epi16(in[7], final_rounding);
3991 in[8] = _mm_adds_epi16(in[8], final_rounding);
3992 in[9] = _mm_adds_epi16(in[9], final_rounding);
3993 in[10] = _mm_adds_epi16(in[10], final_rounding);
3994 in[11] = _mm_adds_epi16(in[11], final_rounding);
3995 in[12] = _mm_adds_epi16(in[12], final_rounding);
3996 in[13] = _mm_adds_epi16(in[13], final_rounding);
3997 in[14] = _mm_adds_epi16(in[14], final_rounding);
3998 in[15] = _mm_adds_epi16(in[15], final_rounding);
3999 in[16] = _mm_adds_epi16(in[16], final_rounding);
4000 in[17] = _mm_adds_epi16(in[17], final_rounding);
4001 in[18] = _mm_adds_epi16(in[18], final_rounding);
4002 in[19] = _mm_adds_epi16(in[19], final_rounding);
4003 in[20] = _mm_adds_epi16(in[20], final_rounding);
4004 in[21] = _mm_adds_epi16(in[21], final_rounding);
4005 in[22] = _mm_adds_epi16(in[22], final_rounding);
4006 in[23] = _mm_adds_epi16(in[23], final_rounding);
4007 in[24] = _mm_adds_epi16(in[24], final_rounding);
4008 in[25] = _mm_adds_epi16(in[25], final_rounding);
4009 in[26] = _mm_adds_epi16(in[26], final_rounding);
4010 in[27] = _mm_adds_epi16(in[27], final_rounding);
4011 in[28] = _mm_adds_epi16(in[28], final_rounding);
4012 in[29] = _mm_adds_epi16(in[29], final_rounding);
4013 in[30] = _mm_adds_epi16(in[30], final_rounding);
4014 in[31] = _mm_adds_epi16(in[31], final_rounding);
4016 in[0] = _mm_srai_epi16(in[0], 6);
4017 in[1] = _mm_srai_epi16(in[1], 6);
4018 in[2] = _mm_srai_epi16(in[2], 6);
4019 in[3] = _mm_srai_epi16(in[3], 6);
4020 in[4] = _mm_srai_epi16(in[4], 6);
4021 in[5] = _mm_srai_epi16(in[5], 6);
4022 in[6] = _mm_srai_epi16(in[6], 6);
4023 in[7] = _mm_srai_epi16(in[7], 6);
4024 in[8] = _mm_srai_epi16(in[8], 6);
4025 in[9] = _mm_srai_epi16(in[9], 6);
4026 in[10] = _mm_srai_epi16(in[10], 6);
4027 in[11] = _mm_srai_epi16(in[11], 6);
4028 in[12] = _mm_srai_epi16(in[12], 6);
4029 in[13] = _mm_srai_epi16(in[13], 6);
4030 in[14] = _mm_srai_epi16(in[14], 6);
4031 in[15] = _mm_srai_epi16(in[15], 6);
4032 in[16] = _mm_srai_epi16(in[16], 6);
4033 in[17] = _mm_srai_epi16(in[17], 6);
4034 in[18] = _mm_srai_epi16(in[18], 6);
4035 in[19] = _mm_srai_epi16(in[19], 6);
4036 in[20] = _mm_srai_epi16(in[20], 6);
4037 in[21] = _mm_srai_epi16(in[21], 6);
4038 in[22] = _mm_srai_epi16(in[22], 6);
4039 in[23] = _mm_srai_epi16(in[23], 6);
4040 in[24] = _mm_srai_epi16(in[24], 6);
4041 in[25] = _mm_srai_epi16(in[25], 6);
4042 in[26] = _mm_srai_epi16(in[26], 6);
4043 in[27] = _mm_srai_epi16(in[27], 6);
4044 in[28] = _mm_srai_epi16(in[28], 6);
4045 in[29] = _mm_srai_epi16(in[29], 6);
4046 in[30] = _mm_srai_epi16(in[30], 6);
4047 in[31] = _mm_srai_epi16(in[31], 6);
/* Add each 8x1 residual row onto the predictor and store.
 * RECON_AND_STORE presumably uses `zero` for byte unpacking and advances
 * `dest` by `stride` per call (consistent with the rewind below) --
 * TODO confirm against the macro definition. */
4049 RECON_AND_STORE(dest, in[0]);
4050 RECON_AND_STORE(dest, in[1]);
4051 RECON_AND_STORE(dest, in[2]);
4052 RECON_AND_STORE(dest, in[3]);
4053 RECON_AND_STORE(dest, in[4]);
4054 RECON_AND_STORE(dest, in[5]);
4055 RECON_AND_STORE(dest, in[6]);
4056 RECON_AND_STORE(dest, in[7]);
4057 RECON_AND_STORE(dest, in[8]);
4058 RECON_AND_STORE(dest, in[9]);
4059 RECON_AND_STORE(dest, in[10]);
4060 RECON_AND_STORE(dest, in[11]);
4061 RECON_AND_STORE(dest, in[12]);
4062 RECON_AND_STORE(dest, in[13]);
4063 RECON_AND_STORE(dest, in[14]);
4064 RECON_AND_STORE(dest, in[15]);
4065 RECON_AND_STORE(dest, in[16]);
4066 RECON_AND_STORE(dest, in[17]);
4067 RECON_AND_STORE(dest, in[18]);
4068 RECON_AND_STORE(dest, in[19]);
4069 RECON_AND_STORE(dest, in[20]);
4070 RECON_AND_STORE(dest, in[21]);
4071 RECON_AND_STORE(dest, in[22]);
4072 RECON_AND_STORE(dest, in[23]);
4073 RECON_AND_STORE(dest, in[24]);
4074 RECON_AND_STORE(dest, in[25]);
4075 RECON_AND_STORE(dest, in[26]);
4076 RECON_AND_STORE(dest, in[27]);
4077 RECON_AND_STORE(dest, in[28]);
4078 RECON_AND_STORE(dest, in[29]);
4079 RECON_AND_STORE(dest, in[30]);
4080 RECON_AND_STORE(dest, in[31]);
/* Rewind 32 rows and step 8 pixels right: top of the next column strip. */
4082 dest += 8 - (stride * 32);
4086 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
4088 const __m128i zero = _mm_setzero_si128();
4091 a = dct_const_round_shift(input[0] * cospi_16_64);
4092 a = dct_const_round_shift(a * cospi_16_64);
4093 a = ROUND_POWER_OF_TWO(a, 6);
4095 dc_value = _mm_set1_epi16(a);
4097 for (i = 0; i < 4; ++i) {
4098 RECON_AND_STORE(dest, dc_value);
4099 RECON_AND_STORE(dest, dc_value);
4100 RECON_AND_STORE(dest, dc_value);
4101 RECON_AND_STORE(dest, dc_value);
4102 RECON_AND_STORE(dest, dc_value);
4103 RECON_AND_STORE(dest, dc_value);
4104 RECON_AND_STORE(dest, dc_value);
4105 RECON_AND_STORE(dest, dc_value);
4106 RECON_AND_STORE(dest, dc_value);
4107 RECON_AND_STORE(dest, dc_value);
4108 RECON_AND_STORE(dest, dc_value);
4109 RECON_AND_STORE(dest, dc_value);
4110 RECON_AND_STORE(dest, dc_value);
4111 RECON_AND_STORE(dest, dc_value);
4112 RECON_AND_STORE(dest, dc_value);
4113 RECON_AND_STORE(dest, dc_value);
4114 RECON_AND_STORE(dest, dc_value);
4115 RECON_AND_STORE(dest, dc_value);
4116 RECON_AND_STORE(dest, dc_value);
4117 RECON_AND_STORE(dest, dc_value);
4118 RECON_AND_STORE(dest, dc_value);
4119 RECON_AND_STORE(dest, dc_value);
4120 RECON_AND_STORE(dest, dc_value);
4121 RECON_AND_STORE(dest, dc_value);
4122 RECON_AND_STORE(dest, dc_value);
4123 RECON_AND_STORE(dest, dc_value);
4124 RECON_AND_STORE(dest, dc_value);
4125 RECON_AND_STORE(dest, dc_value);
4126 RECON_AND_STORE(dest, dc_value);
4127 RECON_AND_STORE(dest, dc_value);
4128 RECON_AND_STORE(dest, dc_value);
4129 RECON_AND_STORE(dest, dc_value);
4130 dest += 8 - (stride * 32);