/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vp9_rtcd.h"
14 #include "vp9/common/vp9_systemdependent.h"
15 #include "vp9/common/vp9_blockd.h"
16 #include "vp9/common/vp9_idct.h"
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
// Default build: pass the value through unchanged; valid streams never
// overflow, so no wrapping is required.
#define WRAPLOW(x, bd) (x)
#endif  // CONFIG_EMULATE_HARDWARE
#if CONFIG_VP9_HIGHBITDEPTH
// Add a (possibly hardware-wrapped) residual to a high-bitdepth pixel and
// clamp the result to the valid range for bit depth bd.
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  trans = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
48 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
49 trans = WRAPLOW(trans, 8);
50 return clip_pixel(WRAPLOW(dest + trans, 8));
53 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
54 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
55 0.5 shifts per pixel. */
57 tran_low_t output[16];
58 tran_high_t a1, b1, c1, d1, e1;
59 const tran_low_t *ip = input;
60 tran_low_t *op = output;
62 for (i = 0; i < 4; i++) {
63 a1 = ip[0] >> UNIT_QUANT_SHIFT;
64 c1 = ip[1] >> UNIT_QUANT_SHIFT;
65 d1 = ip[2] >> UNIT_QUANT_SHIFT;
66 b1 = ip[3] >> UNIT_QUANT_SHIFT;
74 op[0] = WRAPLOW(a1, 8);
75 op[1] = WRAPLOW(b1, 8);
76 op[2] = WRAPLOW(c1, 8);
77 op[3] = WRAPLOW(d1, 8);
83 for (i = 0; i < 4; i++) {
95 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
96 dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
97 dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
98 dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
105 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
109 const tran_low_t *ip = in;
110 tran_low_t *op = tmp;
112 a1 = ip[0] >> UNIT_QUANT_SHIFT;
115 op[0] = WRAPLOW(a1, 8);
116 op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
119 for (i = 0; i < 4; i++) {
122 dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
123 dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
124 dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
125 dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
131 static void idct4(const tran_low_t *input, tran_low_t *output) {
133 tran_high_t temp1, temp2;
135 temp1 = (input[0] + input[2]) * cospi_16_64;
136 temp2 = (input[0] - input[2]) * cospi_16_64;
137 step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
138 step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
139 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
140 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
141 step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
142 step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
145 output[0] = WRAPLOW(step[0] + step[3], 8);
146 output[1] = WRAPLOW(step[1] + step[2], 8);
147 output[2] = WRAPLOW(step[1] - step[2], 8);
148 output[3] = WRAPLOW(step[0] - step[3], 8);
151 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
152 tran_low_t out[4 * 4];
153 tran_low_t *outptr = out;
155 tran_low_t temp_in[4], temp_out[4];
158 for (i = 0; i < 4; ++i) {
159 idct4(input, outptr);
165 for (i = 0; i < 4; ++i) {
166 for (j = 0; j < 4; ++j)
167 temp_in[j] = out[j * 4 + i];
168 idct4(temp_in, temp_out);
169 for (j = 0; j < 4; ++j) {
170 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
171 ROUND_POWER_OF_TWO(temp_out[j], 4));
176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
180 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
181 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
182 a1 = ROUND_POWER_OF_TWO(out, 4);
184 for (i = 0; i < 4; i++) {
185 dest[0] = clip_pixel_add(dest[0], a1);
186 dest[1] = clip_pixel_add(dest[1], a1);
187 dest[2] = clip_pixel_add(dest[2], a1);
188 dest[3] = clip_pixel_add(dest[3], a1);
193 static void idct8(const tran_low_t *input, tran_low_t *output) {
194 tran_low_t step1[8], step2[8];
195 tran_high_t temp1, temp2;
201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
203 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
204 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
207 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
208 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
210 // stage 2 & stage 3 - even half
213 // stage 2 - odd half
214 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
215 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
216 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
217 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
221 temp1 = (step2[6] - step2[5]) * cospi_16_64;
222 temp2 = (step2[5] + step2[6]) * cospi_16_64;
223 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
224 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
228 output[0] = WRAPLOW(step1[0] + step1[7], 8);
229 output[1] = WRAPLOW(step1[1] + step1[6], 8);
230 output[2] = WRAPLOW(step1[2] + step1[5], 8);
231 output[3] = WRAPLOW(step1[3] + step1[4], 8);
232 output[4] = WRAPLOW(step1[3] - step1[4], 8);
233 output[5] = WRAPLOW(step1[2] - step1[5], 8);
234 output[6] = WRAPLOW(step1[1] - step1[6], 8);
235 output[7] = WRAPLOW(step1[0] - step1[7], 8);
238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
239 tran_low_t out[8 * 8];
240 tran_low_t *outptr = out;
242 tran_low_t temp_in[8], temp_out[8];
244 // First transform rows
245 for (i = 0; i < 8; ++i) {
246 idct8(input, outptr);
251 // Then transform columns
252 for (i = 0; i < 8; ++i) {
253 for (j = 0; j < 8; ++j)
254 temp_in[j] = out[j * 8 + i];
255 idct8(temp_in, temp_out);
256 for (j = 0; j < 8; ++j) {
257 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
258 ROUND_POWER_OF_TWO(temp_out[j], 5));
263 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
266 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
267 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
268 a1 = ROUND_POWER_OF_TWO(out, 5);
269 for (j = 0; j < 8; ++j) {
270 for (i = 0; i < 8; ++i)
271 dest[i] = clip_pixel_add(dest[i], a1);
276 static void iadst4(const tran_low_t *input, tran_low_t *output) {
277 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
279 tran_high_t x0 = input[0];
280 tran_high_t x1 = input[1];
281 tran_high_t x2 = input[2];
282 tran_high_t x3 = input[3];
284 if (!(x0 | x1 | x2 | x3)) {
285 output[0] = output[1] = output[2] = output[3] = 0;
308 // 1-D transform scaling factor is sqrt(2).
309 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
310 // + 1b (addition) = 29b.
311 // Hence the output bit depth is 15b.
312 output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
313 output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
314 output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
315 output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
318 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
320 const transform_2d IHT_4[] = {
321 { idct4, idct4 }, // DCT_DCT = 0
322 { iadst4, idct4 }, // ADST_DCT = 1
323 { idct4, iadst4 }, // DCT_ADST = 2
324 { iadst4, iadst4 } // ADST_ADST = 3
328 tran_low_t out[4 * 4];
329 tran_low_t *outptr = out;
330 tran_low_t temp_in[4], temp_out[4];
332 // inverse transform row vectors
333 for (i = 0; i < 4; ++i) {
334 IHT_4[tx_type].rows(input, outptr);
339 // inverse transform column vectors
340 for (i = 0; i < 4; ++i) {
341 for (j = 0; j < 4; ++j)
342 temp_in[j] = out[j * 4 + i];
343 IHT_4[tx_type].cols(temp_in, temp_out);
344 for (j = 0; j < 4; ++j) {
345 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
346 ROUND_POWER_OF_TWO(temp_out[j], 4));
351 static void iadst8(const tran_low_t *input, tran_low_t *output) {
352 int s0, s1, s2, s3, s4, s5, s6, s7;
354 tran_high_t x0 = input[7];
355 tran_high_t x1 = input[0];
356 tran_high_t x2 = input[5];
357 tran_high_t x3 = input[2];
358 tran_high_t x4 = input[3];
359 tran_high_t x5 = input[4];
360 tran_high_t x6 = input[1];
361 tran_high_t x7 = input[6];
363 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
364 output[0] = output[1] = output[2] = output[3] = output[4]
365 = output[5] = output[6] = output[7] = 0;
370 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
371 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
372 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
373 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
374 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
375 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
376 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
377 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
379 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
380 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
381 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
382 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
383 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
384 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
385 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
386 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
393 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
394 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
395 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
396 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
398 x0 = WRAPLOW(s0 + s2, 8);
399 x1 = WRAPLOW(s1 + s3, 8);
400 x2 = WRAPLOW(s0 - s2, 8);
401 x3 = WRAPLOW(s1 - s3, 8);
402 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
403 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
404 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
405 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
408 s2 = cospi_16_64 * (x2 + x3);
409 s3 = cospi_16_64 * (x2 - x3);
410 s6 = cospi_16_64 * (x6 + x7);
411 s7 = cospi_16_64 * (x6 - x7);
413 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
414 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
415 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
416 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
418 output[0] = WRAPLOW(x0, 8);
419 output[1] = WRAPLOW(-x4, 8);
420 output[2] = WRAPLOW(x6, 8);
421 output[3] = WRAPLOW(-x2, 8);
422 output[4] = WRAPLOW(x3, 8);
423 output[5] = WRAPLOW(-x7, 8);
424 output[6] = WRAPLOW(x5, 8);
425 output[7] = WRAPLOW(-x1, 8);
428 static const transform_2d IHT_8[] = {
429 { idct8, idct8 }, // DCT_DCT = 0
430 { iadst8, idct8 }, // ADST_DCT = 1
431 { idct8, iadst8 }, // DCT_ADST = 2
432 { iadst8, iadst8 } // ADST_ADST = 3
435 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
438 tran_low_t out[8 * 8];
439 tran_low_t *outptr = out;
440 tran_low_t temp_in[8], temp_out[8];
441 const transform_2d ht = IHT_8[tx_type];
443 // inverse transform row vectors
444 for (i = 0; i < 8; ++i) {
445 ht.rows(input, outptr);
450 // inverse transform column vectors
451 for (i = 0; i < 8; ++i) {
452 for (j = 0; j < 8; ++j)
453 temp_in[j] = out[j * 8 + i];
454 ht.cols(temp_in, temp_out);
455 for (j = 0; j < 8; ++j) {
456 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
457 ROUND_POWER_OF_TWO(temp_out[j], 5));
462 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
463 tran_low_t out[8 * 8] = { 0 };
464 tran_low_t *outptr = out;
466 tran_low_t temp_in[8], temp_out[8];
468 // First transform rows
469 // only first 4 row has non-zero coefs
470 for (i = 0; i < 4; ++i) {
471 idct8(input, outptr);
476 // Then transform columns
477 for (i = 0; i < 8; ++i) {
478 for (j = 0; j < 8; ++j)
479 temp_in[j] = out[j * 8 + i];
480 idct8(temp_in, temp_out);
481 for (j = 0; j < 8; ++j) {
482 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
483 ROUND_POWER_OF_TWO(temp_out[j], 5));
488 static void idct16(const tran_low_t *input, tran_low_t *output) {
489 tran_low_t step1[16], step2[16];
490 tran_high_t temp1, temp2;
493 step1[0] = input[0/2];
494 step1[1] = input[16/2];
495 step1[2] = input[8/2];
496 step1[3] = input[24/2];
497 step1[4] = input[4/2];
498 step1[5] = input[20/2];
499 step1[6] = input[12/2];
500 step1[7] = input[28/2];
501 step1[8] = input[2/2];
502 step1[9] = input[18/2];
503 step1[10] = input[10/2];
504 step1[11] = input[26/2];
505 step1[12] = input[6/2];
506 step1[13] = input[22/2];
507 step1[14] = input[14/2];
508 step1[15] = input[30/2];
520 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
521 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
522 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
523 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
525 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
526 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
527 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
528 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
530 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
531 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
532 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
533 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
535 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
536 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
537 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
538 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
546 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
547 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
548 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
549 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
550 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
551 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
552 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
553 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
555 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
556 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
557 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
558 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
559 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
560 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
561 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
562 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
565 temp1 = (step1[0] + step1[1]) * cospi_16_64;
566 temp2 = (step1[0] - step1[1]) * cospi_16_64;
567 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
568 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
569 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
570 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
571 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
572 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
573 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
574 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
575 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
576 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
579 step2[15] = step1[15];
580 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
581 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
582 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
583 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
584 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
585 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
586 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
587 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
588 step2[11] = step1[11];
589 step2[12] = step1[12];
592 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
593 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
594 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
595 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
597 temp1 = (step2[6] - step2[5]) * cospi_16_64;
598 temp2 = (step2[5] + step2[6]) * cospi_16_64;
599 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
600 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
603 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
604 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
605 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
606 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
607 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
608 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
609 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
610 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
613 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
614 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
615 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
616 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
617 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
618 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
619 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
620 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
623 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
624 temp2 = (step1[10] + step1[13]) * cospi_16_64;
625 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
626 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
627 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
628 temp2 = (step1[11] + step1[12]) * cospi_16_64;
629 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
630 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
631 step2[14] = step1[14];
632 step2[15] = step1[15];
635 output[0] = WRAPLOW(step2[0] + step2[15], 8);
636 output[1] = WRAPLOW(step2[1] + step2[14], 8);
637 output[2] = WRAPLOW(step2[2] + step2[13], 8);
638 output[3] = WRAPLOW(step2[3] + step2[12], 8);
639 output[4] = WRAPLOW(step2[4] + step2[11], 8);
640 output[5] = WRAPLOW(step2[5] + step2[10], 8);
641 output[6] = WRAPLOW(step2[6] + step2[9], 8);
642 output[7] = WRAPLOW(step2[7] + step2[8], 8);
643 output[8] = WRAPLOW(step2[7] - step2[8], 8);
644 output[9] = WRAPLOW(step2[6] - step2[9], 8);
645 output[10] = WRAPLOW(step2[5] - step2[10], 8);
646 output[11] = WRAPLOW(step2[4] - step2[11], 8);
647 output[12] = WRAPLOW(step2[3] - step2[12], 8);
648 output[13] = WRAPLOW(step2[2] - step2[13], 8);
649 output[14] = WRAPLOW(step2[1] - step2[14], 8);
650 output[15] = WRAPLOW(step2[0] - step2[15], 8);
653 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
655 tran_low_t out[16 * 16];
656 tran_low_t *outptr = out;
658 tran_low_t temp_in[16], temp_out[16];
660 // First transform rows
661 for (i = 0; i < 16; ++i) {
662 idct16(input, outptr);
667 // Then transform columns
668 for (i = 0; i < 16; ++i) {
669 for (j = 0; j < 16; ++j)
670 temp_in[j] = out[j * 16 + i];
671 idct16(temp_in, temp_out);
672 for (j = 0; j < 16; ++j) {
673 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
674 ROUND_POWER_OF_TWO(temp_out[j], 6));
679 static void iadst16(const tran_low_t *input, tran_low_t *output) {
680 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
681 tran_high_t s9, s10, s11, s12, s13, s14, s15;
683 tran_high_t x0 = input[15];
684 tran_high_t x1 = input[0];
685 tran_high_t x2 = input[13];
686 tran_high_t x3 = input[2];
687 tran_high_t x4 = input[11];
688 tran_high_t x5 = input[4];
689 tran_high_t x6 = input[9];
690 tran_high_t x7 = input[6];
691 tran_high_t x8 = input[7];
692 tran_high_t x9 = input[8];
693 tran_high_t x10 = input[5];
694 tran_high_t x11 = input[10];
695 tran_high_t x12 = input[3];
696 tran_high_t x13 = input[12];
697 tran_high_t x14 = input[1];
698 tran_high_t x15 = input[14];
700 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
701 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
702 output[0] = output[1] = output[2] = output[3] = output[4]
703 = output[5] = output[6] = output[7] = output[8]
704 = output[9] = output[10] = output[11] = output[12]
705 = output[13] = output[14] = output[15] = 0;
710 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
711 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
712 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
713 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
714 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
715 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
716 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
717 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
718 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
719 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
720 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
721 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
722 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
723 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
724 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
725 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
727 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
728 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
729 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
730 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
731 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
732 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
733 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
734 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
735 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
736 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
737 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
738 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
739 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
740 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
741 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
742 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
753 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
754 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
755 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
756 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
757 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
758 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
759 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
760 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
762 x0 = WRAPLOW(s0 + s4, 8);
763 x1 = WRAPLOW(s1 + s5, 8);
764 x2 = WRAPLOW(s2 + s6, 8);
765 x3 = WRAPLOW(s3 + s7, 8);
766 x4 = WRAPLOW(s0 - s4, 8);
767 x5 = WRAPLOW(s1 - s5, 8);
768 x6 = WRAPLOW(s2 - s6, 8);
769 x7 = WRAPLOW(s3 - s7, 8);
770 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
771 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
772 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
773 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
774 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
775 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
776 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
777 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
784 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
785 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
786 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
787 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
792 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
793 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
794 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
795 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
797 x0 = WRAPLOW(check_range(s0 + s2), 8);
798 x1 = WRAPLOW(check_range(s1 + s3), 8);
799 x2 = WRAPLOW(check_range(s0 - s2), 8);
800 x3 = WRAPLOW(check_range(s1 - s3), 8);
801 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
802 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
803 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
804 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
805 x8 = WRAPLOW(check_range(s8 + s10), 8);
806 x9 = WRAPLOW(check_range(s9 + s11), 8);
807 x10 = WRAPLOW(check_range(s8 - s10), 8);
808 x11 = WRAPLOW(check_range(s9 - s11), 8);
809 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
810 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
811 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
812 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
815 s2 = (- cospi_16_64) * (x2 + x3);
816 s3 = cospi_16_64 * (x2 - x3);
817 s6 = cospi_16_64 * (x6 + x7);
818 s7 = cospi_16_64 * (- x6 + x7);
819 s10 = cospi_16_64 * (x10 + x11);
820 s11 = cospi_16_64 * (- x10 + x11);
821 s14 = (- cospi_16_64) * (x14 + x15);
822 s15 = cospi_16_64 * (x14 - x15);
824 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
825 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
826 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
827 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
828 x10 = WRAPLOW(dct_const_round_shift(s10), 8);
829 x11 = WRAPLOW(dct_const_round_shift(s11), 8);
830 x14 = WRAPLOW(dct_const_round_shift(s14), 8);
831 x15 = WRAPLOW(dct_const_round_shift(s15), 8);
833 output[0] = WRAPLOW(x0, 8);
834 output[1] = WRAPLOW(-x8, 8);
835 output[2] = WRAPLOW(x12, 8);
836 output[3] = WRAPLOW(-x4, 8);
837 output[4] = WRAPLOW(x6, 8);
838 output[5] = WRAPLOW(x14, 8);
839 output[6] = WRAPLOW(x10, 8);
840 output[7] = WRAPLOW(x2, 8);
841 output[8] = WRAPLOW(x3, 8);
842 output[9] = WRAPLOW(x11, 8);
843 output[10] = WRAPLOW(x15, 8);
844 output[11] = WRAPLOW(x7, 8);
845 output[12] = WRAPLOW(x5, 8);
846 output[13] = WRAPLOW(-x13, 8);
847 output[14] = WRAPLOW(x9, 8);
848 output[15] = WRAPLOW(-x1, 8);
851 static const transform_2d IHT_16[] = {
852 { idct16, idct16 }, // DCT_DCT = 0
853 { iadst16, idct16 }, // ADST_DCT = 1
854 { idct16, iadst16 }, // DCT_ADST = 2
855 { iadst16, iadst16 } // ADST_ADST = 3
858 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
861 tran_low_t out[16 * 16];
862 tran_low_t *outptr = out;
863 tran_low_t temp_in[16], temp_out[16];
864 const transform_2d ht = IHT_16[tx_type];
867 for (i = 0; i < 16; ++i) {
868 ht.rows(input, outptr);
874 for (i = 0; i < 16; ++i) {
875 for (j = 0; j < 16; ++j)
876 temp_in[j] = out[j * 16 + i];
877 ht.cols(temp_in, temp_out);
878 for (j = 0; j < 16; ++j) {
879 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
880 ROUND_POWER_OF_TWO(temp_out[j], 6));
885 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
887 tran_low_t out[16 * 16] = { 0 };
888 tran_low_t *outptr = out;
890 tran_low_t temp_in[16], temp_out[16];
892 // First transform rows. Since all non-zero dct coefficients are in
893 // upper-left 4x4 area, we only need to calculate first 4 rows here.
894 for (i = 0; i < 4; ++i) {
895 idct16(input, outptr);
900 // Then transform columns
901 for (i = 0; i < 16; ++i) {
902 for (j = 0; j < 16; ++j)
903 temp_in[j] = out[j*16 + i];
904 idct16(temp_in, temp_out);
905 for (j = 0; j < 16; ++j) {
906 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
907 ROUND_POWER_OF_TWO(temp_out[j], 6));
912 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
915 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
916 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
917 a1 = ROUND_POWER_OF_TWO(out, 6);
918 for (j = 0; j < 16; ++j) {
919 for (i = 0; i < 16; ++i)
920 dest[i] = clip_pixel_add(dest[i], a1);
925 static void idct32(const tran_low_t *input, tran_low_t *output) {
926 tran_low_t step1[32], step2[32];
927 tran_high_t temp1, temp2;
931 step1[1] = input[16];
933 step1[3] = input[24];
935 step1[5] = input[20];
936 step1[6] = input[12];
937 step1[7] = input[28];
939 step1[9] = input[18];
940 step1[10] = input[10];
941 step1[11] = input[26];
942 step1[12] = input[6];
943 step1[13] = input[22];
944 step1[14] = input[14];
945 step1[15] = input[30];
947 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
948 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
949 step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
950 step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
952 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
953 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
954 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
955 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
957 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
958 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
959 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
960 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
962 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
963 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
964 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
965 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
967 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
968 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
969 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
970 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
972 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
973 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
974 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
975 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
977 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
978 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
979 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
980 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
982 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
983 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
984 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
985 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
997 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
998 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
999 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
1000 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
1002 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1003 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1004 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
1005 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
1007 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1008 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1009 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1010 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1012 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1013 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1014 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1015 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1017 step2[16] = WRAPLOW(step1[16] + step1[17], 8);
1018 step2[17] = WRAPLOW(step1[16] - step1[17], 8);
1019 step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
1020 step2[19] = WRAPLOW(step1[18] + step1[19], 8);
1021 step2[20] = WRAPLOW(step1[20] + step1[21], 8);
1022 step2[21] = WRAPLOW(step1[20] - step1[21], 8);
1023 step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
1024 step2[23] = WRAPLOW(step1[22] + step1[23], 8);
1025 step2[24] = WRAPLOW(step1[24] + step1[25], 8);
1026 step2[25] = WRAPLOW(step1[24] - step1[25], 8);
1027 step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
1028 step2[27] = WRAPLOW(step1[26] + step1[27], 8);
1029 step2[28] = WRAPLOW(step1[28] + step1[29], 8);
1030 step2[29] = WRAPLOW(step1[28] - step1[29], 8);
1031 step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
1032 step2[31] = WRAPLOW(step1[30] + step1[31], 8);
1035 step1[0] = step2[0];
1036 step1[1] = step2[1];
1037 step1[2] = step2[2];
1038 step1[3] = step2[3];
1040 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1041 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1042 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
1043 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
1044 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1045 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1046 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1047 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1049 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
1050 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
1051 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
1052 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
1053 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
1054 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
1055 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
1056 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
1058 step1[16] = step2[16];
1059 step1[31] = step2[31];
1060 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1061 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1062 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
1063 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
1064 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1065 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1066 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1067 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1068 step1[19] = step2[19];
1069 step1[20] = step2[20];
1070 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1071 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1072 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1073 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1074 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1075 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1076 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1077 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1078 step1[23] = step2[23];
1079 step1[24] = step2[24];
1080 step1[27] = step2[27];
1081 step1[28] = step2[28];
1084 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1085 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1086 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
1087 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
1088 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1089 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1090 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
1091 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
1092 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
1093 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
1094 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
1095 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
1097 step2[8] = step1[8];
1098 step2[15] = step1[15];
1099 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1100 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1101 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
1102 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
1103 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1104 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1105 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1106 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1107 step2[11] = step1[11];
1108 step2[12] = step1[12];
1110 step2[16] = WRAPLOW(step1[16] + step1[19], 8);
1111 step2[17] = WRAPLOW(step1[17] + step1[18], 8);
1112 step2[18] = WRAPLOW(step1[17] - step1[18], 8);
1113 step2[19] = WRAPLOW(step1[16] - step1[19], 8);
1114 step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
1115 step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
1116 step2[22] = WRAPLOW(step1[21] + step1[22], 8);
1117 step2[23] = WRAPLOW(step1[20] + step1[23], 8);
1119 step2[24] = WRAPLOW(step1[24] + step1[27], 8);
1120 step2[25] = WRAPLOW(step1[25] + step1[26], 8);
1121 step2[26] = WRAPLOW(step1[25] - step1[26], 8);
1122 step2[27] = WRAPLOW(step1[24] - step1[27], 8);
1123 step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
1124 step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
1125 step2[30] = WRAPLOW(step1[29] + step1[30], 8);
1126 step2[31] = WRAPLOW(step1[28] + step1[31], 8);
1129 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
1130 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
1131 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
1132 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
1133 step1[4] = step2[4];
1134 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1135 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1136 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1137 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1138 step1[7] = step2[7];
1140 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
1141 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
1142 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
1143 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
1144 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
1145 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
1146 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
1147 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
1149 step1[16] = step2[16];
1150 step1[17] = step2[17];
1151 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1152 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1153 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1154 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1155 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1156 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1157 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
1158 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
1159 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1160 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1161 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1162 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1163 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1164 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1165 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1166 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1167 step1[22] = step2[22];
1168 step1[23] = step2[23];
1169 step1[24] = step2[24];
1170 step1[25] = step2[25];
1171 step1[30] = step2[30];
1172 step1[31] = step2[31];
1175 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
1176 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
1177 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
1178 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
1179 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
1180 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
1181 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
1182 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
1183 step2[8] = step1[8];
1184 step2[9] = step1[9];
1185 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1186 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1187 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1188 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1189 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1190 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1191 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1192 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1193 step2[14] = step1[14];
1194 step2[15] = step1[15];
1196 step2[16] = WRAPLOW(step1[16] + step1[23], 8);
1197 step2[17] = WRAPLOW(step1[17] + step1[22], 8);
1198 step2[18] = WRAPLOW(step1[18] + step1[21], 8);
1199 step2[19] = WRAPLOW(step1[19] + step1[20], 8);
1200 step2[20] = WRAPLOW(step1[19] - step1[20], 8);
1201 step2[21] = WRAPLOW(step1[18] - step1[21], 8);
1202 step2[22] = WRAPLOW(step1[17] - step1[22], 8);
1203 step2[23] = WRAPLOW(step1[16] - step1[23], 8);
1205 step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
1206 step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
1207 step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
1208 step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
1209 step2[28] = WRAPLOW(step1[27] + step1[28], 8);
1210 step2[29] = WRAPLOW(step1[26] + step1[29], 8);
1211 step2[30] = WRAPLOW(step1[25] + step1[30], 8);
1212 step2[31] = WRAPLOW(step1[24] + step1[31], 8);
1215 step1[0] = WRAPLOW(step2[0] + step2[15], 8);
1216 step1[1] = WRAPLOW(step2[1] + step2[14], 8);
1217 step1[2] = WRAPLOW(step2[2] + step2[13], 8);
1218 step1[3] = WRAPLOW(step2[3] + step2[12], 8);
1219 step1[4] = WRAPLOW(step2[4] + step2[11], 8);
1220 step1[5] = WRAPLOW(step2[5] + step2[10], 8);
1221 step1[6] = WRAPLOW(step2[6] + step2[9], 8);
1222 step1[7] = WRAPLOW(step2[7] + step2[8], 8);
1223 step1[8] = WRAPLOW(step2[7] - step2[8], 8);
1224 step1[9] = WRAPLOW(step2[6] - step2[9], 8);
1225 step1[10] = WRAPLOW(step2[5] - step2[10], 8);
1226 step1[11] = WRAPLOW(step2[4] - step2[11], 8);
1227 step1[12] = WRAPLOW(step2[3] - step2[12], 8);
1228 step1[13] = WRAPLOW(step2[2] - step2[13], 8);
1229 step1[14] = WRAPLOW(step2[1] - step2[14], 8);
1230 step1[15] = WRAPLOW(step2[0] - step2[15], 8);
1232 step1[16] = step2[16];
1233 step1[17] = step2[17];
1234 step1[18] = step2[18];
1235 step1[19] = step2[19];
1236 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1237 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1238 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1239 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1240 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1241 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1242 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1243 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1244 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1245 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1246 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1247 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1248 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1249 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1250 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
1251 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
1252 step1[28] = step2[28];
1253 step1[29] = step2[29];
1254 step1[30] = step2[30];
1255 step1[31] = step2[31];
1258 output[0] = WRAPLOW(step1[0] + step1[31], 8);
1259 output[1] = WRAPLOW(step1[1] + step1[30], 8);
1260 output[2] = WRAPLOW(step1[2] + step1[29], 8);
1261 output[3] = WRAPLOW(step1[3] + step1[28], 8);
1262 output[4] = WRAPLOW(step1[4] + step1[27], 8);
1263 output[5] = WRAPLOW(step1[5] + step1[26], 8);
1264 output[6] = WRAPLOW(step1[6] + step1[25], 8);
1265 output[7] = WRAPLOW(step1[7] + step1[24], 8);
1266 output[8] = WRAPLOW(step1[8] + step1[23], 8);
1267 output[9] = WRAPLOW(step1[9] + step1[22], 8);
1268 output[10] = WRAPLOW(step1[10] + step1[21], 8);
1269 output[11] = WRAPLOW(step1[11] + step1[20], 8);
1270 output[12] = WRAPLOW(step1[12] + step1[19], 8);
1271 output[13] = WRAPLOW(step1[13] + step1[18], 8);
1272 output[14] = WRAPLOW(step1[14] + step1[17], 8);
1273 output[15] = WRAPLOW(step1[15] + step1[16], 8);
1274 output[16] = WRAPLOW(step1[15] - step1[16], 8);
1275 output[17] = WRAPLOW(step1[14] - step1[17], 8);
1276 output[18] = WRAPLOW(step1[13] - step1[18], 8);
1277 output[19] = WRAPLOW(step1[12] - step1[19], 8);
1278 output[20] = WRAPLOW(step1[11] - step1[20], 8);
1279 output[21] = WRAPLOW(step1[10] - step1[21], 8);
1280 output[22] = WRAPLOW(step1[9] - step1[22], 8);
1281 output[23] = WRAPLOW(step1[8] - step1[23], 8);
1282 output[24] = WRAPLOW(step1[7] - step1[24], 8);
1283 output[25] = WRAPLOW(step1[6] - step1[25], 8);
1284 output[26] = WRAPLOW(step1[5] - step1[26], 8);
1285 output[27] = WRAPLOW(step1[4] - step1[27], 8);
1286 output[28] = WRAPLOW(step1[3] - step1[28], 8);
1287 output[29] = WRAPLOW(step1[2] - step1[29], 8);
1288 output[30] = WRAPLOW(step1[1] - step1[30], 8);
1289 output[31] = WRAPLOW(step1[0] - step1[31], 8);
1292 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1294 tran_low_t out[32 * 32];
1295 tran_low_t *outptr = out;
1297 tran_low_t temp_in[32], temp_out[32];
1300 for (i = 0; i < 32; ++i) {
1301 int16_t zero_coeff[16];
1302 for (j = 0; j < 16; ++j)
1303 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1304 for (j = 0; j < 8; ++j)
1305 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1306 for (j = 0; j < 4; ++j)
1307 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1308 for (j = 0; j < 2; ++j)
1309 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1311 if (zero_coeff[0] | zero_coeff[1])
1312 idct32(input, outptr);
1314 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
1320 for (i = 0; i < 32; ++i) {
1321 for (j = 0; j < 32; ++j)
1322 temp_in[j] = out[j * 32 + i];
1323 idct32(temp_in, temp_out);
1324 for (j = 0; j < 32; ++j) {
1325 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1326 ROUND_POWER_OF_TWO(temp_out[j], 6));
1331 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1333 tran_low_t out[32 * 32] = {0};
1334 tran_low_t *outptr = out;
1336 tran_low_t temp_in[32], temp_out[32];
1339 // only upper-left 8x8 has non-zero coeff
1340 for (i = 0; i < 8; ++i) {
1341 idct32(input, outptr);
1347 for (i = 0; i < 32; ++i) {
1348 for (j = 0; j < 32; ++j)
1349 temp_in[j] = out[j * 32 + i];
1350 idct32(temp_in, temp_out);
1351 for (j = 0; j < 32; ++j) {
1352 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1353 ROUND_POWER_OF_TWO(temp_out[j], 6));
1358 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1362 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
1363 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
1364 a1 = ROUND_POWER_OF_TWO(out, 6);
1366 for (j = 0; j < 32; ++j) {
1367 for (i = 0; i < 32; ++i)
1368 dest[i] = clip_pixel_add(dest[i], a1);
1374 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1377 vp9_idct4x4_16_add(input, dest, stride);
1379 vp9_idct4x4_1_add(input, dest, stride);
1383 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1386 vp9_iwht4x4_16_add(input, dest, stride);
1388 vp9_iwht4x4_1_add(input, dest, stride);
1391 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
1393 // If dc is 1, then input[0] is the reconstructed value, do not need
1394 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
1396 // The calculation can be simplified if there are not many non-zero dct
1397 // coefficients. Use eobs to decide what to do.
1398 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
1399 // Combine that with code here.
1401 // DC only DCT coefficient
1402 vp9_idct8x8_1_add(input, dest, stride);
1404 vp9_idct8x8_12_add(input, dest, stride);
1406 vp9_idct8x8_64_add(input, dest, stride);
1409 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
1411 /* The calculation can be simplified if there are not many non-zero dct
1412 * coefficients. Use eobs to separate different cases. */
1414 /* DC only DCT coefficient. */
1415 vp9_idct16x16_1_add(input, dest, stride);
1417 vp9_idct16x16_10_add(input, dest, stride);
1419 vp9_idct16x16_256_add(input, dest, stride);
1422 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
1425 vp9_idct32x32_1_add(input, dest, stride);
1427 // non-zero coeff only in upper-left 8x8
1428 vp9_idct32x32_34_add(input, dest, stride);
1430 vp9_idct32x32_1024_add(input, dest, stride);
1434 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1435 int stride, int eob) {
1436 if (tx_type == DCT_DCT)
1437 vp9_idct4x4_add(input, dest, stride, eob);
1439 vp9_iht4x4_16_add(input, dest, stride, tx_type);
1442 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1443 int stride, int eob) {
1444 if (tx_type == DCT_DCT) {
1445 vp9_idct8x8_add(input, dest, stride, eob);
1447 vp9_iht8x8_64_add(input, dest, stride, tx_type);
1451 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1452 int stride, int eob) {
1453 if (tx_type == DCT_DCT) {
1454 vp9_idct16x16_add(input, dest, stride, eob);
1456 vp9_iht16x16_256_add(input, dest, stride, tx_type);
1460 #if CONFIG_VP9_HIGHBITDEPTH
1461 void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1462 int stride, int bd) {
1463 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1464 0.5 shifts per pixel. */
1466 tran_low_t output[16];
1467 tran_high_t a1, b1, c1, d1, e1;
1468 const tran_low_t *ip = input;
1469 tran_low_t *op = output;
1470 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1472 for (i = 0; i < 4; i++) {
1473 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1474 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1475 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1476 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1479 e1 = (a1 - d1) >> 1;
1484 op[0] = WRAPLOW(a1, bd);
1485 op[1] = WRAPLOW(b1, bd);
1486 op[2] = WRAPLOW(c1, bd);
1487 op[3] = WRAPLOW(d1, bd);
1493 for (i = 0; i < 4; i++) {
1500 e1 = (a1 - d1) >> 1;
1505 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1506 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
1507 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
1508 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
1515 void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1516 int dest_stride, int bd) {
1520 const tran_low_t *ip = in;
1521 tran_low_t *op = tmp;
1522 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1525 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1528 op[0] = WRAPLOW(a1, bd);
1529 op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
1532 for (i = 0; i < 4; i++) {
1535 dest[dest_stride * 0] = highbd_clip_pixel_add(
1536 dest[dest_stride * 0], a1, bd);
1537 dest[dest_stride * 1] = highbd_clip_pixel_add(
1538 dest[dest_stride * 1], e1, bd);
1539 dest[dest_stride * 2] = highbd_clip_pixel_add(
1540 dest[dest_stride * 2], e1, bd);
1541 dest[dest_stride * 3] = highbd_clip_pixel_add(
1542 dest[dest_stride * 3], e1, bd);
1548 static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
1550 tran_high_t temp1, temp2;
1553 temp1 = (input[0] + input[2]) * cospi_16_64;
1554 temp2 = (input[0] - input[2]) * cospi_16_64;
1555 step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1556 step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
1557 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1558 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1559 step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
1560 step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
1563 output[0] = WRAPLOW(step[0] + step[3], bd);
1564 output[1] = WRAPLOW(step[1] + step[2], bd);
1565 output[2] = WRAPLOW(step[1] - step[2], bd);
1566 output[3] = WRAPLOW(step[0] - step[3], bd);
1569 void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1570 int stride, int bd) {
1571 tran_low_t out[4 * 4];
1572 tran_low_t *outptr = out;
1574 tran_low_t temp_in[4], temp_out[4];
1575 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1578 for (i = 0; i < 4; ++i) {
1579 highbd_idct4(input, outptr, bd);
1585 for (i = 0; i < 4; ++i) {
1586 for (j = 0; j < 4; ++j)
1587 temp_in[j] = out[j * 4 + i];
1588 highbd_idct4(temp_in, temp_out, bd);
1589 for (j = 0; j < 4; ++j) {
1590 dest[j * stride + i] = highbd_clip_pixel_add(
1591 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1596 void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1597 int dest_stride, int bd) {
1600 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1601 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1603 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1604 a1 = ROUND_POWER_OF_TWO(out, 4);
1606 for (i = 0; i < 4; i++) {
1607 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1608 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1609 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1610 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1611 dest += dest_stride;
1615 static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
1616 tran_low_t step1[8], step2[8];
1617 tran_high_t temp1, temp2;
1619 step1[0] = input[0];
1620 step1[2] = input[4];
1621 step1[1] = input[2];
1622 step1[3] = input[6];
1623 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1624 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1625 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1626 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1627 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1628 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1629 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1630 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1632 // stage 2 & stage 3 - even half
1633 highbd_idct4(step1, step1, bd);
1635 // stage 2 - odd half
1636 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1637 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1638 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1639 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1641 // stage 3 - odd half
1642 step1[4] = step2[4];
1643 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1644 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1645 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1646 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1647 step1[7] = step2[7];
1650 output[0] = WRAPLOW(step1[0] + step1[7], bd);
1651 output[1] = WRAPLOW(step1[1] + step1[6], bd);
1652 output[2] = WRAPLOW(step1[2] + step1[5], bd);
1653 output[3] = WRAPLOW(step1[3] + step1[4], bd);
1654 output[4] = WRAPLOW(step1[3] - step1[4], bd);
1655 output[5] = WRAPLOW(step1[2] - step1[5], bd);
1656 output[6] = WRAPLOW(step1[1] - step1[6], bd);
1657 output[7] = WRAPLOW(step1[0] - step1[7], bd);
1660 void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1661 int stride, int bd) {
1662 tran_low_t out[8 * 8];
1663 tran_low_t *outptr = out;
1665 tran_low_t temp_in[8], temp_out[8];
1666 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1668 // First transform rows.
1669 for (i = 0; i < 8; ++i) {
1670 highbd_idct8(input, outptr, bd);
1675 // Then transform columns.
1676 for (i = 0; i < 8; ++i) {
1677 for (j = 0; j < 8; ++j)
1678 temp_in[j] = out[j * 8 + i];
1679 highbd_idct8(temp_in, temp_out, bd);
1680 for (j = 0; j < 8; ++j) {
1681 dest[j * stride + i] = highbd_clip_pixel_add(
1682 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1687 void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1688 int stride, int bd) {
1691 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1692 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1693 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1694 a1 = ROUND_POWER_OF_TWO(out, 5);
1695 for (j = 0; j < 8; ++j) {
1696 for (i = 0; i < 8; ++i)
1697 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1702 static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
1703 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1705 tran_high_t x0 = input[0];
1706 tran_high_t x1 = input[1];
1707 tran_high_t x2 = input[2];
1708 tran_high_t x3 = input[3];
1711 if (!(x0 | x1 | x2 | x3)) {
1712 vpx_memset(output, 0, 4 * sizeof(*output));
1716 s0 = sinpi_1_9 * x0;
1717 s1 = sinpi_2_9 * x0;
1718 s2 = sinpi_3_9 * x1;
1719 s3 = sinpi_4_9 * x2;
1720 s4 = sinpi_1_9 * x2;
1721 s5 = sinpi_2_9 * x3;
1722 s6 = sinpi_4_9 * x3;
1727 x2 = sinpi_3_9 * s7;
1735 // 1-D transform scaling factor is sqrt(2).
1736 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1737 // + 1b (addition) = 29b.
1738 // Hence the output bit depth is 15b.
1739 output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
1740 output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
1741 output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
1742 output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
1745 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1746 int stride, int tx_type, int bd) {
1747 const highbd_transform_2d IHT_4[] = {
1748 { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
1749 { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
1750 { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
1751 { highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
1753 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1756 tran_low_t out[4 * 4];
1757 tran_low_t *outptr = out;
1758 tran_low_t temp_in[4], temp_out[4];
1760 // Inverse transform row vectors.
1761 for (i = 0; i < 4; ++i) {
1762 IHT_4[tx_type].rows(input, outptr, bd);
1767 // Inverse transform column vectors.
1768 for (i = 0; i < 4; ++i) {
1769 for (j = 0; j < 4; ++j)
1770 temp_in[j] = out[j * 4 + i];
1771 IHT_4[tx_type].cols(temp_in, temp_out, bd);
1772 for (j = 0; j < 4; ++j) {
1773 dest[j * stride + i] = highbd_clip_pixel_add(
1774 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1779 static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
1780 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1782 tran_high_t x0 = input[7];
1783 tran_high_t x1 = input[0];
1784 tran_high_t x2 = input[5];
1785 tran_high_t x3 = input[2];
1786 tran_high_t x4 = input[3];
1787 tran_high_t x5 = input[4];
1788 tran_high_t x6 = input[1];
1789 tran_high_t x7 = input[6];
1792 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1793 vpx_memset(output, 0, 8 * sizeof(*output));
1798 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1799 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1800 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1801 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1802 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1803 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1804 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1805 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1807 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1808 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1809 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1810 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1811 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1812 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1813 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1814 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1821 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1822 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1823 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1824 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1826 x0 = WRAPLOW(s0 + s2, bd);
1827 x1 = WRAPLOW(s1 + s3, bd);
1828 x2 = WRAPLOW(s0 - s2, bd);
1829 x3 = WRAPLOW(s1 - s3, bd);
1830 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1831 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1832 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1833 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1836 s2 = cospi_16_64 * (x2 + x3);
1837 s3 = cospi_16_64 * (x2 - x3);
1838 s6 = cospi_16_64 * (x6 + x7);
1839 s7 = cospi_16_64 * (x6 - x7);
1841 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
1842 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
1843 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
1844 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
1846 output[0] = WRAPLOW(x0, bd);
1847 output[1] = WRAPLOW(-x4, bd);
1848 output[2] = WRAPLOW(x6, bd);
1849 output[3] = WRAPLOW(-x2, bd);
1850 output[4] = WRAPLOW(x3, bd);
1851 output[5] = WRAPLOW(-x7, bd);
1852 output[6] = WRAPLOW(x5, bd);
1853 output[7] = WRAPLOW(-x1, bd);
1856 static const highbd_transform_2d HIGH_IHT_8[] = {
1857 { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
1858 { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
1859 { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
1860 { highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
1863 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1864 int stride, int tx_type, int bd) {
1866 tran_low_t out[8 * 8];
1867 tran_low_t *outptr = out;
1868 tran_low_t temp_in[8], temp_out[8];
1869 const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
1870 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1872 // Inverse transform row vectors.
1873 for (i = 0; i < 8; ++i) {
1874 ht.rows(input, outptr, bd);
1879 // Inverse transform column vectors.
1880 for (i = 0; i < 8; ++i) {
1881 for (j = 0; j < 8; ++j)
1882 temp_in[j] = out[j * 8 + i];
1883 ht.cols(temp_in, temp_out, bd);
1884 for (j = 0; j < 8; ++j) {
1885 dest[j * stride + i] = highbd_clip_pixel_add(
1886 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1891 void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
1892 int stride, int bd) {
1893 tran_low_t out[8 * 8] = { 0 };
1894 tran_low_t *outptr = out;
1896 tran_low_t temp_in[8], temp_out[8];
1897 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1899 // First transform rows.
1900 // Only first 4 row has non-zero coefs.
1901 for (i = 0; i < 4; ++i) {
1902 highbd_idct8(input, outptr, bd);
1906 // Then transform columns.
1907 for (i = 0; i < 8; ++i) {
1908 for (j = 0; j < 8; ++j)
1909 temp_in[j] = out[j * 8 + i];
1910 highbd_idct8(temp_in, temp_out, bd);
1911 for (j = 0; j < 8; ++j) {
1912 dest[j * stride + i] = highbd_clip_pixel_add(
1913 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1918 static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
1919 tran_low_t step1[16], step2[16];
1920 tran_high_t temp1, temp2;
1924 step1[0] = input[0/2];
1925 step1[1] = input[16/2];
1926 step1[2] = input[8/2];
1927 step1[3] = input[24/2];
1928 step1[4] = input[4/2];
1929 step1[5] = input[20/2];
1930 step1[6] = input[12/2];
1931 step1[7] = input[28/2];
1932 step1[8] = input[2/2];
1933 step1[9] = input[18/2];
1934 step1[10] = input[10/2];
1935 step1[11] = input[26/2];
1936 step1[12] = input[6/2];
1937 step1[13] = input[22/2];
1938 step1[14] = input[14/2];
1939 step1[15] = input[30/2];
1942 step2[0] = step1[0];
1943 step2[1] = step1[1];
1944 step2[2] = step1[2];
1945 step2[3] = step1[3];
1946 step2[4] = step1[4];
1947 step2[5] = step1[5];
1948 step2[6] = step1[6];
1949 step2[7] = step1[7];
1951 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1952 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1953 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
1954 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
1956 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1957 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1958 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
1959 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
1961 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1962 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1963 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
1964 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
1966 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1967 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1968 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
1969 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
1972 step1[0] = step2[0];
1973 step1[1] = step2[1];
1974 step1[2] = step2[2];
1975 step1[3] = step2[3];
1977 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1978 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1979 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1980 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1981 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1982 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1983 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1984 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1986 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
1987 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
1988 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
1989 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
1990 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
1991 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
1992 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
1993 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
1996 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1997 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1998 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1999 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
2000 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2001 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2002 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
2003 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
2004 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2005 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2006 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2007 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2009 step2[8] = step1[8];
2010 step2[15] = step1[15];
2011 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2012 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2013 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2014 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2015 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2016 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2017 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2018 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2019 step2[11] = step1[11];
2020 step2[12] = step1[12];
2023 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2024 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2025 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2026 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2027 step1[4] = step2[4];
2028 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2029 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2030 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2031 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2032 step1[7] = step2[7];
2034 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2035 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2036 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2037 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2038 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2039 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2040 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2041 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2044 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2045 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2046 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2047 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2048 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2049 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2050 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2051 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2052 step2[8] = step1[8];
2053 step2[9] = step1[9];
2054 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2055 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2056 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2057 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2058 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2059 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2060 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2061 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2062 step2[14] = step1[14];
2063 step2[15] = step1[15];
2066 output[0] = WRAPLOW(step2[0] + step2[15], bd);
2067 output[1] = WRAPLOW(step2[1] + step2[14], bd);
2068 output[2] = WRAPLOW(step2[2] + step2[13], bd);
2069 output[3] = WRAPLOW(step2[3] + step2[12], bd);
2070 output[4] = WRAPLOW(step2[4] + step2[11], bd);
2071 output[5] = WRAPLOW(step2[5] + step2[10], bd);
2072 output[6] = WRAPLOW(step2[6] + step2[9], bd);
2073 output[7] = WRAPLOW(step2[7] + step2[8], bd);
2074 output[8] = WRAPLOW(step2[7] - step2[8], bd);
2075 output[9] = WRAPLOW(step2[6] - step2[9], bd);
2076 output[10] = WRAPLOW(step2[5] - step2[10], bd);
2077 output[11] = WRAPLOW(step2[4] - step2[11], bd);
2078 output[12] = WRAPLOW(step2[3] - step2[12], bd);
2079 output[13] = WRAPLOW(step2[2] - step2[13], bd);
2080 output[14] = WRAPLOW(step2[1] - step2[14], bd);
2081 output[15] = WRAPLOW(step2[0] - step2[15], bd);
// Full (up to 256 non-zero coefficients) 16x16 inverse DCT for high
// bit-depth: all 16 rows are transformed into the intermediate buffer
// `out`, then all 16 columns; each column result is scaled down by 2^6
// (the transform's accumulated scaling of 64) with ROUND_POWER_OF_TWO and
// clip-added onto the 16-bit destination via highbd_clip_pixel_add().
// NOTE(review): this extraction is missing some original lines (the `i, j`
// declarations, row-loop pointer increments and closing braces); the
// comments describe only the code visible here.
2084 void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2085                                     int stride, int bd) {
2086 tran_low_t out[16 * 16];
2087 tran_low_t *outptr = out;
2089 tran_low_t temp_in[16], temp_out[16];
2090 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2092 // First transform rows.
2093 for (i = 0; i < 16; ++i) {
2094 highbd_idct16(input, outptr, bd);
2099 // Then transform columns.
2100 for (i = 0; i < 16; ++i) {
// Gather column i from the row-transformed buffer (stride 16).
2101 for (j = 0; j < 16; ++j)
2102 temp_in[j] = out[j * 16 + i];
2103 highbd_idct16(temp_in, temp_out, bd);
2104 for (j = 0; j < 16; ++j) {
2105 dest[j * stride + i] = highbd_clip_pixel_add(
2106 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// 16-point inverse ADST (asymmetric discrete sine transform), high
// bit-depth variant.  Coefficients are loaded in the ADST permutation
// order; if every input is zero the output is simply cleared.  The
// transform then runs as butterfly stages mixing pairs of values with
// cospi_* constants: products are rounded with dct_const_round_shift()
// and every intermediate is passed through WRAPLOW(), which (when
// CONFIG_EMULATE_HARDWARE is set — see file header) performs strict
// overflow wrapping to the bd-dependent width.
// NOTE(review): sampled extraction — a few original lines (e.g. the
// stage-2 pass-through computations for s0..s7 and some stage-3 lines)
// are not visible here.
2111 static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
2113 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
2114 tran_high_t s9, s10, s11, s12, s13, s14, s15;
// Permuted load of the 16 input coefficients.
2116 tran_high_t x0 = input[15];
2117 tran_high_t x1 = input[0];
2118 tran_high_t x2 = input[13];
2119 tran_high_t x3 = input[2];
2120 tran_high_t x4 = input[11];
2121 tran_high_t x5 = input[4];
2122 tran_high_t x6 = input[9];
2123 tran_high_t x7 = input[6];
2124 tran_high_t x8 = input[7];
2125 tran_high_t x9 = input[8];
2126 tran_high_t x10 = input[5];
2127 tran_high_t x11 = input[10];
2128 tran_high_t x12 = input[3];
2129 tran_high_t x13 = input[12];
2130 tran_high_t x14 = input[1];
2131 tran_high_t x15 = input[14];
// Fast path: an all-zero input block yields an all-zero output.
2134 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
2135 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
2136 vpx_memset(output, 0, 16 * sizeof(*output));
// Stage 1: rotate input pairs by the odd cospi angles.
2141 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
2142 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
2143 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
2144 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
2145 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
2146 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
2147 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
2148 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
2149 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
2150 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
2151 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
2152 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
2153 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
2154 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
2155 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
2156 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
2158 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
2159 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
2160 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
2161 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
2162 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
2163 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
2164 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
2165 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
2166 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
2167 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
2168 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
2169 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
2170 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
2171 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
2172 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
2173 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
// Stage 2 (the s0..s7 pass-through assignments from the original are not
// visible in this extraction; only the rotated upper half is shown).
2184 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
2185 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
2186 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
2187 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
2188 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
2189 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
2190 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
2191 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
2193 x0 = WRAPLOW(s0 + s4, bd);
2194 x1 = WRAPLOW(s1 + s5, bd);
2195 x2 = WRAPLOW(s2 + s6, bd);
2196 x3 = WRAPLOW(s3 + s7, bd);
2197 x4 = WRAPLOW(s0 - s4, bd);
2198 x5 = WRAPLOW(s1 - s5, bd);
2199 x6 = WRAPLOW(s2 - s6, bd);
2200 x7 = WRAPLOW(s3 - s7, bd);
2201 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
2202 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
2203 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
2204 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
2205 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
2206 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
2207 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
2208 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
// Stage 3 (some pass-through lines are likewise missing from view).
2215 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
2216 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
2217 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
2218 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
2223 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
2224 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
2225 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
2226 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
2228 x0 = WRAPLOW(s0 + s2, bd);
2229 x1 = WRAPLOW(s1 + s3, bd);
2230 x2 = WRAPLOW(s0 - s2, bd);
2231 x3 = WRAPLOW(s1 - s3, bd);
2232 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
2233 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
2234 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
2235 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
2236 x8 = WRAPLOW(s8 + s10, bd);
2237 x9 = WRAPLOW(s9 + s11, bd);
2238 x10 = WRAPLOW(s8 - s10, bd);
2239 x11 = WRAPLOW(s9 - s11, bd);
2240 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
2241 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
2242 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
2243 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
// Stage 4: final +/- cospi_16_64 rotations on the middle pairs.
2246 s2 = (- cospi_16_64) * (x2 + x3);
2247 s3 = cospi_16_64 * (x2 - x3);
2248 s6 = cospi_16_64 * (x6 + x7);
2249 s7 = cospi_16_64 * (-x6 + x7);
2250 s10 = cospi_16_64 * (x10 + x11);
2251 s11 = cospi_16_64 * (-x10 + x11);
2252 s14 = (- cospi_16_64) * (x14 + x15);
2253 s15 = cospi_16_64 * (x14 - x15);
2255 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
2256 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
2257 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
2258 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
2259 x10 = WRAPLOW(dct_const_round_shift(s10), bd);
2260 x11 = WRAPLOW(dct_const_round_shift(s11), bd);
2261 x14 = WRAPLOW(dct_const_round_shift(s14), bd);
2262 x15 = WRAPLOW(dct_const_round_shift(s15), bd);
// Output permutation with alternating sign flips.
2264 output[0] = WRAPLOW(x0, bd);
2265 output[1] = WRAPLOW(-x8, bd);
2266 output[2] = WRAPLOW(x12, bd);
2267 output[3] = WRAPLOW(-x4, bd);
2268 output[4] = WRAPLOW(x6, bd);
2269 output[5] = WRAPLOW(x14, bd);
2270 output[6] = WRAPLOW(x10, bd);
2271 output[7] = WRAPLOW(x2, bd);
2272 output[8] = WRAPLOW(x3, bd);
2273 output[9] = WRAPLOW(x11, bd);
2274 output[10] = WRAPLOW(x15, bd);
2275 output[11] = WRAPLOW(x7, bd);
2276 output[12] = WRAPLOW(x5, bd);
2277 output[13] = WRAPLOW(-x13, bd);
2278 output[14] = WRAPLOW(x9, bd);
2279 output[15] = WRAPLOW(-x1, bd);
// Dispatch table mapping tx_type to the {row, column} 1-D transforms used
// by the 16x16 hybrid inverse transform (DCT or ADST in each direction).
// NOTE(review): the closing "};" of this table is not visible in this
// extraction.
2282 static const highbd_transform_2d HIGH_IHT_16[] = {
2283 { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
2284 { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
2285 { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
2286 { highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
// 16x16 hybrid inverse transform for high bit-depth: picks the row and
// column 1-D transforms from HIGH_IHT_16 by tx_type, transforms rows into
// `out`, then columns, and clip-adds the 2^6-scaled result onto the
// destination.
// NOTE(review): sampled extraction — loop increments, `i, j` declarations
// and closing braces are not visible here.
2289 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2290                                    int stride, int tx_type, int bd) {
2292 tran_low_t out[16 * 16];
2293 tran_low_t *outptr = out;
2294 tran_low_t temp_in[16], temp_out[16];
2295 const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
2296 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows first.
2299 for (i = 0; i < 16; ++i) {
2300 ht.rows(input, outptr, bd);
// Then columns.
2306 for (i = 0; i < 16; ++i) {
2307 for (j = 0; j < 16; ++j)
2308 temp_in[j] = out[j * 16 + i];
2309 ht.cols(temp_in, temp_out, bd);
2310 for (j = 0; j < 16; ++j) {
2311 dest[j * stride + i] = highbd_clip_pixel_add(
2312 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// 16x16 inverse DCT specialized for at most 10 non-zero coefficients
// (all confined to the upper-left 4x4): `out` is zero-initialized so only
// the first 4 rows need the row transform; all 16 columns are then
// transformed as usual.
// NOTE(review): sampled extraction — pointer increments and closing
// braces are not visible here.
2317 void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2318                                    int stride, int bd) {
2319 tran_low_t out[16 * 16] = { 0 };
2320 tran_low_t *outptr = out;
2322 tran_low_t temp_in[16], temp_out[16];
2323 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2325 // First transform rows. Since all non-zero dct coefficients are in
2326 // upper-left 4x4 area, we only need to calculate first 4 rows here.
2327 for (i = 0; i < 4; ++i) {
2328 highbd_idct16(input, outptr, bd);
2333 // Then transform columns.
2334 for (i = 0; i < 16; ++i) {
2335 for (j = 0; j < 16; ++j)
2336 temp_in[j] = out[j*16 + i];
2337 highbd_idct16(temp_in, temp_out, bd);
2338 for (j = 0; j < 16; ++j) {
2339 dest[j * stride + i] = highbd_clip_pixel_add(
2340 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// DC-only 16x16 inverse DCT: the single coefficient is scaled by
// cospi_16_64 twice (row and column pass of the 2-D transform collapse to
// this), rounded down by 2^6, and the resulting constant a1 is clip-added
// to every pixel of the 16x16 destination block.
// NOTE(review): the `dest += stride` advance between rows and the closing
// braces are not visible in this extraction.
2345 void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2346                                   int stride, int bd) {
2349 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2350 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2352 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2353 a1 = ROUND_POWER_OF_TWO(out, 6);
2354 for (j = 0; j < 16; ++j) {
2355 for (i = 0; i < 16; ++i)
2356 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
// 32-point 1-D inverse DCT, high bit-depth variant.  Structured as a
// sequence of butterfly stages alternating between the step1[] and
// step2[] work arrays: even-indexed inputs seed the embedded 16-point
// transform, odd-indexed inputs are rotated by cospi_* constants into
// step1[16..31].  Products are rounded with dct_const_round_shift() and
// every intermediate passes through WRAPLOW() (strict bd-dependent
// overflow wrapping when CONFIG_EMULATE_HARDWARE is set — see file
// header).
// NOTE(review): sampled extraction — the original stage-marker comments
// and a few blank/brace lines are missing; the stage labels below are
// inferred from the visible step1/step2 alternation.
2361 static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
2362 tran_low_t step1[32], step2[32];
2363 tran_high_t temp1, temp2;
// Stage 1: even inputs loaded in transform order.
2367 step1[0] = input[0];
2368 step1[1] = input[16];
2369 step1[2] = input[8];
2370 step1[3] = input[24];
2371 step1[4] = input[4];
2372 step1[5] = input[20];
2373 step1[6] = input[12];
2374 step1[7] = input[28];
2375 step1[8] = input[2];
2376 step1[9] = input[18];
2377 step1[10] = input[10];
2378 step1[11] = input[26];
2379 step1[12] = input[6];
2380 step1[13] = input[22];
2381 step1[14] = input[14];
2382 step1[15] = input[30];
// Odd inputs rotated pairwise into step1[16..31].
2384 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2385 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2386 step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
2387 step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
2389 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2390 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2391 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2392 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2394 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2395 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2396 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2397 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2399 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2400 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2401 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2402 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2404 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2405 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2406 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2407 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2409 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2410 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2411 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2412 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2414 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2415 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2416 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2417 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2419 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2420 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2421 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2422 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
// Stage 2.
2425 step2[0] = step1[0];
2426 step2[1] = step1[1];
2427 step2[2] = step1[2];
2428 step2[3] = step1[3];
2429 step2[4] = step1[4];
2430 step2[5] = step1[5];
2431 step2[6] = step1[6];
2432 step2[7] = step1[7];
2434 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2435 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2436 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
2437 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
2439 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2440 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2441 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2442 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2444 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2445 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2446 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2447 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2449 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2450 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2451 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2452 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2454 step2[16] = WRAPLOW(step1[16] + step1[17], bd);
2455 step2[17] = WRAPLOW(step1[16] - step1[17], bd);
2456 step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
2457 step2[19] = WRAPLOW(step1[18] + step1[19], bd);
2458 step2[20] = WRAPLOW(step1[20] + step1[21], bd);
2459 step2[21] = WRAPLOW(step1[20] - step1[21], bd);
2460 step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
2461 step2[23] = WRAPLOW(step1[22] + step1[23], bd);
2462 step2[24] = WRAPLOW(step1[24] + step1[25], bd);
2463 step2[25] = WRAPLOW(step1[24] - step1[25], bd);
2464 step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
2465 step2[27] = WRAPLOW(step1[26] + step1[27], bd);
2466 step2[28] = WRAPLOW(step1[28] + step1[29], bd);
2467 step2[29] = WRAPLOW(step1[28] - step1[29], bd);
2468 step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
2469 step2[31] = WRAPLOW(step1[30] + step1[31], bd);
// Stage 3.
2472 step1[0] = step2[0];
2473 step1[1] = step2[1];
2474 step1[2] = step2[2];
2475 step1[3] = step2[3];
2477 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2478 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2479 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
2480 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
2481 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2482 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2483 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2484 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2486 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
2487 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
2488 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
2489 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
2490 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
2491 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
2492 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
2493 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
2495 step1[16] = step2[16];
2496 step1[31] = step2[31];
2497 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2498 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2499 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2500 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2501 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2502 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2503 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2504 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2505 step1[19] = step2[19];
2506 step1[20] = step2[20];
2507 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2508 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2509 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2510 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2511 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2512 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2513 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2514 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2515 step1[23] = step2[23];
2516 step1[24] = step2[24];
2517 step1[27] = step2[27];
2518 step1[28] = step2[28];
// Stage 4.
2521 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2522 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2523 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
2524 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
2525 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2526 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2527 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
2528 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
2529 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2530 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2531 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2532 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2534 step2[8] = step1[8];
2535 step2[15] = step1[15];
2536 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2537 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2538 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2539 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2540 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2541 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2542 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2543 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2544 step2[11] = step1[11];
2545 step2[12] = step1[12];
2547 step2[16] = WRAPLOW(step1[16] + step1[19], bd);
2548 step2[17] = WRAPLOW(step1[17] + step1[18], bd);
2549 step2[18] = WRAPLOW(step1[17] - step1[18], bd);
2550 step2[19] = WRAPLOW(step1[16] - step1[19], bd);
2551 step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
2552 step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
2553 step2[22] = WRAPLOW(step1[21] + step1[22], bd);
2554 step2[23] = WRAPLOW(step1[20] + step1[23], bd);
2556 step2[24] = WRAPLOW(step1[24] + step1[27], bd);
2557 step2[25] = WRAPLOW(step1[25] + step1[26], bd);
2558 step2[26] = WRAPLOW(step1[25] - step1[26], bd);
2559 step2[27] = WRAPLOW(step1[24] - step1[27], bd);
2560 step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
2561 step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
2562 step2[30] = WRAPLOW(step1[29] + step1[30], bd);
2563 step2[31] = WRAPLOW(step1[28] + step1[31], bd);
// Stage 5.
2566 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2567 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2568 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2569 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2570 step1[4] = step2[4];
2571 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2572 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2573 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2574 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2575 step1[7] = step2[7];
2577 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2578 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2579 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2580 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2581 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2582 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2583 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2584 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2586 step1[16] = step2[16];
2587 step1[17] = step2[17];
2588 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2589 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2590 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2591 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2592 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2593 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2594 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2595 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2596 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2597 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2598 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2599 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2600 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2601 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2602 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2603 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2604 step1[22] = step2[22];
2605 step1[23] = step2[23];
2606 step1[24] = step2[24];
2607 step1[25] = step2[25];
2608 step1[30] = step2[30];
2609 step1[31] = step2[31];
// Stage 6.
2612 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2613 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2614 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2615 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2616 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2617 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2618 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2619 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2620 step2[8] = step1[8];
2621 step2[9] = step1[9];
2622 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2623 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2624 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2625 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2626 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2627 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2628 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2629 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2630 step2[14] = step1[14];
2631 step2[15] = step1[15];
2633 step2[16] = WRAPLOW(step1[16] + step1[23], bd);
2634 step2[17] = WRAPLOW(step1[17] + step1[22], bd);
2635 step2[18] = WRAPLOW(step1[18] + step1[21], bd);
2636 step2[19] = WRAPLOW(step1[19] + step1[20], bd);
2637 step2[20] = WRAPLOW(step1[19] - step1[20], bd);
2638 step2[21] = WRAPLOW(step1[18] - step1[21], bd);
2639 step2[22] = WRAPLOW(step1[17] - step1[22], bd);
2640 step2[23] = WRAPLOW(step1[16] - step1[23], bd);
2642 step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
2643 step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
2644 step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
2645 step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
2646 step2[28] = WRAPLOW(step1[27] + step1[28], bd);
2647 step2[29] = WRAPLOW(step1[26] + step1[29], bd);
2648 step2[30] = WRAPLOW(step1[25] + step1[30], bd);
2649 step2[31] = WRAPLOW(step1[24] + step1[31], bd);
// Stage 7.
2652 step1[0] = WRAPLOW(step2[0] + step2[15], bd);
2653 step1[1] = WRAPLOW(step2[1] + step2[14], bd);
2654 step1[2] = WRAPLOW(step2[2] + step2[13], bd);
2655 step1[3] = WRAPLOW(step2[3] + step2[12], bd);
2656 step1[4] = WRAPLOW(step2[4] + step2[11], bd);
2657 step1[5] = WRAPLOW(step2[5] + step2[10], bd);
2658 step1[6] = WRAPLOW(step2[6] + step2[9], bd);
2659 step1[7] = WRAPLOW(step2[7] + step2[8], bd);
2660 step1[8] = WRAPLOW(step2[7] - step2[8], bd);
2661 step1[9] = WRAPLOW(step2[6] - step2[9], bd);
2662 step1[10] = WRAPLOW(step2[5] - step2[10], bd);
2663 step1[11] = WRAPLOW(step2[4] - step2[11], bd);
2664 step1[12] = WRAPLOW(step2[3] - step2[12], bd);
2665 step1[13] = WRAPLOW(step2[2] - step2[13], bd);
2666 step1[14] = WRAPLOW(step2[1] - step2[14], bd);
2667 step1[15] = WRAPLOW(step2[0] - step2[15], bd);
2669 step1[16] = step2[16];
2670 step1[17] = step2[17];
2671 step1[18] = step2[18];
2672 step1[19] = step2[19];
2673 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2674 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2675 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2676 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2677 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2678 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2679 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2680 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2681 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2682 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2683 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2684 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2685 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2686 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2687 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2688 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
2689 step1[28] = step2[28];
2690 step1[29] = step2[29];
2691 step1[30] = step2[30];
2692 step1[31] = step2[31];
// Final stage: symmetric sums and differences form the 32 outputs.
2695 output[0] = WRAPLOW(step1[0] + step1[31], bd);
2696 output[1] = WRAPLOW(step1[1] + step1[30], bd);
2697 output[2] = WRAPLOW(step1[2] + step1[29], bd);
2698 output[3] = WRAPLOW(step1[3] + step1[28], bd);
2699 output[4] = WRAPLOW(step1[4] + step1[27], bd);
2700 output[5] = WRAPLOW(step1[5] + step1[26], bd);
2701 output[6] = WRAPLOW(step1[6] + step1[25], bd);
2702 output[7] = WRAPLOW(step1[7] + step1[24], bd);
2703 output[8] = WRAPLOW(step1[8] + step1[23], bd);
2704 output[9] = WRAPLOW(step1[9] + step1[22], bd);
2705 output[10] = WRAPLOW(step1[10] + step1[21], bd);
2706 output[11] = WRAPLOW(step1[11] + step1[20], bd);
2707 output[12] = WRAPLOW(step1[12] + step1[19], bd);
2708 output[13] = WRAPLOW(step1[13] + step1[18], bd);
2709 output[14] = WRAPLOW(step1[14] + step1[17], bd);
2710 output[15] = WRAPLOW(step1[15] + step1[16], bd);
2711 output[16] = WRAPLOW(step1[15] - step1[16], bd);
2712 output[17] = WRAPLOW(step1[14] - step1[17], bd);
2713 output[18] = WRAPLOW(step1[13] - step1[18], bd);
2714 output[19] = WRAPLOW(step1[12] - step1[19], bd);
2715 output[20] = WRAPLOW(step1[11] - step1[20], bd);
2716 output[21] = WRAPLOW(step1[10] - step1[21], bd);
2717 output[22] = WRAPLOW(step1[9] - step1[22], bd);
2718 output[23] = WRAPLOW(step1[8] - step1[23], bd);
2719 output[24] = WRAPLOW(step1[7] - step1[24], bd);
2720 output[25] = WRAPLOW(step1[6] - step1[25], bd);
2721 output[26] = WRAPLOW(step1[5] - step1[26], bd);
2722 output[27] = WRAPLOW(step1[4] - step1[27], bd);
2723 output[28] = WRAPLOW(step1[3] - step1[28], bd);
2724 output[29] = WRAPLOW(step1[2] - step1[29], bd);
2725 output[30] = WRAPLOW(step1[1] - step1[30], bd);
2726 output[31] = WRAPLOW(step1[0] - step1[31], bd);
// Full 32x32 inverse DCT for high bit-depth.  For each row, a reduction
// tree ORs pairs of coefficients down to two words (zero_coeff[0|1]); the
// row transform is run only if any coefficient is non-zero, otherwise the
// 32-entry output row is memset to zero — a cheap skip for sparse blocks.
// Columns are then transformed and clip-added with the 2^6 round-shift.
// NOTE(review): sampled extraction — the `else` keyword before the memset,
// pointer increments and closing braces are not visible here.
2729 void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2730                                      int stride, int bd) {
2731 tran_low_t out[32 * 32];
2732 tran_low_t *outptr = out;
2734 tran_low_t temp_in[32], temp_out[32];
2735 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2738 for (i = 0; i < 32; ++i) {
2739 tran_low_t zero_coeff[16];
// OR-reduce 32 coefficients -> 16 -> 8 -> 4 -> 2 to detect an all-zero row.
2740 for (j = 0; j < 16; ++j)
2741 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2742 for (j = 0; j < 8; ++j)
2743 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2744 for (j = 0; j < 4; ++j)
2745 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2746 for (j = 0; j < 2; ++j)
2747 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2749 if (zero_coeff[0] | zero_coeff[1])
2750 highbd_idct32(input, outptr, bd);
2752 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
// Columns.
2758 for (i = 0; i < 32; ++i) {
2759 for (j = 0; j < 32; ++j)
2760 temp_in[j] = out[j * 32 + i];
2761 highbd_idct32(temp_in, temp_out, bd);
2762 for (j = 0; j < 32; ++j) {
2763 dest[j * stride + i] = highbd_clip_pixel_add(
2764 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// 32x32 inverse DCT specialized for at most 34 non-zero coefficients
// (confined to the upper-left 8x8): `out` is zero-initialized so only the
// first 8 rows need the row transform; all 32 columns are transformed and
// clip-added with the 2^6 round-shift.
// NOTE(review): sampled extraction — pointer increments and closing
// braces are not visible here.
2769 void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2770                                    int stride, int bd) {
2771 tran_low_t out[32 * 32] = {0};
2772 tran_low_t *outptr = out;
2774 tran_low_t temp_in[32], temp_out[32];
2775 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2778 // Only upper-left 8x8 has non-zero coeff.
2779 for (i = 0; i < 8; ++i) {
2780 highbd_idct32(input, outptr, bd);
// Columns.
2785 for (i = 0; i < 32; ++i) {
2786 for (j = 0; j < 32; ++j)
2787 temp_in[j] = out[j * 32 + i];
2788 highbd_idct32(temp_in, temp_out, bd);
2789 for (j = 0; j < 32; ++j) {
2790 dest[j * stride + i] = highbd_clip_pixel_add(
2791 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// DC-only 32x32 inverse DCT: scale the single coefficient by cospi_16_64
// twice (the collapsed row and column passes), round-shift by 2^6 and
// clip-add the constant a1 to every pixel of the 32x32 block.
// NOTE(review): the `dest += stride` row advance and closing braces are
// not visible in this extraction.
2796 void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2797                                   int stride, int bd) {
2800 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2802 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2803 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2804 a1 = ROUND_POWER_OF_TWO(out, 6);
2806 for (j = 0; j < 32; ++j) {
2807 for (i = 0; i < 32; ++i)
2808 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
// 4x4 inverse DCT dispatcher: chooses between the full 16-coefficient
// version and the DC-only version.
// NOTE(review): the trailing parameter list (presumably `int eob, int bd`)
// and the eob-based condition lines are not visible in this extraction;
// confirm the dispatch condition against the original source.
2814 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2817 vp9_highbd_idct4x4_16_add(input, dest, stride, bd);
2819 vp9_highbd_idct4x4_1_add(input, dest, stride, bd);
// 4x4 inverse Walsh-Hadamard transform dispatcher (lossless mode):
// chooses between the full 16-coefficient version and the DC-only
// version.
// NOTE(review): the trailing parameter list and the eob-based condition
// lines are not visible in this extraction.
2823 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2826 vp9_highbd_iwht4x4_16_add(input, dest, stride, bd);
2828 vp9_highbd_iwht4x4_1_add(input, dest, stride, bd);
// 8x8 inverse DCT dispatcher, selecting a specialized kernel by the
// end-of-block index: DC-only (visible condition missing), eob <= 10
// (coefficients confined to the top-left corner), or the full 64
// coefficient transform.
// NOTE(review): the trailing parameter list (presumably `int eob, int bd`)
// and the `if (eob == 1)` line are not visible in this extraction.
2831 void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
2833 // If dc is 1, then input[0] is the reconstructed value, do not need
2834 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
2836 // The calculation can be simplified if there are not many non-zero dct
2837 // coefficients. Use eobs to decide what to do.
2838 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
2839 // Combine that with code here.
2840 // DC only DCT coefficient
2842 vp9_highbd_idct8x8_1_add(input, dest, stride, bd);
2843 } else if (eob <= 10) {
2844 vp9_highbd_idct8x8_10_add(input, dest, stride, bd);
2846 vp9_highbd_idct8x8_64_add(input, dest, stride, bd);
// 16x16 inverse DCT dispatcher: DC-only, eob <= 10 (sparse, top-left
// 4x4), or the full 256-coefficient transform.
// NOTE(review): the `if (eob == 1)` line is not visible in this
// extraction.
2850 void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
2851                               int stride, int eob, int bd) {
2852 // The calculation can be simplified if there are not many non-zero dct
2853 // coefficients. Use eobs to separate different cases.
2854 // DC only DCT coefficient.
2856 vp9_highbd_idct16x16_1_add(input, dest, stride, bd);
2857 } else if (eob <= 10) {
2858 vp9_highbd_idct16x16_10_add(input, dest, stride, bd);
2860 vp9_highbd_idct16x16_256_add(input, dest, stride, bd);
// 32x32 inverse DCT dispatcher: DC-only, eob <= 34 (non-zero
// coefficients confined to the upper-left 8x8), or the full
// 1024-coefficient transform.
// NOTE(review): the `if (eob == 1)` line is not visible in this
// extraction.
2864 void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
2865                               int stride, int eob, int bd) {
2866 // Non-zero coeff only in upper-left 8x8
2868 vp9_highbd_idct32x32_1_add(input, dest, stride, bd);
2869 } else if (eob <= 34) {
2870 vp9_highbd_idct32x32_34_add(input, dest, stride, bd);
2872 vp9_highbd_idct32x32_1024_add(input, dest, stride, bd);
// 4x4 hybrid transform dispatcher: pure DCT_DCT uses the eob-aware DCT
// path; any ADST combination goes to the generic hybrid kernel.
2877 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
2878                            uint8_t *dest, int stride, int eob, int bd) {
2879 if (tx_type == DCT_DCT)
2880 vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
2882 vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
// 8x8 hybrid transform dispatcher: pure DCT_DCT uses the eob-aware DCT
// path; any ADST combination goes to the generic hybrid kernel.
2885 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
2886                            uint8_t *dest, int stride, int eob, int bd) {
2887 if (tx_type == DCT_DCT) {
2888 vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
2890 vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
// 16x16 hybrid transform dispatcher: pure DCT_DCT uses the eob-aware DCT
// path; any ADST combination goes to the generic hybrid kernel.
2894 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
2895                              uint8_t *dest, int stride, int eob, int bd) {
2896 if (tx_type == DCT_DCT) {
2897 vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
2899 vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
2902 #endif // CONFIG_VP9_HIGHBITDEPTH