2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
14 #include "./vp10_rtcd.h"
15 #include "./vpx_config.h"
16 #include "./vpx_dsp_rtcd.h"
18 #include "vp10/common/vp9_blockd.h"
19 #include "vp10/common/vp9_idct.h"
20 #include "vp10/common/vp9_systemdependent.h"
21 #include "vpx_dsp/fwd_txfm.h"
22 #include "vpx_ports/mem.h"
// fdct4: 4-point 1-D DCT kernel (the 'f' prefix presumably means "forward").
// Reads input[0..3] and writes output[0..3]; products are accumulated in
// tran_high_t and narrowed to tran_low_t after fdct_round_shift().
// NOTE(review): the declaration of step[] and the closing brace are not
// visible in this listing - confirm against the full source.
24 static void fdct4(const tran_low_t *input, tran_low_t *output) {
26 tran_high_t temp1, temp2;
// Butterfly: sums of mirrored samples go to step[0..1], differences to
// step[2..3].
28 step[0] = input[0] + input[3];
29 step[1] = input[1] + input[2];
30 step[2] = input[1] - input[2];
31 step[3] = input[0] - input[3];
// Even outputs (0 and 2): sum and difference of the two sum terms, both
// weighted by cospi_16_64.
33 temp1 = (step[0] + step[1]) * cospi_16_64;
34 temp2 = (step[0] - step[1]) * cospi_16_64;
35 output[0] = (tran_low_t)fdct_round_shift(temp1);
36 output[2] = (tran_low_t)fdct_round_shift(temp2);
// Odd outputs (1 and 3): the two difference terms combined with the
// cospi_24_64 / cospi_8_64 weight pair.
37 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
38 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
39 output[1] = (tran_low_t)fdct_round_shift(temp1);
40 output[3] = (tran_low_t)fdct_round_shift(temp2);
// fdct8: 8-point 1-D DCT kernel. Reads input[0..7], writes output[0..7].
// Even-indexed outputs are derived from the sums s0..s3, odd-indexed outputs
// from the differences s4..s7.
// NOTE(review): the assignments to x0..x3 (before each group of t0..t3
// computations) are not visible in this listing - confirm against the full
// source before relying on the exact data flow.
43 static void fdct8(const tran_low_t *input, tran_low_t *output) {
44 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
45 tran_high_t t0, t1, t2, t3; // needs32
46 tran_high_t x0, x1, x2, x3; // canbe16
// First butterfly: sums of mirrored samples in s0..s3, differences in s4..s7.
49 s0 = input[0] + input[7];
50 s1 = input[1] + input[6];
51 s2 = input[2] + input[5];
52 s3 = input[3] + input[4];
53 s4 = input[3] - input[4];
54 s5 = input[2] - input[5];
55 s6 = input[1] - input[6];
56 s7 = input[0] - input[7];
// Even outputs 0, 2, 4, 6: same weight pattern as the 4-point kernel, applied
// to x0..x3.
63 t0 = (x0 + x1) * cospi_16_64;
64 t1 = (x0 - x1) * cospi_16_64;
65 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
66 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
67 output[0] = (tran_low_t)fdct_round_shift(t0);
68 output[2] = (tran_low_t)fdct_round_shift(t2);
69 output[4] = (tran_low_t)fdct_round_shift(t1);
70 output[6] = (tran_low_t)fdct_round_shift(t3);
// Odd half: s5 and s6 are first mixed with cospi_16_64 and rounded; t2/t3 now
// hold those rounded intermediates (reusing the temporaries from above).
73 t0 = (s6 - s5) * cospi_16_64;
74 t1 = (s6 + s5) * cospi_16_64;
75 t2 = (tran_low_t)fdct_round_shift(t0);
76 t3 = (tran_low_t)fdct_round_shift(t1);
// Odd outputs 1, 3, 5, 7: x0/x3 use the cospi_28_64 / cospi_4_64 pair and
// x1/x2 use the cospi_12_64 / cospi_20_64 pair.
85 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
86 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
87 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
88 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
89 output[1] = (tran_low_t)fdct_round_shift(t0);
90 output[3] = (tran_low_t)fdct_round_shift(t2);
91 output[5] = (tran_low_t)fdct_round_shift(t1);
92 output[7] = (tran_low_t)fdct_round_shift(t3);
// fdct16: 16-point 1-D DCT kernel. Reads in[0..15], writes out[0..15].
// The sums of mirrored samples (input[]) produce the even-indexed outputs via
// an inlined 8-point stage; the differences (step1[]) produce the odd-indexed
// outputs through the step2/step3 stages further down.
// NOTE(review): several statements (the x0..x3 assignments, step2[0..1] and
// step2[6..7] if any, scope braces) are not visible in this listing - confirm
// against the full source.
95 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
96 tran_high_t step1[8]; // canbe16
97 tran_high_t step2[8]; // canbe16
98 tran_high_t step3[8]; // canbe16
99 tran_high_t input[8]; // canbe16
100 tran_high_t temp1, temp2; // needs32
// Sums of mirrored samples: in[k] + in[15 - k].
103 input[0] = in[0] + in[15];
104 input[1] = in[1] + in[14];
105 input[2] = in[2] + in[13];
106 input[3] = in[3] + in[12];
107 input[4] = in[4] + in[11];
108 input[5] = in[5] + in[10];
109 input[6] = in[6] + in[ 9];
110 input[7] = in[7] + in[ 8];
// Differences of mirrored samples, stored innermost pair first:
// step1[k] = in[7 - k] - in[8 + k].
112 step1[0] = in[7] - in[ 8];
113 step1[1] = in[6] - in[ 9];
114 step1[2] = in[5] - in[10];
115 step1[3] = in[4] - in[11];
116 step1[4] = in[3] - in[12];
117 step1[5] = in[2] - in[13];
118 step1[6] = in[1] - in[14];
119 step1[7] = in[0] - in[15];
// Inlined 8-point stage on input[]; its results land on the even indices of
// out[] (0, 2, 4, ..., 14).
121 // fdct8(step, step);
123 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
124 tran_high_t t0, t1, t2, t3; // needs32
125 tran_high_t x0, x1, x2, x3; // canbe16
128 s0 = input[0] + input[7];
129 s1 = input[1] + input[6];
130 s2 = input[2] + input[5];
131 s3 = input[3] + input[4];
132 s4 = input[3] - input[4];
133 s5 = input[2] - input[5];
134 s6 = input[1] - input[6];
135 s7 = input[0] - input[7];
137 // fdct4(step, step);
// Outputs 0, 4, 8, 12.
142 t0 = (x0 + x1) * cospi_16_64;
143 t1 = (x0 - x1) * cospi_16_64;
144 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
145 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
146 out[0] = (tran_low_t)fdct_round_shift(t0);
147 out[4] = (tran_low_t)fdct_round_shift(t2);
148 out[8] = (tran_low_t)fdct_round_shift(t1);
149 out[12] = (tran_low_t)fdct_round_shift(t3);
// s5/s6 mixed with cospi_16_64 and rounded; t2/t3 hold the rounded values.
152 t0 = (s6 - s5) * cospi_16_64;
153 t1 = (s6 + s5) * cospi_16_64;
154 t2 = fdct_round_shift(t0);
155 t3 = fdct_round_shift(t1);
// Outputs 2, 6, 10, 14.
164 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
165 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
166 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
167 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
168 out[2] = (tran_low_t)fdct_round_shift(t0);
169 out[6] = (tran_low_t)fdct_round_shift(t2);
170 out[10] = (tran_low_t)fdct_round_shift(t1);
171 out[14] = (tran_low_t)fdct_round_shift(t3);
// Odd half, first stage: the middle four differences step1[2..5] are mixed
// pairwise with cospi_16_64 into step2[2..5].
175 temp1 = (step1[5] - step1[2]) * cospi_16_64;
176 temp2 = (step1[4] - step1[3]) * cospi_16_64;
177 step2[2] = fdct_round_shift(temp1);
178 step2[3] = fdct_round_shift(temp2);
179 temp1 = (step1[4] + step1[3]) * cospi_16_64;
180 temp2 = (step1[5] + step1[2]) * cospi_16_64;
181 step2[4] = fdct_round_shift(temp1);
182 step2[5] = fdct_round_shift(temp2);
// Butterfly combining the untouched outer differences (step1[0,1,6,7]) with
// the mixed values from step2[2..5].
185 step3[0] = step1[0] + step2[3];
186 step3[1] = step1[1] + step2[2];
187 step3[2] = step1[1] - step2[2];
188 step3[3] = step1[0] - step2[3];
189 step3[4] = step1[7] - step2[4];
190 step3[5] = step1[6] - step2[5];
191 step3[6] = step1[6] + step2[5];
192 step3[7] = step1[7] + step2[4];
// step3[1,2,5,6] are combined with the cospi_8_64 / cospi_24_64 pair;
// step2[1,2,5,6] are overwritten with the rounded results.
195 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
196 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
197 step2[1] = fdct_round_shift(temp1);
198 step2[2] = fdct_round_shift(temp2);
199 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
200 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
201 step2[5] = fdct_round_shift(temp1);
202 step2[6] = fdct_round_shift(temp2);
// Final butterfly; step1[] is reused as scratch for its results.
205 step1[0] = step3[0] + step2[1];
206 step1[1] = step3[0] - step2[1];
207 step1[2] = step3[3] + step2[2];
208 step1[3] = step3[3] - step2[2];
209 step1[4] = step3[4] - step2[5];
210 step1[5] = step3[4] + step2[5];
211 step1[6] = step3[7] - step2[6];
212 step1[7] = step3[7] + step2[6];
// Odd outputs: each pair (step1[k], step1[7 - k]) yields two outputs using one
// pair of cospi weights.
215 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
216 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
217 out[1] = (tran_low_t)fdct_round_shift(temp1);
218 out[9] = (tran_low_t)fdct_round_shift(temp2);
220 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
221 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
222 out[5] = (tran_low_t)fdct_round_shift(temp1);
223 out[13] = (tran_low_t)fdct_round_shift(temp2);
225 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
226 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
227 out[3] = (tran_low_t)fdct_round_shift(temp1);
228 out[11] = (tran_low_t)fdct_round_shift(temp2);
230 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
231 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
232 out[7] = (tran_low_t)fdct_round_shift(temp1);
233 out[15] = (tran_low_t)fdct_round_shift(temp2);
// fadst4: 4-point 1-D kernel used for the ADST entries of FHT_4.
// Writes output[0..3]; an all-zero x0..x3 produces an all-zero output.
// NOTE(review): how x0..x3 are loaded and how s0..s7 are computed is not
// visible in this listing - confirm against the full source.
236 static void fadst4(const tran_low_t *input, tran_low_t *output) {
237 tran_high_t x0, x1, x2, x3;
238 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
// Zero check: OR-ing the four values is non-zero iff any of them is non-zero.
245 if (!(x0 | x1 | x2 | x3)) {
246 output[0] = output[1] = output[2] = output[3] = 0;
269 // 1-D transform scaling factor is sqrt(2).
270 output[0] = (tran_low_t)fdct_round_shift(s0);
271 output[1] = (tran_low_t)fdct_round_shift(s1);
272 output[2] = (tran_low_t)fdct_round_shift(s2);
273 output[3] = (tran_low_t)fdct_round_shift(s3);
// fadst8: 8-point 1-D kernel used for the ADST entries of FHT_8.
// Reads input[0..7] in a permuted order, runs three weight-and-butterfly
// stages, and writes output[0..7] with the odd-indexed outputs negated.
// NOTE(review): parts of the later stages (e.g. s0..s3 and x0..x3 updates
// after the first stage) are not visible in this listing - confirm.
276 static void fadst8(const tran_low_t *input, tran_low_t *output) {
277 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
// Input permutation: even x's take samples from the top end going down
// (7, 5, 3, 1), odd x's from the bottom end going up (0, 2, 4, 6).
279 tran_high_t x0 = input[7];
280 tran_high_t x1 = input[0];
281 tran_high_t x2 = input[5];
282 tran_high_t x3 = input[2];
283 tran_high_t x4 = input[3];
284 tran_high_t x5 = input[4];
285 tran_high_t x6 = input[1];
286 tran_high_t x7 = input[6];
// First stage: each pair (x0,x1), (x2,x3), (x4,x5), (x6,x7) is combined with
// its own pair of cospi weights.
289 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
290 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
291 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
292 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
293 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
294 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
295 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
296 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
// Butterfly s[k] +/- s[k + 4]; rounding happens after the add/subtract.
298 x0 = fdct_round_shift(s0 + s4);
299 x1 = fdct_round_shift(s1 + s5);
300 x2 = fdct_round_shift(s2 + s6);
301 x3 = fdct_round_shift(s3 + s7);
302 x4 = fdct_round_shift(s0 - s4);
303 x5 = fdct_round_shift(s1 - s5);
304 x6 = fdct_round_shift(s2 - s6);
305 x7 = fdct_round_shift(s3 - s7);
// Second stage: only the x4..x7 half is weighted here (cospi_8_64 /
// cospi_24_64).
312 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
313 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
314 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7;
315 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
321 x4 = fdct_round_shift(s4 + s6);
322 x5 = fdct_round_shift(s5 + s7);
323 x6 = fdct_round_shift(s4 - s6);
324 x7 = fdct_round_shift(s5 - s7);
// Third stage: sum/difference of (x2,x3) and (x6,x7), weighted by cospi_16_64.
327 s2 = cospi_16_64 * (x2 + x3);
328 s3 = cospi_16_64 * (x2 - x3);
329 s6 = cospi_16_64 * (x6 + x7);
330 s7 = cospi_16_64 * (x6 - x7);
332 x2 = fdct_round_shift(s2);
333 x3 = fdct_round_shift(s3);
334 x6 = fdct_round_shift(s6);
335 x7 = fdct_round_shift(s7);
// Output permutation; every odd-indexed output is negated.
337 output[0] = (tran_low_t)x0;
338 output[1] = (tran_low_t)-x4;
339 output[2] = (tran_low_t)x6;
340 output[3] = (tran_low_t)-x2;
341 output[4] = (tran_low_t)x3;
342 output[5] = (tran_low_t)-x7;
343 output[6] = (tran_low_t)x5;
344 output[7] = (tran_low_t)-x1;
// fadst16: 16-point 1-D kernel used for the ADST entries of FHT_16.
// Reads input[0..15] in a permuted order, runs four weight-and-butterfly
// stages, and writes output[0..15] in a permuted order with outputs 1, 3, 13
// and 15 negated.
// NOTE(review): the pass-through assignments of the later stages (e.g. s0..s7
// in stage 2, x0..x3 / x8..x11 in stage 3) are not visible in this listing -
// confirm against the full source.
347 static void fadst16(const tran_low_t *input, tran_low_t *output) {
348 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
349 tran_high_t s9, s10, s11, s12, s13, s14, s15;
// Input permutation: even x's take odd-indexed samples from the top down
// (15, 13, ..., 1), odd x's take even-indexed samples from the bottom up
// (0, 2, ..., 14).
351 tran_high_t x0 = input[15];
352 tran_high_t x1 = input[0];
353 tran_high_t x2 = input[13];
354 tran_high_t x3 = input[2];
355 tran_high_t x4 = input[11];
356 tran_high_t x5 = input[4];
357 tran_high_t x6 = input[9];
358 tran_high_t x7 = input[6];
359 tran_high_t x8 = input[7];
360 tran_high_t x9 = input[8];
361 tran_high_t x10 = input[5];
362 tran_high_t x11 = input[10];
363 tran_high_t x12 = input[3];
364 tran_high_t x13 = input[12];
365 tran_high_t x14 = input[1];
366 tran_high_t x15 = input[14];
// Stage 1: each adjacent pair (x[2k], x[2k+1]) is combined with its own pair
// of odd-numbered cospi weights.
369 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
370 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
371 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
372 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
373 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
374 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
375 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
376 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
377 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
378 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
379 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
380 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
381 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
382 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
383 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
384 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
// Butterfly s[k] +/- s[k + 8]; rounding happens after the add/subtract.
386 x0 = fdct_round_shift(s0 + s8);
387 x1 = fdct_round_shift(s1 + s9);
388 x2 = fdct_round_shift(s2 + s10);
389 x3 = fdct_round_shift(s3 + s11);
390 x4 = fdct_round_shift(s4 + s12);
391 x5 = fdct_round_shift(s5 + s13);
392 x6 = fdct_round_shift(s6 + s14);
393 x7 = fdct_round_shift(s7 + s15);
394 x8 = fdct_round_shift(s0 - s8);
395 x9 = fdct_round_shift(s1 - s9);
396 x10 = fdct_round_shift(s2 - s10);
397 x11 = fdct_round_shift(s3 - s11);
398 x12 = fdct_round_shift(s4 - s12);
399 x13 = fdct_round_shift(s5 - s13);
400 x14 = fdct_round_shift(s6 - s14);
401 x15 = fdct_round_shift(s7 - s15);
// Stage 2: only x8..x15 are weighted here (cospi_4/28 and cospi_12/20 pairs).
412 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
413 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
414 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
415 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
416 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
417 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
418 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
419 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
429 x8 = fdct_round_shift(s8 + s12);
430 x9 = fdct_round_shift(s9 + s13);
431 x10 = fdct_round_shift(s10 + s14);
432 x11 = fdct_round_shift(s11 + s15);
433 x12 = fdct_round_shift(s8 - s12);
434 x13 = fdct_round_shift(s9 - s13);
435 x14 = fdct_round_shift(s10 - s14);
436 x15 = fdct_round_shift(s11 - s15);
// Stage 3: x4..x7 and x12..x15 are weighted with the cospi_8_64 / cospi_24_64
// pair, then butterflied within each group of four.
443 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
444 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
445 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
446 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
451 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
452 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
453 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
454 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
460 x4 = fdct_round_shift(s4 + s6);
461 x5 = fdct_round_shift(s5 + s7);
462 x6 = fdct_round_shift(s4 - s6);
463 x7 = fdct_round_shift(s5 - s7);
468 x12 = fdct_round_shift(s12 + s14);
469 x13 = fdct_round_shift(s13 + s15);
470 x14 = fdct_round_shift(s12 - s14);
471 x15 = fdct_round_shift(s13 - s15);
// Stage 4: sum/difference of the pairs (2,3), (6,7), (10,11), (14,15), each
// weighted by +/- cospi_16_64. Note the sign pattern differs between pairs.
474 s2 = (- cospi_16_64) * (x2 + x3);
475 s3 = cospi_16_64 * (x2 - x3);
476 s6 = cospi_16_64 * (x6 + x7);
477 s7 = cospi_16_64 * (- x6 + x7);
478 s10 = cospi_16_64 * (x10 + x11);
479 s11 = cospi_16_64 * (- x10 + x11);
480 s14 = (- cospi_16_64) * (x14 + x15);
481 s15 = cospi_16_64 * (x14 - x15);
483 x2 = fdct_round_shift(s2);
484 x3 = fdct_round_shift(s3);
485 x6 = fdct_round_shift(s6);
486 x7 = fdct_round_shift(s7);
487 x10 = fdct_round_shift(s10);
488 x11 = fdct_round_shift(s11);
489 x14 = fdct_round_shift(s14);
490 x15 = fdct_round_shift(s15);
// Output permutation; outputs 1, 3, 13 and 15 are negated.
492 output[0] = (tran_low_t)x0;
493 output[1] = (tran_low_t)-x8;
494 output[2] = (tran_low_t)x12;
495 output[3] = (tran_low_t)-x4;
496 output[4] = (tran_low_t)x6;
497 output[5] = (tran_low_t)x14;
498 output[6] = (tran_low_t)x10;
499 output[7] = (tran_low_t)x2;
500 output[8] = (tran_low_t)x3;
501 output[9] = (tran_low_t)x11;
502 output[10] = (tran_low_t)x15;
503 output[11] = (tran_low_t)x7;
504 output[12] = (tran_low_t)x5;
505 output[13] = (tran_low_t)-x13;
506 output[14] = (tran_low_t)x9;
507 output[15] = (tran_low_t)-x1;
// FHT_4: pairs of 4-point 1-D kernels, indexed by tx_type (see
// vp10_fht4x4_c, which reads FHT_4[tx_type] and calls its .cols and .rows).
// NOTE(review): presumably each initializer is { cols, rows } - confirm
// against the transform_2d declaration, which is not visible here.
510 static const transform_2d FHT_4[] = {
511 { fdct4, fdct4 }, // DCT_DCT = 0
512 { fadst4, fdct4 }, // ADST_DCT = 1
513 { fdct4, fadst4 }, // DCT_ADST = 2
514 { fadst4, fadst4 } // ADST_ADST = 3
// FHT_8: pairs of 8-point 1-D kernels, indexed by tx_type (see
// vp10_fht8x8_c, which reads FHT_8[tx_type] and calls its .cols and .rows).
// NOTE(review): presumably each initializer is { cols, rows } - confirm
// against the transform_2d declaration, which is not visible here.
517 static const transform_2d FHT_8[] = {
518 { fdct8, fdct8 }, // DCT_DCT = 0
519 { fadst8, fdct8 }, // ADST_DCT = 1
520 { fdct8, fadst8 }, // DCT_ADST = 2
521 { fadst8, fadst8 } // ADST_ADST = 3
// FHT_16: pairs of 16-point 1-D kernels, indexed by tx_type (see
// vp10_fht16x16_c, which reads FHT_16[tx_type] and calls its .cols and .rows).
// NOTE(review): presumably each initializer is { cols, rows } - confirm
// against the transform_2d declaration, which is not visible here.
524 static const transform_2d FHT_16[] = {
525 { fdct16, fdct16 }, // DCT_DCT = 0
526 { fadst16, fdct16 }, // ADST_DCT = 1
527 { fdct16, fadst16 }, // DCT_ADST = 2
528 { fadst16, fadst16 } // ADST_ADST = 3
// vp10_fht4x4_c: 4x4 2-D hybrid transform.
//   input   - 4x4 block of int16_t samples; rows are `stride` elements apart.
//   output  - 16 coefficients, written row-major (index j + i * 4).
//   stride  - element distance between vertically adjacent input samples.
//   tx_type - selects the kernel pair from FHT_4; DCT_DCT is delegated to
//             vpx_fdct4x4_c instead.
// NOTE(review): the else branch, the declarations of i/j and the body of the
// `i == 0 && temp_in[0]` condition are not visible in this listing - confirm
// against the full source.
531 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
532 int stride, int tx_type) {
533 if (tx_type == DCT_DCT) {
534 vpx_fdct4x4_c(input, output, stride);
536 tran_low_t out[4 * 4];
538 tran_low_t temp_in[4], temp_out[4];
539 const transform_2d ht = FHT_4[tx_type];
// Column pass: gather column i (scaled by 16), transform it with ht.cols and
// store the result as column i of the intermediate block.
542 for (i = 0; i < 4; ++i) {
543 for (j = 0; j < 4; ++j)
544 temp_in[j] = input[j * stride + i] * 16;
545 if (i == 0 && temp_in[0])
547 ht.cols(temp_in, temp_out);
548 for (j = 0; j < 4; ++j)
549 out[j * 4 + i] = temp_out[j];
// Row pass: transform row i of the intermediate block with ht.rows, then add
// 1 and shift right by 2 before storing.
553 for (i = 0; i < 4; ++i) {
554 for (j = 0; j < 4; ++j)
555 temp_in[j] = out[j + i * 4];
556 ht.rows(temp_in, temp_out);
557 for (j = 0; j < 4; ++j)
558 output[j + i * 4] = (temp_out[j] + 1) >> 2;
// vp10_fdct8x8_quant_c: 8x8 2-D DCT followed by quantization in one call.
//   input, stride        - source samples; vertically adjacent samples are
//                          `stride` elements apart.
//   coeff_ptr            - receives the 64 transform coefficients.
//   n_coeffs             - number of coefficients zeroed and quantized.
//   round_ptr, quant_ptr,
//   dequant_ptr          - two-entry tables: entry 0 is used for position
//                          rc == 0, entry 1 for every other position.
//   qcoeff_ptr           - receives the quantized coefficients.
//   dqcoeff_ptr          - receives qcoeff * dequant.
//   scan                 - visiting order; scan[i] is the coefficient position.
//   quant_shift_ptr      - explicitly unused (cast to void below).
// NOTE(review): part of the parameter list, the x0..x3 assignments, the
// per-iteration pointer advances, and any end-of-block bookkeeping are not
// visible in this listing - confirm against the full source.
563 void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
564 tran_low_t *coeff_ptr, intptr_t n_coeffs,
566 const int16_t *zbin_ptr, const int16_t *round_ptr,
567 const int16_t *quant_ptr,
568 const int16_t *quant_shift_ptr,
569 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
570 const int16_t *dequant_ptr,
572 const int16_t *scan, const int16_t *iscan) {
576 tran_low_t intermediate[64];
580 tran_low_t *output = intermediate;
581 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
582 tran_high_t t0, t1, t2, t3; // needs32
583 tran_high_t x0, x1, x2, x3; // canbe16
// First pass: an 8-point transform down one column of the input per
// iteration. Samples are pre-scaled by 4; results are written with a stride
// of 8 into `intermediate` (via `output`).
586 for (i = 0; i < 8; i++) {
588 s0 = (input[0 * stride] + input[7 * stride]) * 4;
589 s1 = (input[1 * stride] + input[6 * stride]) * 4;
590 s2 = (input[2 * stride] + input[5 * stride]) * 4;
591 s3 = (input[3 * stride] + input[4 * stride]) * 4;
592 s4 = (input[3 * stride] - input[4 * stride]) * 4;
593 s5 = (input[2 * stride] - input[5 * stride]) * 4;
594 s6 = (input[1 * stride] - input[6 * stride]) * 4;
595 s7 = (input[0 * stride] - input[7 * stride]) * 4;
597 // fdct4(step, step);
602 t0 = (x0 + x1) * cospi_16_64;
603 t1 = (x0 - x1) * cospi_16_64;
604 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
605 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
606 output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
607 output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
608 output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
609 output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
612 t0 = (s6 - s5) * cospi_16_64;
613 t1 = (s6 + s5) * cospi_16_64;
614 t2 = fdct_round_shift(t0);
615 t3 = fdct_round_shift(t1);
624 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
625 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
626 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
627 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
628 output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
629 output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
630 output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
631 output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
// Second pass: fdct8 over each 8-element row of `intermediate`, written
// straight into coeff_ptr, then every coefficient is divided by 2 (C integer
// division, i.e. truncating toward zero).
638 for (i = 0; i < 8; ++i) {
639 fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
640 for (j = 0; j < 8; ++j)
641 coeff_ptr[j + i * 8] /= 2;
644 // TODO(jingning) Decide the need of these arguments after the
645 // quantization process is completed.
647 (void)quant_shift_ptr;
// Clear both output arrays before quantizing.
650 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
651 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
654 // Quantization pass: All coefficients with index >= zero_flag are
655 // skippable. Note: zero_flag can be zero.
656 for (i = 0; i < n_coeffs; i++) {
657 const int rc = scan[i];
658 const int coeff = coeff_ptr[rc];
// coeff_sign is all-ones for a negative coeff and 0 otherwise (assumes a
// 32-bit int and an arithmetic right shift); xor-then-subtract with it gives
// the absolute value, and the same trick restores the sign further down.
659 const int coeff_sign = (coeff >> 31);
660 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
// Add the rounding offset, clamp to the int16_t range, then scale by the
// quantizer multiplier and keep the bits above the low 16.
662 int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
663 tmp = (tmp * quant_ptr[rc != 0]) >> 16;
665 qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
666 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
// vp10_fht8x8_c: 8x8 2-D hybrid transform.
//   input   - 8x8 block of int16_t samples; rows are `stride` elements apart.
//   output  - 64 coefficients, written row-major (index j + i * 8).
//   stride  - element distance between vertically adjacent input samples.
//   tx_type - selects the kernel pair from FHT_8; DCT_DCT is delegated to
//             vpx_fdct8x8_c instead.
// NOTE(review): the else branch and the declarations of out[], i and j are
// not visible in this listing - confirm against the full source.
675 void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
676 int stride, int tx_type) {
677 if (tx_type == DCT_DCT) {
678 vpx_fdct8x8_c(input, output, stride);
682 tran_low_t temp_in[8], temp_out[8];
683 const transform_2d ht = FHT_8[tx_type];
// Column pass: gather column i (scaled by 4), transform it with ht.cols and
// store the result as column i of the intermediate block.
686 for (i = 0; i < 8; ++i) {
687 for (j = 0; j < 8; ++j)
688 temp_in[j] = input[j * stride + i] * 4;
689 ht.cols(temp_in, temp_out);
690 for (j = 0; j < 8; ++j)
691 out[j * 8 + i] = temp_out[j];
// Row pass: transform row i with ht.rows, then halve each result; negative
// values get +1 before the shift.
695 for (i = 0; i < 8; ++i) {
696 for (j = 0; j < 8; ++j)
697 temp_in[j] = out[j + i * 8];
698 ht.rows(temp_in, temp_out);
699 for (j = 0; j < 8; ++j)
700 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
705 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */
// vp10_fwht4x4_c: 4x4 Walsh-Hadamard transform in two passes.
//   input  - 4x4 block of int16_t samples; rows are `stride` elements apart.
//   output - 16 coefficients.
//   stride - element distance between vertically adjacent input samples.
// NOTE(review): the add/shift arithmetic on a1..e1, the second-pass loads and
// the pointer advances between iterations are not visible in this listing -
// confirm against the full source.
707 void vp10_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
709 tran_high_t a1, b1, c1, d1, e1;
710 const int16_t *ip_pass0 = input;
711 const tran_low_t *ip = NULL;
712 tran_low_t *op = output;
// First pass: load four samples that are `stride` apart, and store the
// results four elements apart in the order a1, c1, d1, b1.
714 for (i = 0; i < 4; i++) {
715 a1 = ip_pass0[0 * stride];
716 b1 = ip_pass0[1 * stride];
717 c1 = ip_pass0[2 * stride];
718 d1 = ip_pass0[3 * stride];
727 op[0] = (tran_low_t)a1;
728 op[4] = (tran_low_t)c1;
729 op[8] = (tran_low_t)d1;
730 op[12] = (tran_low_t)b1;
// Second pass: results are stored contiguously in the same a1, c1, d1, b1
// order, each multiplied by UNIT_QUANT_FACTOR.
738 for (i = 0; i < 4; i++) {
751 op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
752 op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
753 op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
754 op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
// vp10_fht16x16_c: 16x16 2-D hybrid transform.
//   input   - 16x16 block of int16_t samples; rows are `stride` elements apart.
//   output  - 256 coefficients, written row-major (index j + i * 16).
//   stride  - element distance between vertically adjacent input samples.
//   tx_type - selects the kernel pair from FHT_16; DCT_DCT is delegated to
//             vpx_fdct16x16_c instead.
// NOTE(review): the else branch and the declarations of out[], i and j are
// not visible in this listing - confirm against the full source.
761 void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
762 int stride, int tx_type) {
763 if (tx_type == DCT_DCT) {
764 vpx_fdct16x16_c(input, output, stride);
768 tran_low_t temp_in[16], temp_out[16];
769 const transform_2d ht = FHT_16[tx_type];
// Column pass: gather column i (scaled by 4) and transform it with ht.cols.
// Unlike the 4x4 and 8x8 versions, the down-scaling happens here, between the
// passes: add 1 (2 for negative values), then shift right by 2.
772 for (i = 0; i < 16; ++i) {
773 for (j = 0; j < 16; ++j)
774 temp_in[j] = input[j * stride + i] * 4;
775 ht.cols(temp_in, temp_out);
776 for (j = 0; j < 16; ++j)
777 out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
// Row pass: transform row i with ht.rows and store the result unscaled.
781 for (i = 0; i < 16; ++i) {
782 for (j = 0; j < 16; ++j)
783 temp_in[j] = out[j + i * 16];
784 ht.rows(temp_in, temp_out);
785 for (j = 0; j < 16; ++j)
786 output[j + i * 16] = temp_out[j];
791 #if CONFIG_VP9_HIGHBITDEPTH
// vp10_highbd_fht4x4_c: high-bitdepth entry point; forwards all four
// arguments unchanged to vp10_fht4x4_c.
792 void vp10_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
793 int stride, int tx_type) {
794 vp10_fht4x4_c(input, output, stride, tx_type);
// vp10_highbd_fht8x8_c: high-bitdepth entry point; forwards all four
// arguments unchanged to vp10_fht8x8_c.
797 void vp10_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
798 int stride, int tx_type) {
799 vp10_fht8x8_c(input, output, stride, tx_type);
// vp10_highbd_fwht4x4_c: high-bitdepth entry point; forwards input, output
// and stride to vp10_fwht4x4_c.
// NOTE(review): the rest of the parameter list is not visible in this
// listing - confirm against the full source.
802 void vp10_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
804 vp10_fwht4x4_c(input, output, stride);
// vp10_highbd_fht16x16_c: high-bitdepth entry point; forwards all four
// arguments unchanged to vp10_fht16x16_c.
807 void vp10_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
808 int stride, int tx_type) {
809 vp10_fht16x16_c(input, output, stride, tx_type);
811 #endif // CONFIG_VP9_HIGHBITDEPTH