extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[256], coeff[256];
- int16_t out_c[256];
+ uint8_t dst[256], src[256];
double out_r[256];
+ for (int j = 0; j < 256; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
- in[j] = rnd.Rand8() - rnd.Rand8();
+ in[j] = src[j] - dst[j];
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]);
- vp9_short_idct16x16_c(coeff, out_c, 32);
+ vp9_short_idct16x16_add_c(coeff, dst, 16);
for (int j = 0; j < 256; ++j) {
- const int diff = out_c[j] - in[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 16x16 IDCT has error " << error
}
}
}
-#if 1
+
// We need to enable the fdct test once we re-do the 16 point fdct.
TEST(VP9Fdct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[256];
int16_t test_temp_block[256];
- int16_t test_output_block[256];
+ uint8_t dst[256], src[256];
+ for (int j = 0; j < 256; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 32;
vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
for (int j = 0; j < 256; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
}
-#endif
-
} // namespace
output[15] = step2[0] - step2[15];
}
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[16], temp_out[16];
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
- int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+ int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
- }
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]); }
}
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
- int16_t out[16 * 16];
- int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
- int i, j;
- int16_t temp_in[16], temp_out[16];
-
- /* First transform rows. Since all non-zero dct coefficients are in
- * upper-left 4x4 area, we only need to calculate first 4 rows here.
- */
- vpx_memset(out, 0, sizeof(out));
- for (i = 0; i < 4; ++i) {
- idct16_1d(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
- idct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
- }
-}
+// Inverse 16x16 transform for the "idct10" case, where only the upper-left
+// 4x4 block of dequantized coefficients is non-zero.  Unlike the old
+// vp9_short_idct10_16x16_c, this adds the reconstructed residual directly
+// into the prediction buffer |dest| (row stride |dest_stride|), clamping
+// each result to the valid pixel range via clip_pixel().
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t out[16 * 16];
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[16], temp_out[16];
+
+ /* First transform rows. Since all non-zero dct coefficients are in
+ * upper-left 4x4 area, we only need to calculate first 4 rows here.
+ */
+ vpx_memset(out, 0, sizeof(out));
+ for (i = 0; i < 4; ++i) {
+ idct16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ idct16_1d(temp_in, temp_out);
+ // Round each column result by 2^6 (the transform's scaling), add the
+ // prediction already in |dest|, and saturate to [0, 255].
+ for (j = 0; j < 16; ++j)
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
+ }
+}
void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_8x8 sse2
-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
specialize vp9_idct4_1d sse2
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+// Reconstruct one 8-pixel row: load 8 bytes from |dest|, zero-extend them to
+// 16 bits, add the residual row |in_x|, saturate back to [0, 255] with
+// packus, store the 8 bytes, and advance |dest| by one row.
+// NOTE(review): expands references to |zero| (a __m128i of zeros) and
+// |stride|, which must be in scope at every expansion site -- confirm when
+// reusing this macro in new functions.
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ in_x = _mm_add_epi16(in_x, d0); \
+ in_x = _mm_packus_epi16(in_x, in_x); \
+ _mm_storel_epi64((__m128i *)(dest), in_x); \
+ dest += stride; \
+ }
+
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-
- output += 8;
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+
+ dest += 8 - (stride * 16);
}
}
}
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
-
// 1-D idct. Load input data.
in0 = _mm_load_si128((__m128i *)input);
in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
- output += 8;
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+
+ dest += 8 - (stride * 16);
}
}
in30 = _mm_srai_epi16(in30, 6);
in31 = _mm_srai_epi16(in31, 6);
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
- dest += stride; \
- }
-
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
add_residual(diff, dest, stride, 8, 8);
}
-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 16, 16);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
if (tx_type == DCT_DCT) {
vp9_idct_add_16x16(input, dest, stride, eob);
} else {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
if (eob > 0) {
- vp9_short_iht16x16(input, output, 16, tx_type);
+ vp9_short_iht16x16_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 512);
- vp9_add_residual_16x16(output, dest, stride);
}
}
}
void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob) {
vp9_add_constant_residual_16x16(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct10_16x16(input, output, 32);
-
+ vp9_short_idct10_16x16_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;
input[32] = input[33] = 0;
input[48] = 0;
-
- vp9_add_residual_16x16(output, dest, stride);
#endif
} else {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct16x16(input, output, 16 << 1);
+ vp9_short_idct16x16_add(input, dest, stride);
vpx_memset(input, 0, 512);
- vp9_add_residual_16x16(output, dest, stride);
}
}
}
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
- int stride) {
- const int width = 16;
- int i = 4;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
- do {
- d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
- d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
- d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
- d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
- d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
- d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
- // Prediction data.
- p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
- p0 = _mm_unpacklo_epi8(p1, zero);
- p1 = _mm_unpackhi_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p3, zero);
- p3 = _mm_unpackhi_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p5, zero);
- p5 = _mm_unpackhi_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p7, zero);
- p7 = _mm_unpackhi_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p1 = _mm_packus_epi16(p2, p3);
- p2 = _mm_packus_epi16(p4, p5);
- p3 = _mm_packus_epi16(p6, p7);
-
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
- diff += 4 * width;
- dest += 4 * stride;
- } while (--i);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
- vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
} else {
- vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw, tx_type);
+ vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride,
+ tx_type);
}
*wip_txfrm_size = 16;
break;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 32)
+ if (wip_txfrm_size < 16)
vp9_recon_sbuv(xd, bsize);
}
// wip version... will use foreach_transformed_block when done
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
- if (wip_txfrm_size < 32)
+ if (wip_txfrm_size < 16)
vp9_recon_sby(xd, bsize);
wip_txfrm_size = 0;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 32)
+ if (wip_txfrm_size < 16)
vp9_recon_sbuv(xd, bsize);
#endif
}