*.a
*.asm.s
*.d
+*.gcno
+*.gcda
*.o
*~
/*.ivf
$(call enabled,LIBVPX_TEST_DATA))
libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+libvpx_test_srcs.txt:
+ @echo " [CREATE] $@"
+ @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+CLEAN-OBJS += libvpx_test_srcs.txt
+
$(LIBVPX_TEST_DATA):
@echo " [DOWNLOAD] $@"
$(qexec)trap 'rm -f $@' INT TERM &&\
altref_count_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
SetMode(GET_PARAM(1));
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if ( video->frame() == 1) {
const libvpx_test::VP8CodecFactory kVP8;
-#define VP8_INSTANTIATE_TEST_CASE(test, params)\
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
INSTANTIATE_TEST_CASE_P(VP8, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP8)), \
- params))
+ __VA_ARGS__))
#else
-#define VP8_INSTANTIATE_TEST_CASE(test, params)
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)
#endif // CONFIG_VP8
const libvpx_test::VP9CodecFactory kVP9;
-#define VP9_INSTANTIATE_TEST_CASE(test, params)\
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
INSTANTIATE_TEST_CASE_P(VP9, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP9)), \
- params))
+ __VA_ARGS__))
#else
-#define VP9_INSTANTIATE_TEST_CASE(test, params)
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)
#endif // CONFIG_VP9
++frame_count_out_;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
unsigned int frame_count_in_;
unsigned int frame_count_out_;
unsigned int frame_count_max_;
--- /dev/null
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// Encoder test fixture parameterized on (codec factory, test mode, cpu-used
+// value). The cpu-used value and the alt-ref/ARNR controls are applied through
+// the encoder control interface when the video source reports frame 1.
+class CpuSpeedTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<
+    libvpx_test::TestMode, int> {
+ protected:
+  CpuSpeedTest() : EncoderTest(GET_PARAM(0)), set_cpu_used_(0) {}
+
+  // Picks up the test mode (param 1) and the cpu-used value (param 2).
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  // Configures the encoder once, when video->frame() == 1.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  // This fixture performs no per-packet checks. The previous body only
+  // tested the key-frame flag and then did nothing with the result.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
+
+  // Value passed to VP8E_SET_CPUUSED; overwritten in SetUp().
+  int set_cpu_used_;
+};
+
+TEST_P(CpuSpeedTest, TestQ0) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  // Two-pass VBR section limits: min and max are separate fields. The
+  // original assigned the min field twice, so the max was never set.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       20);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip runs through the
+  // encode loop without a fatal failure when given a high target bitrate
+  // (12000) together with a low max q (10).
+  cfg_.g_lag_in_frames = 25;
+  // Two-pass VBR section limits: min and max are separate fields. The
+  // original assigned the min field twice, so the max was never set.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 12000;
+  cfg_.rc_max_quantizer = 10;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(CpuSpeedTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q.  This pushes the encoder to producing
+  // lots of small partitions, which exercises the opposite condition from
+  // the low-q tests.
+
+  cfg_.g_lag_in_frames = 25;
+  // Two-pass VBR section limits: min and max are separate fields. The
+  // original assigned the min field twice, so the max was never set.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Instantiate the suite for VP9 in two-pass "good" mode with cpu-used values
+// taken from ::testing::Range(0, 3), i.e. 0, 1 and 2 (the end is exclusive).
+// The codec factory is supplied by VP9_INSTANTIATE_TEST_CASE itself, so no
+// local factory macro or make_tuple helper is needed here.
+VP9_INSTANTIATE_TEST_CASE(
+    CpuSpeedTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Range(0, 3));
+} // namespace
n_frames_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
duration_ = 0.0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
const vpx_rational_t tb = video->timebase();
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
// Hook to determine whether the encode loop should continue.
- virtual bool Continue() const { return !abort_; }
+ virtual bool Continue() const {
+ return !(::testing::Test::HasFatalFailure() || abort_);
+ }
const CodecFactory *codec_;
// Hook to determine whether to decode frame after encoding
mismatch_nframes_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
#include "acm_random.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int /*tx_type*/) {
vp9_short_fdct4x4_c(in, out, stride);
}
-void idct4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
+void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int /*tx_type*/) {
vp9_short_idct4x4_add_c(out, dst, stride >> 1);
}
-void fht4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int tx_type) {
vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
}
-void iht4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
+void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
}
class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
public:
- FwdTrans4x4Test() {SetUpTestTxfm();}
+ FwdTrans4x4Test() { SetUpTestTxfm(); }
~FwdTrans4x4Test() {}
void SetUpTestTxfm() {
- tx_type = GetParam();
- if (tx_type == 0) {
- fwd_txfm = fdct4x4;
- inv_txfm = idct4x4_add;
+ tx_type_ = GetParam();
+ if (tx_type_ == 0) {
+ fwd_txfm_ = fdct4x4;
+ inv_txfm_ = idct4x4_add;
} else {
- fwd_txfm = fht4x4;
- inv_txfm = iht4x4_add;
+ fwd_txfm_ = fht4x4;
+ inv_txfm_ = iht4x4_add;
}
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
- (*fwd_txfm)(in, out, dst, stride, tx_type);
+ (*fwd_txfm_)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
- (*inv_txfm)(in, out, dst, stride, tx_type);
+ (*inv_txfm_)(in, out, dst, stride, tx_type);
}
- int tx_type;
- void (*fwd_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
+ int tx_type_;
+ void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type);
- void (*inv_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
+ void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type);
};
TEST_P(FwdTrans4x4Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int16_t test_input_block[16];
- int16_t test_output_block[16];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
const int pitch = 8;
int count_sign_block[16][2];
const int count_test_block = 1000000;
for (int j = 0; j < 16; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
EXPECT_TRUE(bias_acceptable)
<< "Error: 4x4 FDCT/FHT has a sign bias > 1%"
<< " for input range [-255, 255] at index " << j
- << " tx_type " << tx_type;
+ << " tx_type " << tx_type_;
}
memset(count_sign_block, 0, sizeof(count_sign_block));
for (int j = 0; j < 16; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
double total_error = 0;
const int count_test_block = 1000000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[16];
- int16_t test_temp_block[16];
- uint8_t dst[16], src[16];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
for (int j = 0; j < 16; ++j) {
src[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
const int pitch = 8;
- RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if(test_temp_block[j] > 0) {
}
// inverse transform and reconstruct the pixel block
- RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
const int diff = dst[j] - src[j];
EXPECT_GE(count_test_block, total_error)
<< "Error: FDCT/IDCT or FHT/IHT has average "
- "roundtrip error > 1 per block";
+ << "roundtrip error > 1 per block";
}
INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_ports/mem.h"
extern "C" {
#include "vp9_rtcd.h"
using libvpx_test::ACMRandom;
namespace {
+// Adapter giving the C 8x8 forward DCT the common five-argument transform
+// signature used by the fixture; dst and tx_type are ignored and stride is
+// forwarded unchanged.
+void fdct8x8(int16_t *in,
+             int16_t *out,
+             uint8_t* /*dst*/,
+             int stride,
+             int /*tx_type*/) {
+  vp9_short_fdct8x8_c(in, out, stride);
+}
+// Adapter for the C 8x8 inverse DCT + add: reads coefficients from |out| and
+// hands |dst| to the inverse transform; in and tx_type are ignored.
+void idct8x8_add(int16_t* /*in*/,
+                 int16_t *out,
+                 uint8_t *dst,
+                 int stride,
+                 int /*tx_type*/) {
+  // The inverse transform is given half of the stride passed to this wrapper.
+  const int half_stride = stride >> 1;
+  vp9_short_idct8x8_add_c(out, dst, half_stride);
+}
+// Adapter for the 8x8 forward hybrid transform; dst is ignored. Calls the
+// SSE2 implementation when HAVE_SSE2 is set and the C implementation
+// otherwise, in both cases with half of the stride passed to this wrapper.
+void fht8x8(int16_t *in,
+            int16_t *out,
+            uint8_t* /*dst*/,
+            int stride,
+            int tx_type) {
+  const int half_stride = stride >> 1;
+  // TODO(jingning): need to refactor this to test both _c and _sse2 functions,
+  // when we have all inverse dct functions done sse2.
+#if HAVE_SSE2
+  vp9_short_fht8x8_sse2(in, out, half_stride, tx_type);
+#else
+  vp9_short_fht8x8_c(in, out, half_stride, tx_type);
+#endif
+}
+// Adapter for the C 8x8 inverse hybrid transform + add: reads coefficients
+// from |out| and hands |dst| to the inverse transform; in is ignored.
+void iht8x8_add(int16_t* /*in*/,
+                int16_t *out,
+                uint8_t *dst,
+                int stride,
+                int tx_type) {
+  // The inverse transform is given half of the stride passed to this wrapper.
+  const int half_stride = stride >> 1;
+  vp9_short_iht8x8_add_c(out, dst, half_stride, tx_type);
+}
+
+// Value-parameterized fixture for the 8x8 forward/inverse transforms. The
+// int parameter is the transform type: 0 selects the DCT adapters
+// (fdct8x8 / idct8x8_add), any other value selects the hybrid-transform
+// adapters (fht8x8 / iht8x8_add).
+class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
+ public:
+  FwdTrans8x8Test() { SetUpTestTxfm(); }
+  ~FwdTrans8x8Test() {}
+
+  // Reads the test parameter and binds the matching forward/inverse pair.
+  void SetUpTestTxfm() {
+    tx_type_ = GetParam();
+    if (tx_type_ == 0) {
+      fwd_txfm_ = fdct8x8;
+      inv_txfm_ = idct8x8_add;
+    } else {
+      fwd_txfm_ = fht8x8;
+      inv_txfm_ = iht8x8_add;
+    }
+  }
+
+ protected:
+  // Runs the forward transform selected by SetUpTestTxfm().
+  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*fwd_txfm_)(in, out, dst, stride, tx_type);
+  }
+  // Runs the inverse transform selected by SetUpTestTxfm().
+  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*inv_txfm_)(in, out, dst, stride, tx_type);
+  }
+
+  // Member names carry a trailing underscore, matching FwdTrans4x4Test.
+  int tx_type_;
+  void (*fwd_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*inv_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
+};
-TEST(VP9Fdct8x8Test, SignBiasCheck) {
+TEST_P(FwdTrans8x8Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int16_t test_input_block[64];
- int16_t test_output_block[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
const int pitch = 16;
int count_sign_block[64][2];
const int count_test_block = 100000;
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
- vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 1125;
EXPECT_LT(diff, max_diff)
- << "Error: 8x8 FDCT has a sign bias > "
+ << "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-255, 255] at index " << j
<< " count0: " << count_sign_block[j][0]
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
- vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 10000;
EXPECT_LT(diff, max_diff)
- << "Error: 4x4 FDCT has a sign bias > "
+ << "Error: 4x4 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
<< " count0: " << count_sign_block[j][0]
<< " count1: " << count_sign_block[j][1]
<< " diff: " << diff;
}
-};
+}
-TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
+TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[64];
- int16_t test_temp_block[64];
- uint8_t dst[64], src[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j){
if(test_temp_block[j] > 0) {
test_temp_block[j] += 2;
test_temp_block[j] *= 4;
}
}
- vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
}
EXPECT_GE(1, max_error)
- << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";
+ << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
- << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";
-};
+ << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
+ "error > 1/5 per block";
+}
-TEST(VP9Fdct8x8Test, ExtremalCheck) {
+TEST_P(FwdTrans8x8Test, ExtremalCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[64];
- int16_t test_temp_block[64];
- uint8_t dst[64], src[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8() % 2 ? 255 : 0;
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
}
EXPECT_GE(1, max_error)
- << "Error: Extremal 8x8 FDCT/IDCT has an"
+ << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
<< " individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
- << "Error: Extremal 8x8 FDCT/IDCT has average"
+ << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
<< " roundtrip error > 1/5 per block";
}
-};
+}
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
} // namespace
if (input_file_)
fclose(input_file_);
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
if (start_) {
fseek(input_file_, raw_sz_ * start_, SEEK_SET);
}
virtual void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
// Read a frame from input_file.
if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
limit_ = frame_;
unsigned int frame_;
unsigned int width_;
unsigned int height_;
- unsigned int framerate_numerator_;
- unsigned int framerate_denominator_;
+ int framerate_numerator_;
+ int framerate_denominator_;
};
} // namespace libvpx_test
class IntraPredBase {
public:
+ virtual ~IntraPredBase() {}
+
virtual void TearDown() {
libvpx_test::ClearSystemState();
}
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
// Read file header
}
void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
uint8_t frame_hdr[kIvfFrameHdrSize];
// Check frame header and read a frame from input_file.
if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
set_cpu_used_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (kf_do_force_kf_)
SetMode(GET_PARAM(1));
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
#endif
#if CONFIG_VP9_ENCODER
const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
+const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
+const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
#endif
#endif
#if CONFIG_VP9_ENCODER
make_tuple(64, 64, sad_64x64_sse2_vp9),
+ make_tuple(64, 32, sad_64x32_sse2_vp9),
+ make_tuple(32, 64, sad_32x64_sse2_vp9),
make_tuple(32, 32, sad_32x32_sse2_vp9),
+ make_tuple(32, 16, sad_32x16_sse2_vp9),
+ make_tuple(16, 32, sad_16x32_sse2_vp9),
make_tuple(16, 16, sad_16x16_sse2_vp9),
- make_tuple(8, 16, sad_8x16_sse2_vp9),
make_tuple(16, 8, sad_16x8_sse2_vp9),
+ make_tuple(8, 16, sad_8x16_sse2_vp9),
make_tuple(8, 8, sad_8x8_sse2_vp9),
make_tuple(8, 4, sad_8x4_sse2_vp9),
#endif
delete[] modified_buf_;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
virtual void DecompressedFrameHook(const vpx_image_t& img,
const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
char expected_md5[33];
char junk[128];
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
+ libvpx_test::ClearSystemState();
}
protected:
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
}
void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
if (chunk_ >= chunks_) {
unsigned int track;
--- /dev/null
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_loop_filter_horizontal_edge_neon|
+ EXPORT |vp9_loop_filter_vertical_edge_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_loop_filter_horizontal_edge_neon| PROC
+ push {r4-r6, lr}
+
+ ldr r12, [sp,#20] ; load count (16 bytes were just pushed)
+ ldrb r4, [r2] ; load *blimit
+ ldrb r5, [r3] ; load *limit
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ ldr r3, [sp, #16] ; load thresh
+ vdup.u8 d0, r4 ; duplicate blimit
+ ldrb r6, [r3] ; load *thresh
+ vdup.u8 d1, r5 ; duplicate limit
+ vdup.u8 d2, r6 ; duplicate thresh
+
+ ; Double the pitch once, before the loop. Doing this inside the loop
+ ; doubled it again on every iteration, which produced wrong addresses
+ ; whenever count > 1.
+ add r1, r1, r1 ; r1 = 2 * pitch
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; r2 = s - 4 * pitch (p3 row)
+ add r6, r2, r1, asr #1 ; r6 = s - 3 * pitch (p2 row)
+
+ ; r2 and r6 walk alternate rows, each stepping by 2 * pitch
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r6@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r6@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r6@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r6@64] ; q3
+
+ sub r2, r2, r1, lsl #1 ; back to s - 2 * pitch (op1 row)
+ sub r6, r6, r1, lsl #1 ; back to s - 1 * pitch (op0 row)
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r6@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r6@64], r1 ; store oq1
+
+ add r0, r0, #8 ; advance to the next 8 columns
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {r4-r6, pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_loop_filter_vertical_edge_neon| PROC
+ push {r4-r6, lr}
+
+ ldr r12, [sp,#20] ; load count (16 bytes were just pushed)
+ ldrb r4, [r2] ; load *blimit
+ ldrb r5, [r3] ; load *limit
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ ldr r3, [sp, #16] ; load thresh
+ vdup.u8 d0, r4 ; duplicate blimit
+ ldrb r6, [r3] ; load *thresh
+ vdup.u8 d1, r5 ; duplicate limit
+ vdup.u8 d2, r6 ; duplicate thresh
+
+count_lf_v_loop
+ sub r6, r0, #4 ; r6 = s - 4: start 4 columns before the edge
+
+ vld1.u8 {d3}, [r6], r1 ; load 8 rows of 8 bytes
+ vld1.u8 {d4}, [r6], r1
+ vld1.u8 {d5}, [r6], r1
+ vld1.u8 {d6}, [r6], r1
+ vld1.u8 {d7}, [r6], r1
+ vld1.u8 {d16}, [r6], r1
+ vld1.u8 {d17}, [r6], r1
+ vld1.u8 {d18}, [r6]
+
+ ;transpose the 8x8 block so each d register holds one column
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ ; Write back through r6 so r0 keeps pointing at s. Using r0 as the
+ ; post-incrementing store pointer left it at s - 2 + 7 * pitch, and the
+ ; "s += pitch * 8" below then overshot whenever count > 1.
+ sub r6, r0, #2 ; r6 = s - 2: first of the 4 stored columns
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r6], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r6], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r6], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r6], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r6], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r6], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r6], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r6]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {r4-r6, pc}
+ ENDP ; |vp9_loop_filter_vertical_edge_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15. It overwrites d3-d7 and d17-d28 (q12 is d24/d25).
+; On return: d4 = op1, d5 = op0, d6 = oq0, d7 = oq1.
+; r0-r3 PRESERVE (only NEON registers are written)
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; abs(q2 - q1), overwrites p3 (no longer needed)
+ vabd.u8 d4, d18, d17 ; abs(q3 - q2), overwrites p2 (no longer needed)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20
+ vmax.u8 d20, d21, d22
+ vmax.u8 d3, d3, d4
+ vmax.u8 d23, d19, d20
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; d23 = largest of the six neighbour differences
+
+ vmov.u8 d18, #0x80 ; sign-bit constant for the u8 <-> s8 conversions below
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ ; (largest neighbour difference <= limit) * -1
+ vcge.u8 d23, d1, d23
+
+ ; filter() function
+ ; convert to signed
+ veor d7, d7, d18 ; qs0
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; d17 = b + a = abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: filter = (filter1 + 1) >> 1 (rounding shift)
+ vrshr.s8 d27, d27, #1
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80 (returned in d5)
+ veor d6, d26, d18 ; *oq0 = u^0x80 (returned in d6)
+ veor d4, d21, d18 ; *op1 = u^0x80 (returned in d4)
+ veor d7, d20, d18 ; *oq1 = u^0x80 (returned in d7)
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+ END
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
+
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_entropymv.h"
vpx_free(oci->above_context[0]);
for (i = 0; i < MAX_MB_PLANE; i++)
oci->above_context[i] = 0;
- oci->mip = 0;
- oci->prev_mip = 0;
- oci->above_seg_context = 0;
+ oci->mip = NULL;
+ oci->prev_mip = NULL;
+ oci->above_seg_context = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
- cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE;
+ cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
}
static void setup_mi(VP9_COMMON *cm) {
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
int i, mi_cols;
- // Our internal buffers are always multiples of 16
- const int aligned_width = multiple8(width);
- const int aligned_height = multiple8(height);
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
const int ss_x = oci->subsampling_x;
const int ss_y = oci->subsampling_y;
+ int mi_size;
vp9_free_frame_buffers(oci);
set_mb_mi(oci, aligned_width, aligned_height);
// Allocation
- oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE),
- sizeof(MODE_INFO));
+ mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
+
+ oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->mip)
goto fail;
- oci->prev_mip = vpx_calloc(oci->mode_info_stride *
- (oci->mi_rows + 64 / MI_SIZE),
- sizeof(MODE_INFO));
+ oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->prev_mip)
goto fail;
// FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
// information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(oci);
+ mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
// block where mi unit size is 8x8.
}
void vp9_update_frame_size(VP9_COMMON *cm) {
- const int aligned_width = multiple8(cm->width);
- const int aligned_height = multiple8(cm->height);
+ const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
+ const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
#define WHT_UPSCALE_FACTOR 2
-#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
-
-#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
- (c)->fc.tx_probs_8x8p : \
- (b) < BLOCK_SIZE_SB32X32 ? \
- (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
-
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
union b_mode_info {
- struct {
- MB_PREDICTION_MODE first;
- } as_mode;
+ MB_PREDICTION_MODE as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
};
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
- int mb_mode_context[MAX_REF_FRAMES];
+ uint8_t mb_mode_context[MAX_REF_FRAMES];
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char segment_id; // Segment id for current frame
union b_mode_info bmi[4];
} MODE_INFO;
+enum mv_precision {
+ MV_PRECISION_Q3,
+ MV_PRECISION_Q4
+};
+
#define VP9_REF_SCALE_SHIFT 14
struct scale_factors {
int x_scale_fp; // horizontal fixed point scale factor
int (*scale_value_x)(int val, const struct scale_factors *scale);
int (*scale_value_y)(int val, const struct scale_factors *scale);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
- int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv,
- const struct scale_factors *scale);
- int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4);
+ MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
+ MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
/* 0 = Intra, Last, GF, ARF */
signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
+
/* 0 = ZERO_MV, MV */
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
/* 0 = ZERO_MV, MV */
+// Stores the partitioning outcome of a coded block in the above/left
+// partition context (above_seg_context / left_seg_context) of |xd|.
+// sb_type is the size of the coded block and sb_size the size of the
+// partition level it belongs to; sb_type must not be wider or taller than
+// sb_size (asserted below).
static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type,
BLOCK_SIZE_TYPE sb_size) {
- int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
- int bwl = b_width_log2(sb_type);
- int bhl = b_height_log2(sb_type);
- int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
- int i;
+ const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+ const int bwl = b_width_log2(sb_type);
+ const int bhl = b_height_log2(sb_type);
+ const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+ // Candidate context bytes: pcvalue[0] is written for a dimension in which
+ // the block is smaller than sb_size, pcvalue[1] for one in which it matches.
+ const char pcval0 = ~(0xe << boffset);
+ const char pcval1 = ~(0xf << boffset);
+ const char pcvalue[2] = {pcval0, pcval1};
+
+ assert(MAX(bwl, bhl) <= bsl);
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
// bits of smaller block sizes to be zero.
- if ((bwl == bsl) && (bhl == bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xf << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xf << boffset);
- } else if ((bwl == bsl) && (bhl < bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xe << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xf << boffset);
- } else if ((bwl < bsl) && (bhl == bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xf << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xe << boffset);
- } else if ((bwl < bsl) && (bhl < bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xe << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xe << boffset);
- } else {
- assert(0);
- }
+ // The width comparison selects the value for the above context and the
+ // height comparison the value for the left context; each memset covers
+ // bs bytes.
+ vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
+ vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
}
static INLINE int partition_plane_context(MACROBLOCKD *xd,
return subsize;
}
-// transform mapping
-static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
- switch (bmode) {
- case TM_PRED :
- case D135_PRED :
- return ADST_ADST;
+extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
- case V_PRED :
- case D117_PRED :
- case D63_PRED:
- return ADST_DCT;
+static INLINE TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
+ MODE_INFO *const mi = xd->mode_info_context;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
- case H_PRED :
- case D153_PRED :
- case D27_PRED :
- return DCT_ADST;
+ if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
+ return DCT_DCT;
- default:
- return DCT_DCT;
- }
+ return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+ mi->bmi[ib].as_mode : mbmi->mode];
}
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type;
- MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
- if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
- return DCT_DCT;
- if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
- tx_type = txfm_map(mi->bmi[ib].as_mode.first);
- } else {
- assert(mbmi->mode <= TM_PRED);
- tx_type = txfm_map(mbmi->mode);
- }
- return tx_type;
+// Returns the transform type that mode2txfm_map assigns to the prediction
+// mode stored in xd->mode_info_context->mbmi.
+// NOTE(review): unlike get_tx_type_4x4() this does not special-case
+// xd->lossless or non-intra blocks; presumably mode2txfm_map yields DCT_DCT
+// for inter modes -- confirm against the table definition.
+static INLINE TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd) {
+ return mode2txfm_map[xd->mode_info_context->mbmi.mode];
}
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
- tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
- }
- return tx_type;
+// Returns the transform type that mode2txfm_map assigns to the prediction
+// mode stored in xd->mode_info_context->mbmi.
+// NOTE(review): unlike get_tx_type_4x4() this does not special-case
+// xd->lossless or non-intra blocks; presumably mode2txfm_map yields DCT_DCT
+// for inter modes -- confirm against the table definition.
+static INLINE TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd) {
+ return mode2txfm_map[xd->mode_info_context->mbmi.mode];
}
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
- tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
+// Initializes the plane type and subsampling shifts of every plane of |xd|.
+// Plane 0 is tagged PLANE_TYPE_Y_WITH_DC and is never subsampled; all other
+// planes are tagged PLANE_TYPE_UV and take ss_x / ss_y as their horizontal /
+// vertical subsampling.
+static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+ xd->plane[i].subsampling_x = i ? ss_x : 0;
+ xd->plane[i].subsampling_y = i ? ss_y : 0;
}
- return tx_type;
+#if CONFIG_ALPHA
+ // TODO(jkoleszar): Using the Y w/h for now
+ // Plane 3 overrides the chroma subsampling set by the loop above. The
+ // descriptor in scope here is |xd|; there is no |mb| in this function.
+ xd->plane[3].subsampling_x = 0;
+ xd->plane[3].subsampling_y = 0;
+#endif
}
-void vp9_setup_block_dptrs(MACROBLOCKD *xd,
- int subsampling_x, int subsampling_y);
static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
const TX_SIZE size = mbmi->txfm_size;
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+#define ROUND_POWER_OF_TWO(value, n) \
+ (((value) + (1 << ((n) - 1))) >> (n))
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
- return (value + (1 << (n - 1))) >> n;
-}*/
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
// Only need this for fixed-size arrays, for structs just assign.
#define vp9_copy(dest, src) { \
return value < low ? low : (value > high ? high : value);
}
-static INLINE int multiple8(int value) {
- return (value + 7) & ~7;
+// Returns the number of bits needed to represent every value in
+// [0, num_values - 1], i.e. the bit length of (num_values - 1).
+// Returns 0 when num_values is 0 or 1.
+static int get_unsigned_bits(unsigned int num_values) {
+ int cat = 0;
+ if (num_values <= 1)
+ return 0;
+ num_values--;
+ // Count the right shifts needed to clear the largest representable value.
+ while (num_values > 0) {
+ cat++;
+ num_values >>= 1;
+ }
+ return cat;
}
+// CHECK_MEM_ERROR(cm, lval, expr): evaluates |expr|, stores the result in
+// |lval| and, when the result is null, reports VPX_CODEC_MEM_ERROR through
+// vpx_internal_error() on the error member of |cm|. With CONFIG_DEBUG the
+// message also carries the file and line of the failing call.
+// |cm| and |lval| are parenthesized in the expansion so that arguments which
+// are not plain identifiers (for example an address-of expression) expand
+// with the intended precedence.
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+ (lval) = (expr); \
+ if (!(lval)) \
+ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__, __LINE__); \
+ } while (0)
+#else
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+ (lval) = (expr); \
+ if (!(lval)) \
+ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate "#lval); \
+ } while (0)
+#endif
+
#define SYNC_CODE_0 0x49
#define SYNC_CODE_1 0x83
#define SYNC_CODE_2 0x42
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
0, 4, 1, 5,
8, 2, 12, 9,
3, 6, 13, 10,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
0, 4, 8, 1,
12, 5, 9, 2,
13, 6, 10, 3,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
0, 1, 4, 2,
5, 3, 6, 8,
9, 7, 12, 10,
13, 11, 14, 15,
};
-DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24,
10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41,
46, 39, 61, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
0, 8, 16, 1, 24, 9, 32, 17,
2, 40, 25, 10, 33, 18, 48, 3,
26, 41, 11, 56, 19, 34, 4, 49,
31, 61, 39, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
0, 1, 2, 8, 9, 3, 16, 10,
4, 17, 11, 24, 5, 18, 25, 12,
19, 26, 32, 6, 13, 20, 33, 27,
60, 39, 61, 47, 54, 55, 62, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
extend_model_to_full_distribution(model[PIVOT_NODE], full);
}
-void vp9_model_to_full_probs_sb(
- vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
- vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
- int c, p;
- for (c = 0; c < COEF_BANDS; ++c)
- for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
- vp9_model_to_full_probs(model[c][p], full[c][p]);
- }
-}
-
static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
static void init_bit_tree(vp9_tree_index *p, int n) {
#include "vp9/common/vp9_default_coef_probs.h"
-// This function updates and then returns n AC coefficient context
-// This is currently a placeholder function to allow experimentation
-// using various context models based on the energy earlier tokens
-// within the current block.
-//
-// For now it just returns the previously used context.
-#define MAX_NEIGHBORS 2
-int vp9_get_coef_context(const int *scan, const int *neighbors,
- int nb_pad, uint8_t *token_cache, int c, int l) {
- int eob = l;
- assert(nb_pad == MAX_NEIGHBORS);
- if (c == eob) {
- return 0;
- } else {
- int ctx;
- assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
- if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
- ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
- token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
- } else {
- ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
- }
- return ctx;
- }
-};
-
void vp9_default_coef_probs(VP9_COMMON *pc) {
vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
sizeof(pc->fc.coef_probs[TX_4X4]));
// in {top, left, topleft, topright, bottomleft} order
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
-
-static int find_in_scan(const int *scan, int l, int idx) {
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+// Inverse scan tables, one per scan order; they are filled at run time by
+// init_scan_neighbors() with the results of find_in_scan().
+// The alignment given here has to agree with the matching extern declaration
+// in the header (vp9_default_iscan_8x8 is declared there with 64).
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+static int find_in_scan(const int16_t *scan, int l, int idx) {
int n, l2 = l * l;
for (n = 0; n < l2; n++) {
int rc = scan[n];
assert(0);
return -1;
}
-static void init_scan_neighbors(const int *scan, int l, int *neighbors,
- int max_neighbors) {
+// Builds the context-neighbor table and the inverse scan for one l x l scan.
+// scan: scan order, l * l raster indices.
+// iscan: output, l * l entries; iscan[k] = find_in_scan(scan, l, k).
+// l: block width/height in coefficients.
+// neighbors: output, (l * l + 1) * MAX_NEIGHBORS entries. For scan position
+// n the pair at MAX_NEIGHBORS * n holds the raster indices of the
+// coefficients used as context; both slots hold the same index
+// when only one neighbor is used. Position 0 and the extra
+// trailing pair are set to 0.
+static void init_scan_neighbors(const int16_t *scan,
+ int16_t *iscan,
+ int l, int16_t *neighbors) {
int l2 = l * l;
int n, i, j;
- for (n = 0; n < l2; n++) {
+ // dc doesn't use this type of prediction
+ neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+ // Fill the complete inverse scan before deriving neighbors: the assert at
+ // the bottom of the loop below reads iscan[] at raster indices that can be
+ // larger than n, so every entry has to be valid by then.
+ for (n = 0; n < l2; n++)
+ iscan[n] = find_in_scan(scan, l, n);
+ for (n = 1; n < l2; n++) {
int rc = scan[n];
- assert(max_neighbors == MAX_NEIGHBORS);
i = rc / l;
j = rc % l;
if (i > 0 && j > 0) {
// Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
// as a context. If ADST or DCT is used in both directions, we
// use the combination of the two as a context.
- int a = find_in_scan(scan, l, (i - 1) * l + j);
- int b = find_in_scan(scan, l, i * l + j - 1);
+ int a = (i - 1) * l + j;
+ int b = i * l + j - 1;
if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
scan == vp9_col_scan_16x16) {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = -1;
+ // in the col/row scan cases (as well as left/top edge cases), we set
+ // both contexts to the same value, so we can branchlessly do a+b+1>>1
+ // which automatically becomes a if a == b
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = a;
} else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
scan == vp9_row_scan_16x16) {
- neighbors[max_neighbors * n + 0] = b;
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
} else {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = b;
+ neighbors[MAX_NEIGHBORS * n + 0] = a;
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
}
} else if (i > 0) {
- neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);
- neighbors[max_neighbors * n + 1] = -1;
- } else if (j > 0) {
- neighbors[max_neighbors * n + 0] =
- find_in_scan(scan, l, i * l + j - 1);
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
} else {
- assert(n == 0);
- // dc predictor doesn't use previous tokens
- neighbors[max_neighbors * n + 0] = -1;
+ assert(j > 0);
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
}
- assert(neighbors[max_neighbors * n + 0] < n);
+ // the first neighbor must be visited by the scan before position n
+ assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
}
+ // one padding item so we don't have to add branches in code to handle
+ // calls to get_coef_context() for the token after the final dc token
+ neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
}
+// Fills the neighbor table and the inverse-scan table of every scan order:
+// default/row/col for 4x4, 8x8 and 16x16, plus the default 32x32 scan.
void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, 4,
- vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_4x4, 4,
- vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_4x4, 4,
- vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_8x8, 8,
- vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_8x8, 8,
- vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_8x8, 8,
- vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_16x16, 16,
- vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_16x16, 16,
- vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_16x16, 16,
- vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_32x32, 32,
- vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
+ init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+ vp9_default_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+ vp9_row_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+ vp9_col_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+ vp9_default_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+ vp9_row_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+ vp9_col_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+ vp9_default_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+ vp9_row_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+ vp9_col_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+ vp9_default_scan_32x32_neighbors);
}
-const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {
+// Returns the neighbor table that belongs to |scan|. The scan is identified
+// by pointer comparison against the known scan tables. Any pointer that
+// matches none of the 4x4/8x8/16x16 tables is treated as the default 32x32
+// scan; this is only checked by an assert, so with asserts compiled out an
+// unknown pointer still yields the 32x32 neighbor table.
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
if (scan == vp9_default_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_4x4_neighbors;
} else if (scan == vp9_row_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_row_scan_4x4_neighbors;
} else if (scan == vp9_col_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_col_scan_4x4_neighbors;
} else if (scan == vp9_default_scan_8x8) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_8x8_neighbors;
} else if (scan == vp9_row_scan_8x8) {
- *pad = 2;
return vp9_row_scan_8x8_neighbors;
} else if (scan == vp9_col_scan_8x8) {
- *pad = 2;
return vp9_col_scan_8x8_neighbors;
} else if (scan == vp9_default_scan_16x16) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_16x16_neighbors;
} else if (scan == vp9_row_scan_16x16) {
- *pad = 2;
return vp9_row_scan_16x16_neighbors;
} else if (scan == vp9_col_scan_16x16) {
- *pad = 2;
return vp9_col_scan_16x16_neighbors;
- } else if (scan == vp9_default_scan_32x32) {
- *pad = MAX_NEIGHBORS;
- return vp9_default_scan_32x32_neighbors;
} else {
- assert(0);
- return NULL;
+ assert(scan == vp9_default_scan_32x32);
+ return vp9_default_scan_32x32_neighbors;
}
}
#define COEF_COUNT_SAT_AFTER_KEY 24
#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-void vp9_full_to_model_count(unsigned int *model_count,
- unsigned int *full_count) {
- int n;
- model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
- model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
- model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
- for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
- model_count[TWO_TOKEN] += full_count[n];
- model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
-}
-
-void vp9_full_to_model_counts(
- vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
- int i, j, k, l;
- for (i = 0; i < BLOCK_TYPES; ++i)
- for (j = 0; j < REF_TYPES; ++j)
- for (k = 0; k < COEF_BANDS; ++k)
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- if (l >= 3 && k == 0)
- continue;
- vp9_full_to_model_count(model_count[i][j][k][l],
- full_count[i][j][k][l]);
- }
-}
-
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
int count_sat, int update_factor) {
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *);
? (COEF_BANDS-1) : band_translate[coef_index];
}
-extern int vp9_get_coef_context(const int *scan, const int *neighbors,
- int nb_pad, uint8_t *token_cache, int c, int l);
-const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
+#define MAX_NEIGHBORS 2
+// Returns the coefficient context for position |c|: the average, rounded
+// half up, of the two token_cache entries selected by the pair of indices
+// stored for |c| in |neighbors| (MAX_NEIGHBORS entries per position).
+static INLINE int get_coef_context(const int16_t *neighbors,
+ uint8_t *token_cache,
+ int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+}
+
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
// 128 lists of probabilities are stored for the following ONE node probs:
typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS]
[UNCONSTRAINED_NODES][2];
-extern void vp9_full_to_model_count(unsigned int *model_count,
- unsigned int *full_count);
-extern void vp9_full_to_model_counts(
- vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-void vp9_model_to_full_probs_sb(
- vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
- vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]);
-
extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
-static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_4x4;
}
}
-static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
+// Selects the 4x4 inverse-scan table for |tx_type|: the row table for
+// ADST_DCT, the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_4x4;
+ case DCT_ADST:
+ return vp9_col_iscan_4x4;
+ default:
+ return vp9_default_iscan_4x4;
+ }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_8x8;
}
}
-static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
+// Selects the 8x8 inverse-scan table for |tx_type|: the row table for
+// ADST_DCT, the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_8x8;
+ case DCT_ADST:
+ return vp9_col_iscan_8x8;
+ default:
+ return vp9_default_iscan_8x8;
+ }
+}
+
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_16x16;
}
}
+// Selects the 16x16 inverse-scan table for |tx_type|: the row table for
+// ADST_DCT, the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_16x16;
+ case DCT_ADST:
+ return vp9_col_iscan_16x16;
+ default:
+ return vp9_default_iscan_16x16;
+ }
+}
+
enum { VP9_COEF_UPDATE_PROB = 252 };
#endif // VP9_COMMON_VP9_ENTROPY_H_
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_modecont.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vpx_mem/vpx_mem.h"
}
};
+static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+ [VP9_INTER_MODES - 1] = {
+ {2, 173, 34}, // 0 = both zero mv
+ {7, 145, 85}, // 1 = one zero mv + one a predicted mv
+ {7, 166, 63}, // 2 = two predicted mvs
+ {7, 94, 66}, // 3 = one predicted/zero and one new mv
+ {8, 64, 46}, // 4 = two new mvs
+ {17, 81, 31}, // 5 = one intra neighbour + x
+ {25, 29, 30}, // 6 = two intra neighbours
+};
+
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-DC_PRED, 2, /* 0 = DC_NODE */
+// Converts per-transform-size counts into two binary branch counts:
+// branch 0 is {TX_4X4, TX_8X8 + TX_16X16}, branch 1 is {TX_8X8, TX_16X16}.
void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]) {
ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
- ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] +
- tx_count_16x16p[TX_16X16];
+ ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
}
+// Converts per-transform-size counts into the single binary branch count
+// {TX_4X4, TX_8X8}.
void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]) {
- ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
- ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
+ ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
+ ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
}
const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
-void vp9_init_mbmode_probs(VP9_COMMON *x) {
- vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs,
- sizeof(default_if_uv_probs));
- vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs,
- sizeof(default_kf_uv_probs));
- vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs,
- sizeof(default_if_y_probs));
-
- vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
- sizeof(vp9_switchable_interp_prob));
-
- vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
- sizeof(vp9_partition_probs));
-
- vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p,
- sizeof(default_intra_inter_p));
- vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p,
- sizeof(default_comp_inter_p));
- vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p,
- sizeof(default_comp_ref_p));
- vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p,
- sizeof(default_single_ref_p));
- vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p,
- sizeof(vp9_default_tx_probs_32x32p));
- vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p,
- sizeof(vp9_default_tx_probs_16x16p));
- vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p,
- sizeof(vp9_default_tx_probs_8x8p));
- vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs,
- sizeof(vp9_default_mbskip_probs));
+// Loads the default mode-related probability tables into |cm|: intra y/uv
+// modes, key-frame uv modes, switchable interpolation filter, partition,
+// intra/inter, compound/single reference, transform size and skip flags.
+void vp9_init_mbmode_probs(VP9_COMMON *cm) {
+ vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs);
+ vp9_copy(cm->kf_uv_mode_prob, default_kf_uv_probs);
+ vp9_copy(cm->fc.y_mode_prob, default_if_y_probs);
+ vp9_copy(cm->fc.switchable_interp_prob, vp9_switchable_interp_prob);
+ vp9_copy(cm->fc.partition_prob, vp9_partition_probs);
+ vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p);
+ vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p);
+ vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p);
+ vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
+ vp9_copy(cm->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p);
+ vp9_copy(cm->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p);
+ vp9_copy(cm->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p);
+ vp9_copy(cm->fc.mbskip_probs, vp9_default_mbskip_probs);
}
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
vp9_tokens_from_tree(vp9_switchable_interp_encodings,
vp9_switchable_interp_tree);
vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-
vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
vp9_sb_mv_ref_tree, NEARESTMV);
}
+// Zeroes the inter-mode counters of |pc| and loads the default inter-mode
+// probabilities; the copy length is the size of the default table.
void vp9_init_mode_contexts(VP9_COMMON *pc) {
vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts));
- vpx_memcpy(pc->fc.inter_mode_probs,
- vp9_default_inter_mode_probs,
- sizeof(vp9_default_inter_mode_probs));
+ vpx_memcpy(pc->fc.inter_mode_probs, default_inter_mode_probs,
+ sizeof(default_inter_mode_probs));
}
void vp9_accum_mv_refs(VP9_COMMON *pc,
},
};
-MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
- if (mv->row == 0 && mv->col == 0)
- return MV_JOINT_ZERO;
- else if (mv->row == 0 && mv->col != 0)
- return MV_JOINT_HNZVZ;
- else if (mv->row != 0 && mv->col == 0)
- return MV_JOINT_HZVNZ;
- else
- return MV_JOINT_HNZVNZ;
-}
-
#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
}
}
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
int usehp) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
mvctx->joints[j]++;
counts_to_context(&nmv_count->comps[1], usehp);
}
-void vp9_counts_to_nmv_context(
- nmv_context_counts *nmv_count,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]) {
- int i, j, k;
- vp9_counts_process(nmv_count, usehp);
- vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
- branch_ct_joint,
- nmv_count->joints, 0);
- for (i = 0; i < 2; ++i) {
- const uint32_t s0 = nmv_count->comps[i].sign[0];
- const uint32_t s1 = nmv_count->comps[i].sign[1];
-
- prob->comps[i].sign = get_binary_prob(s0, s1);
- branch_ct_sign[i][0] = s0;
- branch_ct_sign[i][1] = s1;
- vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
- vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
- branch_ct_class0[i],
- nmv_count->comps[i].class0, 0);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- const uint32_t b0 = nmv_count->comps[i].bits[j][0];
- const uint32_t b1 = nmv_count->comps[i].bits[j][1];
-
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
- branch_ct_bits[i][j][0] = b0;
- branch_ct_bits[i][j][1] = b1;
- }
- }
- for (i = 0; i < 2; ++i) {
- for (k = 0; k < CLASS0_SIZE; ++k) {
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
- branch_ct_class0_fp[i][k],
- nmv_count->comps[i].class0_fp[k], 0);
- }
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
- branch_ct_fp[i],
- nmv_count->comps[i].fp, 0);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
- const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
- const uint32_t hp0 = nmv_count->comps[i].hp[0];
- const uint32_t hp1 = nmv_count->comps[i].hp[1];
-
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
- branch_ct_class0_hp[i][0] = c0_hp0;
- branch_ct_class0_hp[i][1] = c0_hp1;
-
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
- branch_ct_hp[i][0] = hp0;
- branch_ct_hp[i][1] = hp1;
- }
- }
-}
-
static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
nmv_component comps[2];
} nmv_context;
-MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv);
+static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {  // Classifies mv by which of row/col are zero.
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;  // row == 0: both zero, else only col is nonzero
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;  // row != 0: only row is nonzero, else both are
+ }
+}
+
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
nmv_component_counts comps[2];
} nmv_context_counts;
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
- int usehp);
+void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+ int usehp);
extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
- nmv_context_counts *NMVcount,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]);
+
void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
#endif // VP9_COMMON_VP9_ENTROPYMV_H_
#include "./vpx_config.h"
#define LOG2_MI_SIZE 3
+#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6
-#define MI_SIZE (1 << LOG2_MI_SIZE)
-#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
+#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit
+#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block
+
+#define MI_MASK (MI_BLOCK_SIZE - 1)
typedef enum BLOCK_SIZE_TYPE {
BLOCK_SIZE_AB4X4,
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- return ((cur_mb->bmi + 1 + b)->as_mode.first);
+ return ((cur_mb->bmi + 1 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
}
}
assert(b == 1 || b == 3);
- return (cur_mb->bmi + b - 1)->as_mode.first;
+ return (cur_mb->bmi + b - 1)->as_mode;
}
static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- return ((cur_mb->bmi + 2 + b)->as_mode.first);
+ return ((cur_mb->bmi + 2 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
}
}
- return (cur_mb->bmi + b - 2)->as_mode.first;
+ return (cur_mb->bmi + b - 2)->as_mode;
}
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
#define pair_set_epi16(a, b) \
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
-// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
+// Constants are round(16384 * cos(i*Pi/64)) for i = 1 to 31, generated by:
+// for (int i = 1; i < 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
static const int cospi_1_64 = 16364;
static const int cospi_2_64 = 16305;
+++ /dev/null
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_onyxc_int.h"
-
-#define MAX_REGIONS 24000
-#ifndef NULL
-#define NULL 0
-#endif
-
-#define min_mbs_in_region 3
-
-// this linked list structure holds equivalences for connected
-// component labeling
-struct list_el {
- int label;
- int seg_value;
- int count;
- struct list_el *next;
-};
-typedef struct list_el item;
-
-// connected colorsegments
-typedef struct {
- int min_x;
- int min_y;
- int max_x;
- int max_y;
- int64_t sum_x;
- int64_t sum_y;
- int pixels;
- int seg_value;
- int label;
-} segment_info;
-
-
-typedef enum {
- SEGMENT_MODE,
- SEGMENT_MV,
- SEGMENT_REFFRAME,
- SEGMENT_SKIPPED
-} SEGMENT_TYPE;
-
-
-// this merges the two equivalence lists and
-// then makes sure that every label points to the same
-// equivalence list
-void merge(item *labels, int u, int v) {
- item *a = labels[u].next;
- item *b = labels[v].next;
- item c;
- item *it = &c;
- int count;
-
- // check if they are already merged
- if (u == v || a == b)
- return;
-
- count = a->count + b->count;
-
- // merge 2 sorted linked lists.
- while (a != NULL && b != NULL) {
- if (a->label < b->label) {
- it->next = a;
- a = a->next;
- } else {
- it->next = b;
- b = b->next;
- }
-
- it = it->next;
- }
-
- if (a == NULL)
- it->next = b;
- else
- it->next = a;
-
- it = c.next;
-
- // make sure every equivalence in the linked list points to this new ll
- while (it != NULL) {
- labels[it->label].next = c.next;
- it = it->next;
- }
- c.next->count = count;
-
-}
-
-void segment_via_mode_info(VP9_COMMON *oci, int how) {
- MODE_INFO *mi = oci->mi;
- int i, j;
- int mb_index = 0;
-
- int label = 1;
- int pitch = oci->mb_cols;
-
- // holds linked list equivalences
- // the max should probably be allocated at a higher level in oci
- item equivalences[MAX_REGIONS];
- int eq_ptr = 0;
- item labels[MAX_REGIONS];
- segment_info segments[MAX_REGIONS];
- int label_count = 1;
- int labeling[400 * 300];
- int *lp = labeling;
-
- label_count = 1;
- memset(labels, 0, sizeof(labels));
- memset(segments, 0, sizeof(segments));
-
- /* Go through each macroblock first pass labelling */
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- // int above seg_value, left seg_value, this seg_value...
- int a = -1, l = -1, n = -1;
-
- // above label, left label
- int al = -1, ll = -1;
- if (i) {
- al = lp[j - pitch];
- a = labels[al].next->seg_value;
- }
- if (j) {
- ll = lp[j - 1];
- l = labels[ll].next->seg_value;
- }
-
- // what setting are we going to do the implicit segmentation on
- switch (how) {
- case SEGMENT_MODE:
- n = mi[mb_index].mbmi.mode;
- break;
- case SEGMENT_MV:
- n = mi[mb_index].mbmi.mv[0].as_int;
- if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME)
- n = -9999999;
- break;
- case SEGMENT_REFFRAME:
- n = mi[mb_index].mbmi.ref_frame[0];
- break;
- case SEGMENT_SKIPPED:
- n = mi[mb_index].mbmi.mb_skip_coeff;
- break;
- }
-
- // above and left both have the same seg_value
- if (n == a && n == l) {
- // pick the lowest label
- lp[j] = (al < ll ? al : ll);
- labels[lp[j]].next->count++;
-
- // merge the above and left equivalencies
- merge(labels, al, ll);
- }
- // this matches above seg_value
- else if (n == a) {
- // give it the same label as above
- lp[j] = al;
- labels[al].next->count++;
- }
- // this matches left seg_value
- else if (n == l) {
- // give it the same label as above
- lp[j] = ll;
- labels[ll].next->count++;
- } else {
- // new label doesn't match either
- item *e = &labels[label];
- item *nl = &equivalences[eq_ptr++];
- lp[j] = label;
- nl->label = label;
- nl->next = 0;
- nl->seg_value = n;
- nl->count = 1;
- e->next = nl;
- label++;
- }
- mb_index++;
- }
- mb_index++;
- }
- lp = labeling;
-
- // give new labels to regions
- for (i = 1; i < label; i++)
- if (labels[i].next->count > min_mbs_in_region &&
- labels[labels[i].next->label].label == 0) {
- segment_info *cs = &segments[label_count];
- cs->label = label_count;
- labels[labels[i].next->label].label = label_count++;
- labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
- cs->seg_value = labels[labels[i].next->label].seg_value;
- cs->min_x = oci->mb_cols;
- cs->min_y = oci->mb_rows;
- cs->max_x = 0;
- cs->max_y = 0;
- cs->sum_x = 0;
- cs->sum_y = 0;
- cs->pixels = 0;
- }
-
- lp = labeling;
-
- // this is just to gather stats...
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- const int old_lab = labels[lp[j]].next->label;
- const int lab = labels[old_lab].label;
- segment_info *cs = &segments[lab];
-
- cs->min_x = MIN(cs->min_x, j);
- cs->max_x = MAX(cs->max_x, j);
- cs->min_y = MIN(cs->min_y, i);
- cs->max_y = MAX(cs->max_y, i);
- cs->sum_x += j;
- cs->sum_y += i;
- cs->pixels++;
-
- lp[j] = lab;
- mb_index++;
- }
- mb_index++;
- }
-
- {
- lp = labeling;
- printf("labelling \n");
- mb_index = 0;
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- printf("%4d", lp[j]);
- }
- printf(" ");
- for (j = 0; j < oci->mb_cols; j++, mb_index++) {
- // printf("%3d",mi[mb_index].mbmi.mode );
- printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
- mi[mb_index].mbmi.mv[0].as_mv.col);
- }
- printf("\n");
- ++mb_index;
- }
- printf("\n");
- }
-}
-
const int col_step = 1 << xd->plane[plane].subsampling_x;
struct buf_2d * const dst = &xd->plane[plane].dst;
uint8_t* const dst0 = dst->buf;
- MODE_INFO* const mi0 = xd->mode_info_context;
- unsigned int mask_16x16[64 / MI_SIZE] = {0};
- unsigned int mask_8x8[64 / MI_SIZE] = {0};
- unsigned int mask_4x4[64 / MI_SIZE] = {0};
- unsigned int mask_4x4_int[64 / MI_SIZE] = {0};
- struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE];
+ unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
+ struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
+ MODE_INFO *mi = xd->mode_info_context;
+ int row_step_stride = cm->mode_info_stride * row_step;
- for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
unsigned int mask_8x8_c = 0;
unsigned int mask_4x4_c = 0;
unsigned int border_mask;
// Determine the vertical edges that need filtering
- for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO * const mi = xd->mode_info_context;
+ for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const int skip_this = mi[c].mbmi.mb_skip_coeff
&& mi[c].mbmi.ref_frame[0] != INTRA_FRAME;
// left edge of current unit is block/partition edge -> no skip
mask_4x4_c & border_mask,
mask_4x4_int[r], lfi[r]);
dst->buf += 8 * dst->stride;
- xd->mode_info_context += cm->mode_info_stride * row_step;
+ mi += row_step_stride;
}
// Now do horizontal pass
dst->buf = dst0;
- xd->mode_info_context = mi0;
- for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
mask_4x4[r],
mask_4x4_int_r, mi_row + r == 0, lfi[r]);
dst->buf += 8 * dst->stride;
- xd->mode_info_context += cm->mode_info_stride * row_step;
}
}
-void vp9_loop_filter_frame(VP9_COMMON *cm,
- MACROBLOCKD *xd,
- int frame_filter_level,
- int y_only) {
+void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int frame_filter_level, int y_only) {
int mi_row, mi_col;
// Initialize the loop filter for this frame.
vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) {
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col);
+++ /dev/null
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-
-void vp9_setup_block_dptrs(MACROBLOCKD *mb,
- int subsampling_x, int subsampling_y) {
- int i;
-
- for (i = 0; i < MAX_MB_PLANE; i++) {
- mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
- mb->plane[i].subsampling_x = i ? subsampling_x : 0;
- mb->plane[i].subsampling_y = i ? subsampling_y : 0;
- }
-#if CONFIG_ALPHA
- // TODO(jkoleszar): Using the Y w/h for now
- mb->plane[3].subsampling_x = 0;
- mb->plane[3].subsampling_y = 0;
-#endif
-}
+++ /dev/null
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_modecont.h"
-
-const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
- [VP9_INTER_MODES - 1] = {
- {2, 173, 34}, // 0 = both zero mv
- {7, 145, 85}, // 1 = one zero mv + one a predicted mv
- {7, 166, 63}, // 2 = two predicted mvs
- {7, 94, 66}, // 3 = one predicted/zero and one new mv
- {8, 64, 46}, // 4 = two new mvs
- {17, 81, 31}, // 5 = one intra neighbour + x
- {25, 29, 30}, // 6 = two intra neighbours
-};
+++ /dev/null
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_MODECONT_H_
-#define VP9_COMMON_VP9_MODECONT_H_
-
-#include "vp9/common/vp9_entropy.h"
-
-extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
- [VP9_INTER_MODES - 1];
-
-#endif // VP9_COMMON_VP9_MODECONT_H_
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
-struct mv32 {
+typedef struct {
int32_t row;
int32_t col;
-};
+} MV32;
typedef union int_mv32 {
- uint64_t as_int;
- struct mv32 as_mv;
+ uint64_t as_int;
+ MV32 as_mv;
} int_mv32; /* facilitates faster equality tests and copies */
#endif // VP9_COMMON_VP9_MV_H_
#define MAX_LAG_BUFFERS 25
typedef struct frame_contexts {
+ // y_mode, uv_mode, partition
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
-
- nmv_context nmvc;
- nmv_context pre_nmvc;
- /* interframe intra mode probs */
vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
- /* interframe intra mode probs */
unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES];
unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ // coeff
vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
- nmv_context_counts NMVcount;
+ // switchable_interp
vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
+ [VP9_SWITCHABLE_FILTERS - 1];
unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS];
-
+ // inter_mode
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+ // intra_inter, comp_inter, single_ref, comp_ref
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
vp9_prob single_ref_prob[REF_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
unsigned int comp_ref_count[REF_CONTEXTS][2];
+ // tx_probs
vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+ // mbskip
vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS];
unsigned int mbskip_count[MBSKIP_CONTEXTS][2];
+
+ // mv
+ nmv_context nmvc;
+ nmv_context pre_nmvc;
+ nmv_context_counts NMVcount;
} FRAME_CONTEXT;
typedef enum {
typedef struct VP9Common {
struct vpx_internal_error_info error;
- DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
- DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
#if CONFIG_ALPHA
- DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
int width;
buf[new_idx]++;
}
-static int mi_cols_aligned_to_sb(VP9_COMMON *cm) {
- return 2 * ((cm->mb_cols + 3) & ~3);
+static int mi_cols_aligned_to_sb(int n_mis) {  // Takes a count in mi units instead of reading it from VP9_COMMON.
+ return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE);  // presumably rounds up to a multiple of MI_BLOCK_SIZE; macro defined elsewhere, confirm
}
-static INLINE void set_partition_seg_context(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col) {
xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+ xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
}
static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
for (bx = 0; bx < 16; bx += 4) {
if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
|| (ppflags->display_mb_modes_flag & I4X4_PRED)) {
- Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
- U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
- V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
+ Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V,
0xc000, y_stride);
// TBD prediction functions for various bitstream signals
// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
+unsigned char vp9_get_pred_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
PRED_ID pred_id) {
int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
// This function returns a context probability for coding a given
// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ PRED_ID pred_id) {
const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
switch (pred_id) {
// This function returns a context probability ptr for coding a given
// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *cm, const MACROBLOCKD * xd,
PRED_ID pred_id) {
const MODE_INFO *const mi = xd->mode_info_context;
const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
// This function sets the status of the given prediction signal.
// I.e. is the predicted value for the given signal correct.
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
+void vp9_set_pred_flag(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, PRED_ID pred_id,
unsigned char pred_flag) {
const int mis = xd->mode_info_stride;
- BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
const int bh = 1 << mi_height_log2(bsize);
const int bw = 1 << mi_width_log2(bsize);
#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
switch (pred_id) {
case PRED_SEG_ID:
- for (y = 0; y < y_mis; y++) {
- for (x = 0; x < x_mis; x++) {
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag;
- }
- }
break;
case PRED_MBSKIP:
- for (y = 0; y < y_mis; y++) {
- for (x = 0; x < x_mis; x++) {
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
- }
- }
break;
default:
}
}
-
-// The following contain the guts of the prediction code used to
-// peredict various bitstream signals.
-
-// Macroblock segment id prediction function
-int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
- int mi_row, int mi_col) {
- const int mi_index = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(sb_type);
- const int bh = 1 << mi_height_log2(sb_type);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,  // Returns the smallest id in segment_ids covered by the block.
+ BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;  // segment_ids is indexed with a row stride of cm->mi_cols
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
- int segment_id = INT_MAX;
- int x, y;
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);  // like xmis: block extent clipped to the frame's mi dimensions
+ int x, y, segment_id = INT_MAX;  // INT_MAX seeds the running minimum
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int index = mi_index + (y * cm->mi_cols + x);
- segment_id = MIN(segment_id, cm->last_frame_seg_map[index]);
- }
- }
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ segment_id = MIN(segment_id,
+ segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS);  // also fails if no entry was visited (segment_id still INT_MAX)
return segment_id;
}
PRED_TX_SIZE = 8
} PRED_ID;
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
+unsigned char vp9_get_pred_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
PRED_ID pred_id);
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd,
PRED_ID pred_id);
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
PRED_ID pred_id);
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id);
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *xd, PRED_ID pred_id);
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
+void vp9_set_pred_flag(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, PRED_ID pred_id,
unsigned char pred_flag);
-
-int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
- int mi_row, int mi_col);
+int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
+ BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
return val;
}
-static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
- const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
- const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
-
- result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
- + scale->y_offset_q4;
- result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
- + scale->x_offset_q4;
- return result;
+static MV32 mv_q3_to_q4_with_scaling(const MV *mv,  // Returns (mv << 1) * scale_fp >> VP9_REF_SCALE_SHIFT, plus offset, per component.
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)  // first initializer: row, uses the y scale/offset
+ + scale->y_offset_q4,
+ ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)  // second initializer: col, uses the x scale/offset
+ + scale->x_offset_q4
+ };  // NOTE(review): `<< 1` on a negative signed value is undefined in C; confirm MV field types / consider `* 2`
+ return res;
}
-static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
-
- result.as_mv.row = src_mv->as_mv.row << 1;
- result.as_mv.col = src_mv->as_mv.col << 1;
- return result;
+static MV32 mv_q3_to_q4_without_scaling(const MV *mv,  // Returns mv with each component shifted left by one bit.
+ const struct scale_factors *scale) {  // scale is not read; signature matches the with-scaling variant
+ const MV32 res = {
+ mv->row << 1,  // row
+ mv->col << 1  // col
+ };
+ return res;
}
-static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp,
- int offset_q4) {
- int32_t scaled_mv;
- // returns the scaled and offset value of the mv component.
- scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4;
-
- return scaled_mv;
+static MV32 mv_q4_with_scaling(const MV *mv,  // Returns mv * scale_fp >> VP9_REF_SCALE_SHIFT, plus offset, per component (no << 1).
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,  // row uses the y scale/offset
+ (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4  // col uses the x scale/offset
+ };
+ return res;
}
-static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp,
- int offset_q4) {
- // returns the scaled and offset value of the mv component.
- (void)scale_fp;
- (void)offset_q4;
- return mv_q4;
+static MV32 mv_q4_without_scaling(const MV *mv,  // Copies mv's row and col into an MV32 unchanged.
+ const struct scale_factors *scale) {  // scale is not read; signature matches the with-scaling variant
+ const MV32 res = {
+ mv->row,
+ mv->col
+ };
+ return res;
}
static void set_offsets_with_scaling(struct scale_factors *scale,
scale->scale_value_y = unscaled_value;
scale->set_scaled_offsets = set_offsets_without_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
- scale->scale_mv_component_q4 = mv_component_q4_without_scaling;
+ scale->scale_mv_q4 = mv_q4_without_scaling;
} else {
scale->scale_value_x = scale_value_x_with_scaling;
scale->scale_value_y = scale_value_y_with_scaling;
scale->set_scaled_offsets = set_offsets_with_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
- scale->scale_mv_component_q4 = mv_component_q4_with_scaling;
+ scale->scale_mv_q4 = mv_q4_with_scaling;
}
// TODO(agrange): Investigate the best choice of functions to use here
if (xd->mode_info_context) {
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- set_scale_factors(xd,
- mbmi->ref_frame[0] - 1,
- mbmi->ref_frame[1] - 1,
+ set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
cm->active_ref_scale);
}
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
- const int_mv *mv_q3,
+ const int_mv *src_mv,
const struct scale_factors *scale,
int w, int h, int weight,
- const struct subpix_fn_table *subpix) {
- int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale);
- src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
- scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight](
- src, src_stride, dst, dst_stride,
- subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4,
- subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4,
- w, h);
-}
-
-void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int_mv *mv_q4,
- const struct scale_factors *scale,
- int w, int h, int weight,
- const struct subpix_fn_table *subpix) {
- const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row,
- scale->y_scale_fp,
- scale->y_offset_q4);
- const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col,
- scale->x_scale_fp,
- scale->x_offset_q4);
- const int subpel_x = scaled_mv_col_q4 & 15;
- const int subpel_y = scaled_mv_row_q4 & 15;
-
- src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
+ const struct subpix_fn_table *subpix,
+ enum mv_precision precision) {
+ const MV32 mv = precision == MV_PRECISION_Q4
+ ? scale->scale_mv_q4(&src_mv->as_mv, scale)
+ : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale);
+ const int subpel_x = mv.col & 15;
+ const int subpel_y = mv.row & 15;
+
+ src += (mv.row >> 4) * src_stride + (mv.col >> 4);
scale->predict[!!subpel_x][!!subpel_y][weight](
src, src_stride, dst, dst_stride,
subpix->filter_x[subpel_x], scale->x_step_q4,
xd->mb_to_bottom_edge);
scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
- vp9_build_inter_predictor_q4(pre, pre_stride,
- dst, arg->dst_stride[plane],
- &clamped_mv, &xd->scale_factor[which_mv],
- 4 << pred_w, 4 << pred_h, which_mv,
- &xd->subpix);
+ vp9_build_inter_predictor(pre, pre_stride,
+ dst, arg->dst_stride[plane],
+ &clamped_mv, &xd->scale_factor[which_mv],
+ 4 << pred_w, 4 << pred_h, which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
}
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
}
-/*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
- int mb_row, int mb_col) {
- vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
- BLOCK_SIZE_MB16X16);
-}
-
// TODO(dkovalev: find better place for this function)
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
const int_mv *mv_q3,
const struct scale_factors *scale,
int w, int h, int do_avg,
- const struct subpix_fn_table *subpix);
-
-void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int_mv *mv_q4,
- const struct scale_factors *scale,
- int w, int h, int do_avg,
- const struct subpix_fn_table *subpix);
+ const struct subpix_fn_table *subpix,
+ enum mv_precision precision);
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
}
}
-static void setup_pre_planes(MACROBLOCKD *xd,
- const YV12_BUFFER_CONFIG *src0,
- const YV12_BUFFER_CONFIG *src1,
+static void setup_pre_planes(MACROBLOCKD *xd, int i,
+ const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col,
const struct scale_factors *scale,
const struct scale_factors *scale_uv) {
- const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1};
- int i, j;
-
- for (i = 0; i < 2; ++i) {
- const YV12_BUFFER_CONFIG *src = srcs[i];
- if (src) {
- uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
- for (j = 0; j < MAX_MB_PLANE; ++j) {
- struct macroblockd_plane *pd = &xd->plane[j];
- const struct scale_factors *sf = j ? scale_uv : scale;
- setup_pred_plane(&pd->pre[i],
- buffers[j], strides[j],
- mi_row, mi_col, sf ? &sf[i] : NULL,
- pd->subsampling_x, pd->subsampling_y);
- }
+ if (src) {
+ int j;
+ uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *pd = &xd->plane[j];
+ const struct scale_factors *sf = j ? scale_uv : scale;
+ setup_pred_plane(&pd->pre[i], buffers[j], strides[j],
+ mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y);
}
}
}
-static void set_scale_factors(MACROBLOCKD *xd,
- int ref0, int ref1,
- struct scale_factors scale_factor[MAX_REF_FRAMES]) {
-
- xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0];
- xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0];
- xd->scale_factor_uv[0] = xd->scale_factor[0];
- xd->scale_factor_uv[1] = xd->scale_factor[1];
+static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1,  // Copies sf[ref0]/sf[ref1] into xd's two scale-factor slots.
+ struct scale_factors sf[MAX_REF_FRAMES]) {
+ xd->scale_factor[0] = xd->scale_factor_uv[0] = sf[ref0 >= 0 ? ref0 : 0];  // negative ref falls back to sf[0]
+ xd->scale_factor[1] = xd->scale_factor_uv[1] = sf[ref1 >= 0 ? ref1 : 0];  // scale_factor and scale_factor_uv get the same value
}
void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
#include "vp9/common/vp9_onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+// Table of MB_MODE_COUNT TX_TYPE values, one row per prediction mode; the
+// trailing comment on each row names the mode it stands for. Every inter
+// mode row (NEARESTMV, NEARMV, ZEROMV, NEWMV) is DCT_DCT.
+// NOTE(review): presumably indexed by MB_PREDICTION_MODE value - confirm the
+// enum order matches the row order below.
+const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
+ DCT_DCT, // DC
+ ADST_DCT, // V
+ DCT_ADST, // H
+ DCT_DCT, // D45
+ ADST_ADST, // D135
+ ADST_DCT, // D117
+ DCT_ADST, // D153
+ DCT_ADST, // D27
+ ADST_DCT, // D63
+ ADST_ADST, // TM
+ DCT_DCT, // NEARESTMV
+ DCT_DCT, // NEARMV
+ DCT_DCT, // ZEROMV
+ DCT_DCT // NEWMV
+};
+
+
+static INLINE void d27_predictor(uint8_t *ypred_ptr, int y_stride,
+ int bw, int bh,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
int r, c;
// first column
for (r = 0; r < bh - 1; ++r) {
}
}
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride,
+static INLINE void d63_predictor(uint8_t *ypred_ptr, int y_stride,
int bw, int bh,
uint8_t *yabove_row, uint8_t *yleft_col) {
int r, c;
}
}
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d45_predictor(uint8_t *ypred_ptr, int y_stride,
+ int bw, int bh,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
int r, c;
for (r = 0; r < bh; ++r) {
for (c = 0; c < bw; ++c) {
}
}
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride,
+static INLINE void d117_predictor(uint8_t *ypred_ptr, int y_stride,
int bw, int bh,
uint8_t *yabove_row, uint8_t *yleft_col) {
int r, c;
}
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d135_predictor(uint8_t *ypred_ptr, int y_stride,
+ int bw, int bh,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
int r, c;
ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
yabove_row[-1] * 2 +
}
}
-static void d153_predictor(uint8_t *ypred_ptr,
- int y_stride,
- int bw, int bh,
- uint8_t *yabove_row,
- uint8_t *yleft_col) {
+static INLINE void d153_predictor(uint8_t *ypred_ptr,
+ int y_stride,
+ int bw, int bh,
+ uint8_t *yabove_row,
+ uint8_t *yleft_col) {
int r, c;
ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);
for (r = 1; r < bh; r++)
}
}
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize) {
- const struct macroblockd_plane* const pd = &xd->plane[0];
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride,
- pd->dst.buf, pd->dst.stride,
- xd->mode_info_context->mbmi.mode,
- bw, bh, xd->up_available, xd->left_available,
- 0 /*xd->right_available*/);
-}
-
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize) {
- const int bwl = b_width_log2(bsize), bw = 2 << bwl;
- const int bhl = b_height_log2(bsize), bh = 2 << bhl;
-
- vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
- xd->plane[1].dst.buf, xd->plane[1].dst.stride,
- xd->mode_info_context->mbmi.uv_mode,
- bw, bh, xd->up_available,
- xd->left_available, 0 /*xd->right_available*/);
- vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- xd->mode_info_context->mbmi.uv_mode,
- bw, bh, xd->up_available,
- xd->left_available, 0 /*xd->right_available*/);
-}
-
void vp9_predict_intra_block(MACROBLOCKD *xd,
int block_idx,
int bwl_in,
TX_SIZE tx_size,
int mode,
+ uint8_t *reference, int ref_stride,
uint8_t *predictor, int pre_stride) {
const int bwl = bwl_in - tx_size;
const int wmask = (1 << bwl) - 1;
const int txfm_block_size = 4 << tx_size;
assert(bwl >= 0);
- vp9_build_intra_predictors(predictor, pre_stride,
+ vp9_build_intra_predictors(reference, ref_stride,
predictor, pre_stride,
mode,
txfm_block_size,
have_top, have_left,
have_right);
}
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
- int block_idx,
- BLOCK_SIZE_TYPE bsize,
- int mode,
- uint8_t *predictor, int pre_stride) {
- vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4,
- mode, predictor, pre_stride);
-}
int block_idx,
int bwl_in,
TX_SIZE tx_size,
- int mode,
+ int mode, uint8_t *ref, int ref_stride,
uint8_t *predictor, int pre_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
specialize void vp9_build_intra_predictors
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_build_intra_predictors_sby_s
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_build_intra_predictors_sbuv_s
-
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra4x4_predict;
-
if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_8x8 sse2
+specialize vp9_add_constant_residual_8x8 sse2 neon
prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_16x16 sse2
+specialize vp9_add_constant_residual_16x16 sse2 neon
prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_32x32 sse2
+specialize vp9_add_constant_residual_32x32 sse2 neon
fi
#
specialize vp9_mbloop_filter_vertical_edge sse2
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx
+specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
specialize vp9_mb_lpf_horizontal_edge_w sse2
specialize vp9_mbloop_filter_horizontal_edge sse2
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx
+specialize vp9_loop_filter_horizontal_edge mmx neon
#
# post proc
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2
-# TODO(jingning): need to covert these functions into mmx/sse2 form
prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x4 sse2
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse
+prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x64_avg sse2
+
+prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x64_avg sse2
+
+prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x32_avg sse2
+
+prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x16_avg sse2
+
+prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x32_avg sse2
+
+prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x32_avg sse2
+
+prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x16_avg sse2
+
+prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x8_avg sse2
+
+prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x16_avg sse2
+
+prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x8_avg sse2
+
+prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x4_avg sse2
+
+prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x8_avg sse
+
+prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x4_avg sse
+
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
specialize vp9_block_error sse2
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block sse2
+[ $arch = "x86_64" ] && ssse3_x86_64=ssse3
+
+prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b $ssse3_x86_64
+
+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
+
#
# Structured Similarity (SSIM)
#
# fdct functions
prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht4x4
+specialize vp9_short_fht4x4 sse2
prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht8x8
+specialize vp9_short_fht8x8 sse2
prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
specialize vp9_short_fht16x16
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32_rd
+specialize vp9_short_fdct32x32_rd sse2
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16 sse2
#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
+// Returns mi_cols_aligned_to_sb(n_mis) shifted right by LOG2_MI_BLOCK_SIZE.
+// The parameter type is spelled out: an untyped parameter relies on
+// implicit int, which C99 and later compilers reject or warn about.
+static int to_sbs(int n_mis) {
+  return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE;
+}
+
static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
int *max_tile_off, int tile_idx,
int log2_n_tiles, int n_mis) {
- const int n_sbs = (n_mis + 7) >> 3;
+ const int n_sbs = to_sbs(n_mis);
const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
int *delta_log2_n_tiles) {
- const int sb_cols = (cm->mb_cols + 3) >> 2;
+ const int sb_cols = to_sbs(cm->mi_cols);
int min_log2_n_tiles, max_log2_n_tiles;
for (max_log2_n_tiles = 0;
*/
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
#include <assert.h>
-#endif
+#include "./vpx_config.h"
#include "vp9/common/vp9_treecoder.h"
static void tree2tok(struct vp9_token *const p, vp9_tree t,
--- /dev/null
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_add_constant_residual_8x8_neon|
+ EXPORT |vp9_add_constant_residual_16x16_neon|
+ EXPORT |vp9_add_constant_residual_32x32_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; LD_16x8: load eight 16-byte rows into q8-q15. $src is post-incremented by
+; $stride after every load, so it ends up eight rows past where it started.
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+; ADD_DIFF_16x8: add $diff to each of q8-q15 as unsigned bytes with
+; saturation (vqadd.u8), leaving the results in q8-q15.
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+; SUB_DIFF_16x8: subtract $diff from each of q8-q15 as unsigned bytes with
+; saturation (vqsub.u8), leaving the results in q8-q15.
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+; ST_16x8: store q8-q15 as eight 16-byte rows. $dst is post-incremented by
+; $stride after every store, so it ends up eight rows past where it started.
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10}, [$dst], $stride
+ vst1.8 {q11}, [$dst], $stride
+ vst1.8 {q12}, [$dst], $stride
+ vst1.8 {q13}, [$dst], $stride
+ vst1.8 {q14}, [$dst], $stride
+ vst1.8 {q15}, [$dst], $stride
+ MEND
+
+; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
+; int width, int height) {
+; int r, c;
+;
+; for (r = 0; r < height; r++) {
+; for (c = 0; c < width; c++)
+; dest[c] = clip_pixel(diff + dest[c]);
+;
+; dest += stride;
+; }
+;}
+;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 8, 8);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest (read, then overwritten in place)
+; r2 : int stride
+;
+; All eight 8-byte rows are loaded into d0-d7 first. The magnitude of diff,
+; clamped to 0..255, is then added (diff >= 0) or subtracted (diff < 0) with
+; unsigned saturation, and the rows are written back through r3 because the
+; loads post-increment r1.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_8x8_neon| PROC
+ mov r3, r1 ; r3: save dest to r3
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d1}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ vld1.8 {d3}, [r1], r2
+ vld1.8 {d4}, [r1], r2
+ vld1.8 {d5}, [r1], r2
+ vld1.8 {d6}, [r1], r2
+ vld1.8 {d7}, [r1], r2
+ cmp r0, #0
+ bge DIFF_POSITIVE_8x8
+
+DIFF_NEGATIVE_8x8 ; diff < 0
+ neg r0, r0 ; r0 = -diff, the magnitude to subtract
+ usat r0, #8, r0 ; clamp the magnitude to 0..255
+ vdup.u8 q8, r0 ; copy it into every byte lane of q8
+
+ vqsub.u8 q0, q0, q8 ; q0-q3 overlay d0-d7; results saturate at 0
+ vqsub.u8 q1, q1, q8
+ vqsub.u8 q2, q2, q8
+ vqsub.u8 q3, q3, q8
+ b DIFF_SAVE_8x8
+
+DIFF_POSITIVE_8x8 ; diff >= 0
+ usat r0, #8, r0 ; clamp diff to 0..255
+ vdup.u8 q8, r0 ; copy it into every byte lane of q8
+
+ vqadd.u8 q0, q0, q8 ; q0-q3 overlay d0-d7; results saturate at 255
+ vqadd.u8 q1, q1, q8
+ vqadd.u8 q2, q2, q8
+ vqadd.u8 q3, q3, q8
+
+DIFF_SAVE_8x8
+ vst1.8 {d0}, [r3], r2
+ vst1.8 {d1}, [r3], r2
+ vst1.8 {d2}, [r3], r2
+ vst1.8 {d3}, [r3], r2
+ vst1.8 {d4}, [r3], r2
+ vst1.8 {d5}, [r3], r2
+ vst1.8 {d6}, [r3], r2
+ vst1.8 {d7}, [r3], r2
+
+ bx lr
+ ENDP
+
+;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 16, 16);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest (read, then overwritten in place)
+; r2 : int stride
+;
+; The block is handled as two groups of eight 16-byte rows. The first group
+; is loaded before the sign test; each branch adjusts and stores it, then
+; loads and adjusts the second group, which the shared tail at
+; DIFF_SAVE_16x16 stores. r1 is the read pointer and r3 the write pointer.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_16x16_neon| PROC
+ mov r3, r1
+ LD_16x8 r1, r2
+ cmp r0, #0
+ bge DIFF_POSITIVE_16x16
+
+|DIFF_NEGATIVE_16x16|
+ neg r0, r0 ; r0 = -diff, the magnitude to subtract
+ usat r0, #8, r0 ; clamp the magnitude to 0..255
+ vdup.u8 q0, r0 ; copy it into every byte lane of q0
+
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ b DIFF_SAVE_16x16
+
+|DIFF_POSITIVE_16x16|
+ usat r0, #8, r0 ; clamp diff to 0..255
+ vdup.u8 q0, r0 ; copy it into every byte lane of q0
+
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+
+|DIFF_SAVE_16x16|
+ ST_16x8 r3, r2
+ bx lr
+ ENDP
+
+;void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 32, 32);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest (read, then overwritten in place)
+; r2 : int stride
+;
+; Once the clamped magnitude of diff has been broadcast into q0, r0 is
+; reused as a loop counter starting at 4. Each iteration handles sixteen
+; rows of a 16-byte-wide column (two load/adjust/store groups of eight).
+; After the second iteration the counter equals 2 and both pointers are
+; reset to r4 = dest + 16, so the last two iterations cover the right-hand
+; 16 bytes of every row. r4 is callee-saved, hence the push/pop.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_32x32_neon| PROC
+ push {r4,lr}
+ pld [r1]
+ mov r3, r1
+ add r4, r1, #16 ; r4 dest + 16 for second loop
+ cmp r0, #0
+ bge DIFF_POSITIVE_32x32
+
+|DIFF_NEGATIVE_32x32|
+ neg r0, r0 ; r0 = -diff, the magnitude to subtract
+ usat r0, #8, r0 ; clamp the magnitude to 0..255
+ vdup.u8 q0, r0 ; copy it into every byte lane of q0
+ mov r0, #4 ; r0 now counts the remaining iterations
+
+|DIFF_NEGATIVE_32x32_LOOP|
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ cmp r0, #2 ; left column finished?
+ moveq r1, r4 ; restart reads at dest + 16
+ moveq r3, r4 ; restart writes at dest + 16
+ cmp r0, #0
+ bne DIFF_NEGATIVE_32x32_LOOP
+ pop {r4,pc} ; restore r4 and return
+
+|DIFF_POSITIVE_32x32|
+ usat r0, #8, r0 ; clamp diff to 0..255
+ vdup.u8 q0, r0 ; copy it into every byte lane of q0
+ mov r0, #4 ; r0 now counts the remaining iterations
+
+|DIFF_POSITIVE_32x32_LOOP|
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ cmp r0, #2 ; left column finished?
+ moveq r1, r4 ; restart reads at dest + 16
+ moveq r3, r4 ; restart writes at dest + 16
+ cmp r0, #0
+ bne DIFF_POSITIVE_32x32_LOOP
+ pop {r4,pc} ; restore r4 and return
+ ENDP
+
+ END
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_treereader.h"
-#if CONFIG_DEBUG
-#include <assert.h>
-#endif
-
// #define DEBUG_DEC_MV
#ifdef DEBUG_DEC_MV
int dec_mvcount = 0;
#endif
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
- return treed_read(r, vp9_intra_mode_tree, p);
+ return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
+}
+
+// Decodes one symbol from r with treed_read over vp9_sb_mv_ref_tree using
+// the probabilities in p, and returns it cast to MB_PREDICTION_MODE.
+static MB_PREDICTION_MODE read_inter_mode(vp9_reader *r, const vp9_prob *p) {
+ return (MB_PREDICTION_MODE)treed_read(r, vp9_sb_mv_ref_tree, p);
}
-static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
+static int read_segment_id(vp9_reader *r, MACROBLOCKD *xd) {
return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
}
-static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+static TX_SIZE read_selected_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
return txfm_size;
}
-static TX_SIZE get_txfm_size(VP9D_COMP *pbi, TXFM_MODE txfm_mode,
- BLOCK_SIZE_TYPE bsize, int select_cond,
- vp9_reader *r) {
+static TX_SIZE read_txfm_size(VP9D_COMP *pbi, TXFM_MODE txfm_mode,
+ BLOCK_SIZE_TYPE bsize, int select_cond,
+ vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
if (txfm_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond)
- return select_txfm_size(cm, xd, r, bsize);
+ return read_selected_txfm_size(cm, xd, bsize, r);
else if (txfm_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32)
return TX_32X32;
else if (txfm_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16)
return TX_4X4;
}
-static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
+static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, int segment_id) {
- const int mi_index = mi_row * cm->mi_cols + mi_col;
- const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
- const int bw = 1 << mi_width_log2(sb_type);
- const int bh = 1 << mi_height_log2(sb_type);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int index = mi_index + (y * cm->mi_cols + x);
- cm->last_frame_seg_map[index] = segment_id;
- }
- }
+ assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS);
+
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
-static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
- int mi_row, int mi_col,
- vp9_reader *r) {
+static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+ vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int mis = cm->mode_info_stride;
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- // Read segmentation map if it is being updated explicitly this frame
- m->mbmi.segment_id = 0;
if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
- m->mbmi.segment_id = read_mb_segid(r, xd);
- set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id);
+ const int segment_id = read_segment_id(r, xd);
+ set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+ return segment_id;
+ } else {
+ return 0;
}
+}
- m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id,
- SEG_LVL_SKIP);
- if (!m->mbmi.mb_skip_coeff) {
- m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
- [m->mbmi.mb_skip_coeff]++;
+// Returns the skip flag for the current block. If the SEG_LVL_SKIP feature
+// is active for segment_id, that result is returned and nothing is read
+// from r. Otherwise a symbol is read with the PRED_MBSKIP probability and
+// cm->fc.mbskip_count is incremented for the PRED_MBSKIP context and the
+// value that was read.
+static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ int skip_coeff = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+ if (!skip_coeff) {
+ const uint8_t ctx = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
+ skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+ cm->fc.mbskip_count[ctx][skip_coeff]++;
 }
+ return skip_coeff;
+}
+
+static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
+ int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ MB_MODE_INFO *const mbmi = &m->mbmi;
+ const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const int mis = cm->mode_info_stride;
- m->mbmi.txfm_size = get_txfm_size(pbi, cm->txfm_mode, m->mbmi.sb_type,
- 1, r);
+ mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
+ mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
+ mbmi->txfm_size = read_txfm_size(pbi, cm->txfm_mode, bsize, 1, r);
+ mbmi->ref_frame[0] = INTRA_FRAME;
- // luma mode
- m->mbmi.ref_frame[0] = INTRA_FRAME;
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+ if (bsize >= BLOCK_SIZE_SB8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
const MB_PREDICTION_MODE L = xd->left_available ?
left_block_mode(m, 0) : DC_PRED;
- m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+ mbmi->mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
} else {
+ // Only 4x4, 4x8, 8x4 blocks
+ const int bw = 1 << b_width_log2(bsize);
+ const int bh = 1 << b_height_log2(bsize);
int idx, idy;
- int bw = 1 << b_width_log2(m->mbmi.sb_type);
- int bh = 1 << b_height_log2(m->mbmi.sb_type);
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
- int ib = idy * 2 + idx;
- int k;
+ const int ib = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
left_block_mode(m, ib) : DC_PRED;
- m->bmi[ib].as_mode.first =
- read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
- for (k = 1; k < bh; ++k)
- m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first;
- for (k = 1; k < bw; ++k)
- m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first;
+ const MB_PREDICTION_MODE b_mode = read_intra_mode(r,
+ cm->kf_y_mode_prob[A][L]);
+ m->bmi[ib].as_mode = b_mode;
+ if (bh == 2)
+ m->bmi[ib + 2].as_mode = b_mode;
+ if (bw == 2)
+ m->bmi[ib + 1].as_mode = b_mode;
}
}
- m->mbmi.mode = m->bmi[3].as_mode.first;
+
+ mbmi->mode = m->bmi[3].as_mode;
}
- m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
+ mbmi->uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[mbmi->mode]);
}
static int read_mv_component(vp9_reader *r,
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+ const int class0 = mv_class == MV_CLASS_0;
// Integer part
- if (mv_class == MV_CLASS_0) {
+ if (class0) {
d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
} else {
int i;
// Fractional part
fr = treed_read(r, vp9_mv_fp_tree,
- mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+ class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
// High precision part (if hp is not used, the default value of the hp is 1)
- hp = usehp ? vp9_read(r,
- mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp)
+ hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
: 1;
- // result
+ // Result
mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
return sign ? -mag : mag;
}
-static void update_nmv(vp9_reader *r, vp9_prob *const p,
- const vp9_prob upd_p) {
+// Decodes a motion vector into *mv as *ref plus a coded difference.
+// The joint type read from ctx->joints decides, via mv_joint_vertical() and
+// mv_joint_horizontal(), which of the row and column differences are read
+// from r; a component that is not read stays 0. High precision is used only
+// when the caller passes a non-zero usehp and vp9_use_mv_hp(ref) is also
+// non-zero. The difference is handed to vp9_inc_mv() together with counts
+// before *mv is written.
+static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
+ const nmv_context *ctx,
+ nmv_context_counts *counts, int usehp) {
+ const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+ MV diff = {0, 0};
+
+ usehp = usehp && vp9_use_mv_hp(ref);
+ if (mv_joint_vertical(j))
+ diff.row = read_mv_component(r, &ctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ diff.col = read_mv_component(r, &ctx->comps[1], usehp);
+
+ vp9_inc_mv(&diff, ref, counts, usehp);
+
+ mv->row = ref->row + diff.row;
+ mv->col = ref->col + diff.col;
+}
+
+static void update_mv(vp9_reader *r, vp9_prob *p, vp9_prob upd_p) {
if (vp9_read(r, upd_p)) {
#ifdef LOW_PRECISION_MV_UPDATE
*p = (vp9_read_literal(r, 7) << 1) | 1;
#else
- *p = (vp9_read_literal(r, 8));
+ *p = vp9_read_literal(r, 8);
#endif
}
}
-static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
- int usehp) {
+static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) {
int i, j, k;
#ifdef MV_GROUP_UPDATE
if (!vp9_read_bit(r))
return;
#endif
+
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+ nmv_component *const comp = &mvc->comps[i];
+
+ update_mv(r, &comp->sign, VP9_NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
- update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->classes[j], VP9_NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->class0[j], VP9_NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_nmv(r, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->bits[j], VP9_NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
for (j = 0; j < CLASS0_SIZE; ++j)
for (k = 0; k < 3; ++k)
- update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->class0_fp[j][k], VP9_NMV_UPDATE_PROB);
for (j = 0; j < 3; ++j)
- update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->fp[j], VP9_NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
- update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB);
}
}
}
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int seg_ref_active = vp9_segfeature_active(xd, segment_id,
- SEG_LVL_REF_FRAME);
+ FRAME_CONTEXT *const fc = &cm->fc;
- // Segment reference frame features not available.
- if (!seg_ref_active) {
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
+ ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME);
+ ref_frame[1] = NONE;
+ } else {
+ const int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER);
int is_comp;
- int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER);
if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]);
- cm->fc.comp_inter_count[comp_ctx][is_comp]++;
+ is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
+ fc->comp_inter_count[comp_ctx][is_comp]++;
} else {
is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
}
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
if (is_comp) {
- int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P);
-
- ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
- b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]);
- cm->fc.comp_ref_count[ref_ctx][b]++;
+ const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P);
+ const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
+ fc->comp_ref_count[ref_ctx][b]++;
+ ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
} else {
- int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1);
+ const int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1);
ref_frame[1] = NONE;
- if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) {
- int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2);
- int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]);
- ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME;
- cm->fc.single_ref_count[ref1_ctx][0][1]++;
- cm->fc.single_ref_count[ref2_ctx][1][b2]++;
+ if (vp9_read(r, fc->single_ref_prob[ref1_ctx][0])) {
+ const int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2);
+ const int b = vp9_read(r, fc->single_ref_prob[ref2_ctx][1]);
+ ref_frame[0] = b ? ALTREF_FRAME : GOLDEN_FRAME;
+ fc->single_ref_count[ref1_ctx][0][1]++;
+ fc->single_ref_count[ref2_ctx][1][b]++;
} else {
ref_frame[0] = LAST_FRAME;
- cm->fc.single_ref_count[ref1_ctx][0][0]++;
+ fc->single_ref_count[ref1_ctx][0][0]++;
}
}
- } else {
- ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME);
- ref_frame[1] = NONE;
}
}
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) {
- return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p);
-}
-
#ifdef VPX_MODE_COUNT
unsigned int vp9_mv_cont_count[5][4] = {
{ 0, 0, 0, 0 },
for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j)
for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i)
if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- fc->switchable_interp_prob[j][i] = vp9_read_prob_diff_update(r,
- fc->switchable_interp_prob[j][i]);
+ vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
}
static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
for (j = 0; j < VP9_INTER_MODES - 1; ++j)
if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- fc->inter_mode_probs[i][j] = vp9_read_prob_diff_update(r,
- fc->inter_mode_probs[i][j]);
+ vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
}
static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
COMPPREDMODE_TYPE mode = vp9_read_bit(r);
if (mode)
- mode += vp9_read_bit(r);
+ mode += vp9_read_bit(r);
return mode;
}
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
-
- if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- MACROBLOCKD *const xd = &pbi->mb;
- int i, j;
-
- read_inter_mode_probs(&cm->fc, r);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(&cm->fc, r);
-
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.intra_inter_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
-
- if (cm->allow_comp_inter_inter) {
- cm->comp_pred_mode = read_comp_pred_mode(r);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
- for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.comp_inter_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]);
- } else {
- cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
- }
-
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.single_ref_prob[i][0] =
- vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]);
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.single_ref_prob[i][1] =
- vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]);
- }
-
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++)
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.comp_ref_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]);
-
- // VP9_INTRA_MODES
- for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
- for (i = 0; i < VP9_INTRA_MODES - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.y_mode_prob[j][i] =
- vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]);
- }
- }
- }
- for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) {
- for (i = 0; i < PARTITION_TYPES - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.partition_prob[INTER_FRAME][j][i] =
- vp9_read_prob_diff_update(r,
- cm->fc.partition_prob[INTER_FRAME][j][i]);
- }
- }
- }
-
- read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
- }
-}
-
-// This function either reads the segment id for the current macroblock from
-// the bitstream or if the value is temporally predicted asserts the predicted
-// value
-static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r) {
+static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+ vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *const mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
+ int segment_id;
if (!xd->segmentation_enabled)
return 0; // Default for disabled segmentation
- if (xd->update_mb_segmentation_map) {
- int segment_id;
-
- if (cm->temporal_update) {
- // Temporal coding of the segment id for this mb is enabled.
- // Get the context based probability for reading the
- // prediction status flag
- const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
- const int pred_flag = vp9_read(r, pred_prob);
- vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag);
-
- // If the value is flagged as correctly predicted
- // then use the predicted value, otherwise decode it explicitly
- segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type,
- mi_row, mi_col)
- : read_mb_segid(r, xd);
- } else {
- segment_id = read_mb_segid(r, xd); // Normal unpredicted coding mode
- }
+ if (!xd->update_mb_segmentation_map)
+ return pred_segment_id;
- set_segment_id(cm, mbmi, mi_row, mi_col, segment_id); // Side effect
- return segment_id;
+ if (cm->temporal_update) {
+ const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+ const int pred_flag = vp9_read(r, pred_prob);
+ vp9_set_pred_flag(xd, bsize, PRED_SEG_ID, pred_flag);
+ segment_id = pred_flag ? pred_segment_id
+ : read_segment_id(r, xd);
} else {
- return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col);
+ segment_id = read_segment_id(r, xd);
}
+ set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+ return segment_id;
}
mb_to_bottom_edge);
}
-static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
- const nmv_context *ctx,
- nmv_context_counts *counts,
- int usehp) {
- const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
- MV diff = {0, 0};
-
- usehp = usehp && vp9_use_mv_hp(ref);
- if (mv_joint_vertical(j))
- diff.row = read_mv_component(r, &ctx->comps[0], usehp);
-
- if (mv_joint_horizontal(j))
- diff.col = read_mv_component(r, &ctx->comps[1], usehp);
-
- vp9_increment_nmv(&diff, ref, counts, usehp);
-
- mv->row = diff.row + ref->row;
- mv->col = diff.col + ref->col;
-}
-
static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
VP9D_COMP *pbi, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
}
static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi,
- MB_MODE_INFO *mbmi, vp9_reader *r) {
+ vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
- const int bw = 1 << b_width_log2(bsize);
- const int bh = 1 << b_height_log2(bsize);
+ const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
if (bsize >= BLOCK_SIZE_SB8X8) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- const int bsl = MIN(bwl, bhl);
- mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
- cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
+ const int size_group = MIN(3, MIN(bwl, bhl));
+ mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
+ cm->fc.y_mode_counts[size_group][mbmi->mode]++;
} else {
+ // Only 4x4, 4x8, 8x4 blocks
+ const int bw = 1 << bwl, bh = 1 << bhl;
int idx, idy;
+
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
- int ib = idy * 2 + idx, k;
- int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
- mi->bmi[ib].as_mode.first = m;
- cm->fc.y_mode_counts[0][m]++;
- for (k = 1; k < bh; ++k)
- mi->bmi[ib + k * 2].as_mode.first = m;
- for (k = 1; k < bw; ++k)
- mi->bmi[ib + k].as_mode.first = m;
+ const int ib = idy * 2 + idx;
+ const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+ mi->bmi[ib].as_mode = b_mode;
+ cm->fc.y_mode_counts[0][b_mode]++;
+
+ if (bh == 2)
+ mi->bmi[ib + 2].as_mode = b_mode;
+ if (bw == 2)
+ mi->bmi[ib + 1].as_mode = b_mode;
}
}
- mbmi->mode = mi->bmi[3].as_mode.first;
+ mbmi->mode = mi->bmi[3].as_mode;
}
mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
return ref;
}
-static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
- int mi_row, int mi_col,
- vp9_reader *r) {
+static void read_inter_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+ int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
nmv_context *const nmvc = &cm->fc.nmvc;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
int_mv *const mv0 = &mbmi->mv[0];
int_mv *const mv1 = &mbmi->mv[1];
const int bw = 1 << b_width_log2(bsize);
const int bh = 1 << b_height_log2(bsize);
- int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
- int j, idx, idy;
-
- mbmi->ref_frame[1] = NONE;
+ int idx, idy;
// Make sure the MACROBLOCKD mode info pointer is pointed at the
// correct entry for the current macroblock.
set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize),
mi_col, 1 << mi_width_log2(bsize));
- mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
- mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
- mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
- mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
- // Read the macroblock segment id.
- mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r);
-
- mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
- SEG_LVL_SKIP);
- if (!mbmi->mb_skip_coeff) {
- mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
- [mbmi->mb_skip_coeff]++;
- }
-
+ mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
+ mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r);
- mbmi->txfm_size = get_txfm_size(pbi, cm->txfm_mode, bsize,
- (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME), r);
+ mbmi->ref_frame[1] = NONE;
+ mbmi->txfm_size = read_txfm_size(pbi, cm->txfm_mode, bsize,
+ (!mbmi->mb_skip_coeff || mbmi->ref_frame[0] == INTRA_FRAME), r);
- // If reference frame is an Inter frame
if (mbmi->ref_frame[0] != INTRA_FRAME) {
int_mv nearest, nearby, best_mv;
int_mv nearest_second, nearby_second, best_mv_second;
vp9_prob *mv_ref_p;
+ MV_REFERENCE_FRAME ref0, ref1;
read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame);
+ ref0 = mbmi->ref_frame[0];
+ ref1 = mbmi->ref_frame[1];
- {
#ifdef DEC_DEBUG
- if (dec_debug)
- printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
- xd->mode_info_context->mbmi.mv[0].as_mv.col);
+ if (dec_debug)
+ printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
+ xd->mode_info_context->mbmi.mv[0].as_mv.col);
#endif
- vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
- mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]],
- cm->ref_frame_sign_bias);
-
- mv_ref_p = cm->fc.inter_mode_probs[
- mbmi->mb_mode_context[mbmi->ref_frame[0]]];
-
- // If the segment level skip mode enabled
- if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
- mbmi->mode = ZEROMV;
- } else if (bsize >= BLOCK_SIZE_SB8X8) {
- mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
- vp9_accum_mv_refs(cm, mbmi->mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
- }
+ vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+ ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias);
- if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd,
- mbmi->ref_mvs[mbmi->ref_frame[0]],
- &nearest, &nearby);
+ mv_ref_p = cm->fc.inter_mode_probs[mbmi->mb_mode_context[ref0]];
- best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int;
- }
+ if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+ mbmi->mode = ZEROMV;
+ } else if (bsize >= BLOCK_SIZE_SB8X8) {
+ mbmi->mode = read_inter_mode(r, mv_ref_p);
+ vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref0]);
+ }
+ mbmi->uv_mode = DC_PRED;
+
+ // nearest, nearby
+ if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
+ vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby);
+ best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
+ }
#ifdef DEC_DEBUG
- if (dec_debug)
- printf("[D %d %d] %d %d %d %d\n", ref_frame,
- mbmi->mb_mode_context[ref_frame],
- mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]);
+ if (dec_debug)
+ printf("[D %d %d] %d %d %d %d\n", ref_frame,
+ mbmi->mb_mode_context[ref_frame],
+ mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]);
#endif
- }
mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
? read_switchable_filter_type(pbi, r)
: cm->mcomp_filter_type;
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
+ if (ref1 > INTRA_FRAME) {
vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
- mbmi->ref_frame[1],
- mbmi->ref_mvs[mbmi->ref_frame[1]],
- cm->ref_frame_sign_bias);
+ ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias);
if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd,
- mbmi->ref_mvs[mbmi->ref_frame[1]],
- &nearest_second,
- &nearby_second);
- best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int;
+ vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1],
+ &nearest_second, &nearby_second);
+ best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int;
}
}
- mbmi->uv_mode = DC_PRED;
+
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
int_mv blockmv, secondmv;
- int blockmode;
- int i;
- j = idy * 2 + idx;
+ const int j = idy * 2 + idx;
+ const int blockmode = read_inter_mode(r, mv_ref_p);
- blockmode = read_sb_mv_ref(r, mv_ref_p);
- vp9_accum_mv_refs(cm, blockmode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ vp9_accum_mv_refs(cm, blockmode, mbmi->mb_mode_context[ref0]);
if (blockmode == NEARESTMV || blockmode == NEARMV) {
- MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0);
- if (rf2 > 0) {
+ if (ref1 > 0)
vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second,
&nearby_second, j, 1);
- }
}
switch (blockmode) {
case NEWMV:
- decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
- if (mbmi->ref_frame[1] > 0)
- decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ if (ref1 > 0)
+ read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
#ifdef VPX_MODE_COUNT
vp9_mv_cont_count[mv_contz][3]++;
break;
case NEARESTMV:
blockmv.as_int = nearest.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = nearest_second.as_int;
#ifdef VPX_MODE_COUNT
vp9_mv_cont_count[mv_contz][0]++;
break;
case NEARMV:
blockmv.as_int = nearby.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = nearby_second.as_int;
#ifdef VPX_MODE_COUNT
vp9_mv_cont_count[mv_contz][1]++;
break;
case ZEROMV:
blockmv.as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = 0;
#ifdef VPX_MODE_COUNT
vp9_mv_cont_count[mv_contz][2]++;
#endif
break;
default:
- break;
+ assert(!"Invalid inter mode value");
}
mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
- for (i = 1; i < bh; ++i)
- vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j]));
- for (i = 1; i < bw; ++i)
- vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j]));
+ if (bh == 2)
+ vpx_memcpy(&mi->bmi[j + 2], &mi->bmi[j], sizeof(mi->bmi[j]));
+ if (bw == 2)
+ vpx_memcpy(&mi->bmi[j + 1], &mi->bmi[j], sizeof(mi->bmi[j]));
mi->mbmi.mode = blockmode;
}
}
mv0->as_int = mi->bmi[3].as_mv[0].as_int;
mv1->as_int = mi->bmi[3].as_mv[1].as_int;
} else {
+ const int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ const int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+ const int mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ const int mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
switch (mbmi->mode) {
case NEARMV:
// Clip "next_nearest" so that it does not extend to far out of image
mb_to_right_edge,
mb_to_top_edge,
mb_to_bottom_edge);
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge,
mb_to_right_edge,
mb_to_top_edge,
mb_to_right_edge,
mb_to_top_edge,
mb_to_bottom_edge);
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge,
mb_to_right_edge,
mb_to_top_edge,
case ZEROMV:
mv0->as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
mv1->as_int = 0;
break;
case NEWMV:
- decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
- xd->allow_high_precision_mv);
- if (mbmi->ref_frame[1] > 0)
- decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
+ xd->allow_high_precision_mv);
+ if (ref1 > 0)
+ read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
break;
default:
-#if CONFIG_DEBUG
- assert(0);
-#endif
- break;
+ assert(!"Invalid inter mode value");
}
}
} else {
mv0->as_int = 0; // required for left and above block mv
- read_intra_block_modes(pbi, mi, mbmi, r);
+ read_intra_block_modes(pbi, mi, r);
}
}
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
- VP9_COMMON *cm = &pbi->common;
+void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
int k;
// TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
// vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.mbskip_probs[k] =
- vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
+ vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
+
+ if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
+ nmv_context *const nmvc = &pbi->common.fc.nmvc;
+ MACROBLOCKD *const xd = &pbi->mb;
+ int i, j;
+
+ read_inter_mode_probs(&cm->fc, r);
+
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ read_switchable_interp_probs(&cm->fc, r);
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
+
+ if (cm->allow_comp_inter_inter) {
+ cm->comp_pred_mode = read_comp_pred_mode(r);
+ if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+ } else {
+ cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
+ }
- mb_mode_mv_init(pbi, r);
+ if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+ }
+
+ if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+
+ // VP9_INTRA_MODES
+ for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+ for (i = 0; i < VP9_INTRA_MODES - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
+
+ for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
+
+ read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
+ }
}
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
- MACROBLOCKD* const xd,
- int mi_row,
- int mi_col,
- vp9_reader *r) {
+void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
- if (cm->frame_type == KEY_FRAME || cm->intra_only) {
- kfread_modes(pbi, mi, mi_row, mi_col, r);
- } else {
- read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
- }
+ if (cm->frame_type == KEY_FRAME || cm->intra_only)
+ read_intra_mode_info(pbi, mi, mi_row, mi_col, r);
+ else
+ read_inter_mode_info(pbi, mi, mi_row, mi_col, r);
if (1) {
- const int bw = 1 << mi_width_log2(mbmi->sb_type);
- const int bh = 1 << mi_height_log2(mbmi->sb_type);
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
- const int mis = cm->mode_info_stride;
int x, y;
for (y = 0; y < y_mis; y++)
for (x = !y; x < x_mis; x++)
- mi[y * mis + x] = *mi;
+ mi[y * cm->mode_info_stride + x] = *mi;
}
}
#include "vp9/decoder/vp9_onyxd_int.h"
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
- MACROBLOCKD* const xd,
- int mb_row,
- int mb_col,
- vp9_reader *r);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r);
+void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r);
+
+void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r);
#endif // VP9_DECODER_VP9_DECODEMV_H_
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_modecont.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
return start + len > start && start + len <= end;
}
+static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
+ const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+ return data > max ? max : data;
+}
+
static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
if (lossless) {
pc->txfm_mode = ONLY_4X4;
pc->txfm_mode = vp9_read_literal(r, 2);
if (pc->txfm_mode == ALLOW_32X32)
pc->txfm_mode += vp9_read_bit(r);
+
if (pc->txfm_mode == TX_MODE_SELECT) {
int i, j;
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_8x8p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_16x16p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_32x32p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_32x32p[i][j]);
- }
- }
- }
- }
-}
-
-static int get_unsigned_bits(unsigned int num_values) {
- int cat = 0;
- if (num_values <= 1)
- return 0;
- num_values--;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
-static int inv_recenter_nonneg(int v, int m) {
- if (v > 2 * m)
- return v;
-
- return v % 2 ? m - (v + 1) / 2 : m + v / 2;
-}
+ vp9_diff_update_prob(r, &pc->fc.tx_probs_8x8p[i][j]);
-static int decode_uniform(vp9_reader *r, int n) {
- int v;
- const int l = get_unsigned_bits(n);
- const int m = (1 << l) - n;
- if (!l)
- return 0;
-
- v = vp9_read_literal(r, l - 1);
- return v < m ? v : (v << 1) - m + vp9_read_bit(r);
-}
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &pc->fc.tx_probs_16x16p[i][j]);
-static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
- int i = 0, mk = 0, word;
- while (1) {
- const int b = i ? k + i - 1 : k;
- const int a = 1 << b;
- if (num_syms <= mk + 3 * a) {
- word = decode_uniform(r, num_syms - mk) + mk;
- break;
- } else {
- if (vp9_read_bit(r)) {
- i++;
- mk += a;
- } else {
- word = vp9_read_literal(r, b) + mk;
- break;
- }
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &pc->fc.tx_probs_32x32p[i][j]);
}
}
- return word;
-}
-
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
- const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
- return data > max ? max : data;
-}
-
-static int merge_index(int v, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (v < max1) {
- v = v * modulus + modulus / 2;
- } else {
- int w;
- v -= max1;
- w = v;
- v += (v + modulus - modulus / 2) / modulus;
- while (v % modulus == modulus / 2 ||
- w != v - (v + modulus - modulus / 2) / modulus) v++;
- }
- return v;
-}
-
-static int inv_remap_prob(int v, int m) {
- const int n = 255;
-
- v = merge_index(v, n - 1, MODULUS_PARAM);
- m--;
- if ((m << 1) <= n) {
- return 1 + inv_recenter_nonneg(v + 1, m);
- } else {
- return n - inv_recenter_nonneg(v + 1, n - 1 - m);
- }
-}
-
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) {
- int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
- return (vp9_prob)inv_remap_prob(delp, oldp);
-}
-
-void vp9_init_dequantizer(VP9_COMMON *pc) {
- int q;
-
- for (q = 0; q < QINDEX_RANGE; q++) {
- // DC value
- pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
- pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
-
- // AC values
- pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
- pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
- }
}
static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
vp9_iht_add_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
break;
case TX_32X32:
int ss_txfrm_size, void *arg) {
MACROBLOCKD* const xd = arg;
struct macroblockd_plane *pd = &xd->plane[plane];
+ MODE_INFO *const mi = xd->mode_info_context;
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
block, ss_txfrm_size);
int b_mode;
int plane_b_size;
const int tx_ib = raster_block >> tx_size;
- const int mode = plane == 0 ? xd->mode_info_context->mbmi.mode
- : xd->mode_info_context->mbmi.uv_mode;
+ const int mode = plane == 0 ? mi->mbmi.mode
+ : mi->mbmi.uv_mode;
-
- if (plane == 0 && xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+ if (plane == 0 && mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
assert(bsize == BLOCK_SIZE_SB8X8);
- b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first;
+ b_mode = mi->bmi[raster_block].as_mode;
} else {
b_mode = mode;
}
plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+ dst, pd->dst.stride,
dst, pd->dst.stride);
// Early exit if there are no coefficients
- if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ if (mi->mbmi.mb_skip_coeff)
return;
decode_block(plane, block, bsize, ss_txfrm_size, arg);
}
-static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-
- assert(mbmi->ref_frame[0] != INTRA_FRAME);
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
-
- // prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-
- if (mbmi->mb_skip_coeff) {
- vp9_reset_sb_tokens_context(xd, bsize);
- } else {
- if (xd->segmentation_enabled)
- mb_init_dequantizer(&pbi->common, xd);
-
- if (!vp9_reader_has_error(r))
- vp9_decode_tokens(pbi, r, bsize);
-
- foreach_transformed_block(xd, bsize, decode_block, xd);
- }
-}
+static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+ MACROBLOCKD *const xd = &pbi->mb;
-static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
- if (mbmi->mb_skip_coeff) {
+ if (xd->mode_info_context->mbmi.mb_skip_coeff) {
vp9_reset_sb_tokens_context(xd, bsize);
+ return -1;
} else {
if (xd->segmentation_enabled)
mb_init_dequantizer(&pbi->common, xd);
- if (!vp9_reader_has_error(r))
- vp9_decode_tokens(pbi, r, bsize);
- }
-
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
-}
-
-
-static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize);
- const int bw = 1 << bwl, bh = 1 << bhl;
- int n, eobtotal;
- VP9_COMMON *const pc = &pbi->common;
- MODE_INFO *const mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
- const int mis = pc->mode_info_stride;
-
- assert(mbmi->sb_type == bsize);
- assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
- vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
-
- // generate prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-
- if (mbmi->mb_skip_coeff) {
- vp9_reset_sb_tokens_context(xd, bsize);
- } else {
- // re-initialize macroblock dequantizer before detokenization
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pc, xd);
-
- // dequantization and idct
- eobtotal = vp9_decode_tokens(pbi, r, bsize);
- if (eobtotal == 0) { // skip loopfilter
- for (n = 0; n < bw * bh; n++) {
- const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
- if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows)
- mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1;
- }
- } else {
- foreach_transformed_block(xd, bsize, decode_block, xd);
- }
+ // TODO(dkovalev) if (!vp9_reader_has_error(r))
+ return vp9_decode_tokens(pbi, r, bsize);
}
}
pd->left_context = cm->left_context[i] +
(((mi_row * 2) & 15) >> pd->subsampling_y);
}
- xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
// as they are always compared to values that are in 1/8th pel units
setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
}
-static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
+static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ const int ref = mbmi->ref_frame[i] - 1;
- // Select the appropriate reference frame for this MB
- const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
- xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, xd->scale_factor,
- xd->scale_factor_uv);
+ const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]];
+ xd->scale_factor[i] = cm->active_ref_scale[ref];
+ xd->scale_factor_uv[i] = cm->active_ref_scale[ref];
+ setup_pre_planes(xd, i, cfg, mi_row, mi_col,
+ &xd->scale_factor[i], &xd->scale_factor_uv[i]);
xd->corrupted |= cfg->corrupted;
-
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- // Select the appropriate reference frame for this MB
- const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
- const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
- xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, xd->scale_factor,
- xd->scale_factor_uv);
- xd->corrupted |= second_cfg->corrupted;
- }
}
static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
+ const int less8x8 = bsize < BLOCK_SIZE_SB8X8;  // treated as 8x8 further down
+ MB_MODE_INFO *mbmi;
- if (bsize < BLOCK_SIZE_SB8X8)
+ if (less8x8)
if (xd->ab_index > 0)
return;
+ /* Position xd on this block and read its mode info. The block is then
+ * reconstructed: intra blocks decode their tokens and run
+ * decode_block_intra per transform block; inter blocks set up one or two
+ * references, build the prediction, decode tokens and run decode_block.
+ */
set_offsets(pbi, bsize, mi_row, mi_col);
- vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
+ vp9_read_mode_info(pbi, mi_row, mi_col, r);
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
- decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
+ if (less8x8)
+ bsize = BLOCK_SIZE_SB8X8;
+
+ // Has to be called after set_offsets
+ mbmi = &xd->mode_info_context->mbmi;
+
+ if (mbmi->ref_frame[0] == INTRA_FRAME) {
+ // Intra reconstruction
+ decode_tokens(pbi, bsize, r);
+ foreach_transformed_block(xd, bsize, decode_block_intra, xd);
} else {
- set_refs(pbi, mi_row, mi_col);
- if (bsize < BLOCK_SIZE_SB8X8)
- decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
- else
- decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+ // Inter reconstruction
+ int eobtotal;
+
+ set_ref(pbi, 0, mi_row, mi_col);
+ if (mbmi->ref_frame[1] > INTRA_FRAME)  // second reference is present
+ set_ref(pbi, 1, mi_row, mi_col);
+
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ eobtotal = decode_tokens(pbi, bsize, r);
+ if (less8x8) {
+ if (eobtotal >= 0)
+ foreach_transformed_block(xd, bsize, decode_block, xd);
+ } else {
+ assert(mbmi->sb_type == bsize);
+ if (eobtotal == 0)
+ vp9_set_pred_flag(xd, bsize, PRED_MBSKIP, 1); // skip loopfilter
+ else if (eobtotal > 0)
+ foreach_transformed_block(xd, bsize, decode_block, xd);
+ }
}
xd->corrupted |= vp9_reader_has_error(r);
}
if (bsize >= BLOCK_SIZE_SB8X8) {
int pl;
- int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
- // read the partition information
- xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = pc->above_seg_context + mi_col;
+ const int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
+ set_partition_seg_context(pc, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
if (idx == 0)
vp9_prob *const p = coef_probs[i][j][k][l] + m;
if (vp9_read(r, VP9_COEF_UPDATE_PROB))
- *p = vp9_read_prob_diff_update(r, *p);
+ vp9_diff_update_prob(r, p);
}
}
}
if (xd->mode_ref_lf_delta_update) {
int i;
- for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 6);
- xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
- }
- }
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ if (vp9_rb_read_bit(rb))
+ xd->ref_lf_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
- for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 6);
- xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
- }
- }
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ if (vp9_rb_read_bit(rb))
+ xd->mode_lf_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
}
}
}
static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
const int old = *delta_q;
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 4);
- *delta_q = vp9_rb_read_bit(rb) ? -value : value;
- }
+ if (vp9_rb_read_bit(rb))  // update flag; *delta_q is left untouched when 0
+ *delta_q = vp9_rb_read_signed_literal(rb, 4);  // 4-bit magnitude, then sign
return old != *delta_q;
}
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- if (xd->lossless) {
- xd->itxm_add = vp9_idct_add_lossless_c;
- } else {
- xd->itxm_add = vp9_idct_add;
- }
+
+ xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
+ : vp9_idct_add;
}
static INTERPOLATIONFILTERTYPE read_interp_filter_type(
VP9_COMMON *const pc = &pbi->common;
int mi_row, mi_col;
- for (mi_row = pc->cur_tile_mi_row_start;
- mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
+ for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
- for (mi_col = pc->cur_tile_mi_col_start;
- mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
+ for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE)
decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
}
}
VP9_COMMON *const pc = &pbi->common;
const uint8_t *data_ptr = data + first_partition_size;
- const uint8_t* const data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols);
int tile_row, tile_col;
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
- MAX_MB_PLANE * mi_cols_aligned_to_sb(pc));
+ vpx_memset(pc->above_context[0], 0,
+ sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
- vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(pc));
+ vpx_memset(pc->above_seg_context, 0,
+ sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
const int n_cols = pc->tile_columns;
return vp9_rb_read_literal(rb, 16);
}
+void vp9_init_dequantizer(VP9_COMMON *pc) {
+ int q;
+ /* For every quantizer index below QINDEX_RANGE, fill slot [0] of the y and
+ * uv dequant tables from vp9_dc_quant() and slot [1] from vp9_ac_quant(),
+ * applying the frame's delta_q values. The y AC entry uses a delta of 0.
+ */
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ // DC value
+ pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
+ pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
+
+ // AC values
+ pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+ pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
+ }
+}
+
int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
int i;
vp9_reader header_bc, residual_bc;
read_coef_probs(pbi, &header_bc);
// Initialize xd pointers. Any reference should do for xd->pre, so use 0.
- setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
- 0, 0, NULL, NULL);
+ setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0,
+ NULL, NULL);
setup_dst_planes(xd, new_fb, 0, 0);
// Create the segmentation map structure and set to 0
if (!pc->last_frame_seg_map)
- CHECK_MEM_ERROR(pc->last_frame_seg_map,
+ CHECK_MEM_ERROR(pc, pc->last_frame_seg_map,
vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
- vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
+ setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
// clear out the coeff buffer
for (i = 0; i < MAX_MB_PLANE; ++i)
set_prev_mi(pc);
- vp9_decode_mode_mvs_init(pbi, &header_bc);
+ vp9_prepare_read_mode_info(pbi, &header_bc);
decode_tiles(pbi, data, first_partition_size, &residual_bc);
void vp9_init_dequantizer(struct VP9Common *pc);
int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp);
#endif // VP9_DECODER_VP9_DECODFRAME_H_
TX_SIZE txfm_size, const int16_t *dq,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
ENTROPY_CONTEXT above_ec, left_ec;
- int pt, c = 0, pad, default_eob;
+ int pt, c = 0;
int band;
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
vp9_prob *prob;
vp9_coeff_count_model *coef_counts;
const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
- TX_TYPE tx_type = DCT_DCT;
- const int *scan, *nb;
+ const int16_t *scan, *nb;
uint8_t token_cache[1024];
const uint8_t * band_translate;
#if CONFIG_BALANCED_COEFTREE
switch (txfm_size) {
default:
case TX_4X4: {
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_4x4(xd, block_idx) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_4x4(xd, block_idx) : DCT_DCT;
scan = get_scan_4x4(tx_type);
above_ec = A[0] != 0;
left_ec = L[0] != 0;
- default_eob = 16;
band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block_idx & ((1 << sz) - 1);
- const int y = block_idx - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_8x8(xd) : DCT_DCT;
scan = get_scan_8x8(tx_type);
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
- default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block_idx & ((1 << sz) - 1);
- const int y = block_idx - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_16x16(xd) : DCT_DCT;
scan = get_scan_16x16(tx_type);
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
- default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
scan = vp9_default_scan_32x32;
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
- default_eob = 1024;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
while (1) {
int val;
if (c >= seg_eob)
break;
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
#if !CONFIG_BALANCED_COEFTREE
if (c >= seg_eob)
break;
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/decoder/vp9_dsubexp.h"
+/* Maps code v back to a value around the centre m. Codes above 2 * m are
+ * returned unchanged; otherwise odd codes give m - (v + 1) / 2 (below m) and
+ * even codes give m + v / 2 (m or above).
+ * NOTE(review): looks like the inverse of the encoder's recenter_nonneg()
+ * mapping -- confirm against the encoder sources.
+ */
+static int inv_recenter_nonneg(int v, int m) {
+ if (v > 2 * m)
+ return v;
+
+ return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+}
+/* Reads a value coded with either l - 1 or l bits, where
+ * l = get_unsigned_bits(n) and m = (1 << l) - n. An (l - 1)-bit literal below
+ * m is returned as is; otherwise one extra bit is read and the result is
+ * (v << 1) - m + bit. Returns 0 without reading anything when l is 0.
+ * NOTE(review): presumably the result always lies in [0, n) -- confirm.
+ */
+static int decode_uniform(vp9_reader *r, int n) {
+ int v;
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (!l)
+ return 0;
+
+ v = vp9_read_literal(r, l - 1);
+ return v < m ? v : (v << 1) - m + vp9_read_bit(r);
+}
+
+/* Reorders index v. Positions below max1 map to v * modulus + modulus / 2.
+ * For the remaining positions, w is the position minus max1 and the loop
+ * advances v until v % modulus != modulus / 2 and
+ * w == v - (v + modulus - modulus / 2) / modulus.
+ * Within this file the function is only referenced from comments:
+ * inv_map_table in inv_remap_prob() is documented as holding its values.
+ */
+static int merge_index(int v, int n, int modulus) {
+ int max1 = (n - 1 - modulus / 2) / modulus + 1;
+ if (v < max1) {
+ v = v * modulus + modulus / 2;
+ } else {
+ int w;
+ v -= max1;
+ w = v;
+ v += (v + modulus - modulus / 2) / modulus;
+ while (v % modulus == modulus / 2 ||
+ w != v - (v + modulus - modulus / 2) / modulus) v++;
+ }
+ return v;
+}
+
+// Maps a decoded delta index v to the updated probability, given the current
+// probability m.
+//
+// v is read from the bitstream by decode_term_subexp(r, SUBEXP_PARAM, 255) and
+// can be as large as 254, which is one past the last entry of inv_map_table
+// (MAX_PROB - 1 entries). Out-of-range indices are clamped to the last entry
+// so that a malformed stream cannot make the decoder read beyond the table.
+// NOTE(review): the encoder-side remap appears to emit only indices below
+// MAX_PROB - 1, so conforming streams should be unaffected -- confirm.
+static int inv_remap_prob(int v, int m) {
+  static const int inv_map_table[MAX_PROB - 1] = {
+    // generated by:
+    //   inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM);
+    6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188,
+    201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10,
+    11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26,
+    27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+    43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59,
+    60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75,
+    76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91,
+    92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+    108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124,
+    125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140,
+    141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156,
+    157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+    173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189,
+    190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
+    206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
+    222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
+    238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
+  };
+  const int table_size =
+      (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]));
+
+  // The index comes from untrusted input; keep it inside the table.
+  if (v >= table_size)
+    v = table_size - 1;
+
+  // v = merge_index(v, MAX_PROB - 1, MODULUS_PARAM);
+  v = inv_map_table[v];
+  m--;
+  if ((m << 1) <= MAX_PROB) {
+    return 1 + inv_recenter_nonneg(v + 1, m);
+  } else {
+    return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m);
+  }
+}
+/* Decodes one value written with a sub-exponential code of parameter k.
+ * Buckets have width a = 1 << b, with b = k for the first two buckets and
+ * b = k + i - 1 afterwards. A set bit skips the current bucket (mk += a); a
+ * clear bit is followed by a b-bit literal giving the offset from mk. Once
+ * num_syms <= mk + 3 * a the rest is read with decode_uniform() instead.
+ */
+static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
+ int i = 0, mk = 0, word;
+ while (1) {
+ const int b = i ? k + i - 1 : k;
+ const int a = 1 << b;
+ if (num_syms <= mk + 3 * a) {
+ word = decode_uniform(r, num_syms - mk) + mk;
+ break;
+ } else {
+ if (vp9_read_bit(r)) {
+ i++;
+ mk += a;
+ } else {
+ word = vp9_read_literal(r, b) + mk;
+ break;
+ }
+ }
+ }
+ return word;
+}
+/* Reads a probability delta from r and overwrites *p with the result: the
+ * index decoded by decode_term_subexp(r, SUBEXP_PARAM, 255) is passed to
+ * inv_remap_prob() together with the current value of *p.
+ */
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
+ int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
+ *p = (vp9_prob)inv_remap_prob(delp, *p);
+}
--- /dev/null
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_DSUBEXP_H_
+#define VP9_DECODER_VP9_DSUBEXP_H_
+
+#include "vp9/decoder/vp9_dboolhuff.h"
+
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
+
+#endif // VP9_DECODER_VP9_DSUBEXP_H_
int initial_height;
} VP9D_COMP;
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
-
#endif // VP9_DECODER_VP9_TREEREADER_H_
return value;
}
+static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
+ int bits) {
+ const int value = vp9_rb_read_literal(rb, bits);  // magnitude, `bits` wide
+ return vp9_rb_read_bit(rb) ? -value : value;  // next bit set => negative
+}
+
#endif // VP9_READ_BIT_BUFFER_
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_bitstream.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_subexp.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
extern unsigned int active_section;
#endif
-#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
-#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
#ifdef MODE_STATS
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
}
#endif
-static int update_bits[255];
-
static INLINE void write_be32(uint8_t *p, int value) {
p[0] = value >> 24;
p[1] = value >> 16;
p[3] = value;
}
-
-
-int recenter_nonneg(int v, int m) {
- if (v > (m << 1))
- return v;
- else if (v >= m)
- return ((v - m) << 1);
- else
- return ((m - v) << 1) - 1;
-}
-
-static int get_unsigned_bits(unsigned num_values) {
- int cat = 0;
- if ((num_values--) <= 1) return 0;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
int data, int max) {
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-void encode_uniform(vp9_writer *w, int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0)
- return;
- m = (1 << l) - n;
- if (v < m) {
- vp9_write_literal(w, v, l - 1);
- } else {
- vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
- vp9_write_literal(w, (v - m) & 1, 1);
- }
-}
-
-int count_uniform(int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0) return 0;
- m = (1 << l) - n;
- if (v < m)
- return l - 1;
- else
- return l;
-}
-
-void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- encode_uniform(w, word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- vp9_write_literal(w, t, 1);
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- vp9_write_literal(w, word - mk, b);
- break;
- }
- }
- }
-}
-
-int count_term_subexp(int word, int k, int num_syms) {
- int count = 0;
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- count += count_uniform(word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- count++;
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- count += b;
- break;
- }
- }
- }
- return count;
-}
-
-static void compute_update_table() {
- int i;
- for (i = 0; i < 254; i++)
- update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
-}
-
-static int split_index(int i, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (i % modulus == modulus / 2) i = i / modulus;
- else i = max1 + i - (i + modulus - modulus / 2) / modulus;
- return i;
-}
-
-static int remap_prob(int v, int m) {
- const int n = 255;
- const int modulus = MODULUS_PARAM;
- int i;
- v--;
- m--;
- if ((m << 1) <= n)
- i = recenter_nonneg(v, m) - 1;
- else
- i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
- i = split_index(i, n - 1, modulus);
- return i;
-}
-
-static void write_prob_diff_update(vp9_writer *w,
- vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
-}
-
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- return update_bits[delp] * 256;
-}
-
-static int prob_update_savings(const unsigned int *ct,
- const vp9_prob oldp, const vp9_prob newp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- const int new_b = cost_branch256(ct, newp);
- const int update_b = 2048 + vp9_cost_upd256;
- return old_b - new_b - update_b;
-}
-
-static int prob_diff_update_savings_search(const unsigned int *ct,
- const vp9_prob oldp, vp9_prob *bestp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- int new_b, update_b, savings, bestsavings, step;
- vp9_prob newp, bestnewp;
-
- bestsavings = 0;
- bestnewp = oldp;
-
- step = (*bestp > oldp ? -1 : 1);
- for (newp = *bestp; newp != oldp; newp += step) {
- new_b = cost_branch256(ct, newp);
- update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
- }
- }
- *bestp = bestnewp;
- return bestsavings;
-}
-
-static int prob_diff_update_savings_search_model(const unsigned int *ct,
- const vp9_prob *oldp,
- vp9_prob *bestp,
- const vp9_prob upd,
- int b, int r) {
- int i, old_b, new_b, update_b, savings, bestsavings, step;
- int newp;
- vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
- vp9_model_to_full_probs(oldp, oldplist);
- vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
- for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
- old_b += cost_branch256(ct + 2 * i, oldplist[i]);
- old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
-
- bestsavings = 0;
- bestnewp = oldp[PIVOT_NODE];
-
- step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
- newp = *bestp;
- for (; newp != oldp[PIVOT_NODE]; newp += step) {
- if (newp < 1 || newp > 255) continue;
- newplist[PIVOT_NODE] = newp;
- vp9_model_to_full_probs(newplist, newplist);
- for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
- new_b += cost_branch256(ct + 2 * i, newplist[i]);
- new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
- update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
- vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
- }
- }
- *bestp = bestnewp;
- return bestsavings;
-}
-
-static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd,
- unsigned int *ct) {
- vp9_prob newp;
- int savings;
- newp = get_binary_prob(ct[0], ct[1]);
- assert(newp >= 1);
- savings = prob_update_savings(ct, *oldp, newp, upd);
- if (savings > 0) {
- vp9_write(bc, 1, upd);
- vp9_write_prob(bc, newp);
- *oldp = newp;
- } else {
- vp9_write(bc, 0, upd);
- }
-}
-
-static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp,
- vp9_prob upd,
- unsigned int *ct) {
- vp9_prob newp;
- int savings;
- newp = get_binary_prob(ct[0], ct[1]);
- assert(newp >= 1);
- savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd);
- if (savings > 0) {
- vp9_write(bc, 1, upd);
- write_prob_diff_update(bc, newp, *oldp);
- *oldp = newp;
- } else {
- vp9_write(bc, 0, upd);
- }
-}
-
static void update_mode(
vp9_writer *w,
int n,
(unsigned int *)cpi->y_mode_count[j]);
}
+static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size,
+ BLOCK_SIZE_TYPE bsize, vp9_writer *w) {  /* writes tx_size as 1-3 flags */
+ const VP9_COMMON *const c = &cpi->common;
+ const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE);
+ vp9_write(w, tx_size != TX_4X4, tx_probs[0]);  // flag: not 4x4
+ if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) {
+ vp9_write(w, tx_size != TX_8X8, tx_probs[1]);  // flag: not 8x8
+ if (bsize >= BLOCK_SIZE_SB32X32 && tx_size != TX_8X8)
+ vp9_write(w, tx_size != TX_16X16, tx_probs[2]);  // flag: not 16x16
+ }
+}
+/* Writes the block's skip flag unless the segment already implies it.
+ * Returns 1 without writing anything when SEG_LVL_SKIP is active for
+ * segment_id; otherwise writes m->mbmi.mb_skip_coeff using the PRED_MBSKIP
+ * probability and returns that value.
+ */
+static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
+ vp9_writer *w) {
+ const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip_coeff = m->mbmi.mb_skip_coeff;
+ vp9_write(w, skip_coeff, vp9_get_pred_prob(&cpi->common, xd, PRED_MBSKIP));
+ return skip_coeff;
+ }
+}
+
void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) {
VP9_COMMON *const pc = &cpi->common;
int k;
*tp = p;
}
-static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
+static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE m,
const vp9_prob *p) {
-#if CONFIG_DEBUG
assert(NEARESTMV <= m && m <= NEWMV);
-#endif
- write_token(bc, vp9_sb_mv_ref_tree, p,
+ write_token(w, vp9_sb_mv_ref_tree, p,  // encodings indexed from NEARESTMV
vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
}
-// This function writes the current macro block's segnment id to the bitstream
-// It should only be called if a segment map update is indicated.
-static void write_mb_segid(vp9_writer *bc,
- const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
+/* Writes segment_id to w using vp9_segment_tree and
+ * xd->mb_segment_tree_probs. Nothing is written unless segmentation is
+ * enabled and the segmentation map is being updated.
+ */
+static void write_segment_id(vp9_writer *w, const MACROBLOCKD *xd,
+ int segment_id) {
if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
- treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,
- mi->segment_id, 3);
+ treed_write(w, vp9_segment_tree, xd->mb_segment_tree_probs, segment_id, 3);
}
// This function encodes the reference frame
// If the mb segment id wasn't predicted code explicitly
if (!prediction_flag)
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ write_segment_id(bc, xd, mi->segment_id);
} else {
// Normal unpredicted coding
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ write_segment_id(bc, xd, mi->segment_id);
}
}
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- skip_coeff = 1;
- } else {
- skip_coeff = m->mbmi.mb_skip_coeff;
- vp9_write(bc, skip_coeff,
- vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
- }
+ skip_coeff = write_skip_coeff(cpi, segment_id, m, bc);
if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
vp9_write(bc, rf != INTRA_FRAME,
if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
!(rf != INTRA_FRAME &&
(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
- TX_SIZE sz = mi->txfm_size;
- const vp9_prob *tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE);
- vp9_write(bc, sz != TX_4X4, tx_probs[0]);
- if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
- vp9_write(bc, sz != TX_8X8, tx_probs[1]);
- if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
- vp9_write(bc, sz != TX_16X16, tx_probs[2]);
- }
+ write_selected_txfm_size(cpi, mi->txfm_size, mi->sb_type, bc);
}
if (rf == INTRA_FRAME) {
int bh = 1 << b_height_log2(mi->sb_type);
for (idy = 0; idy < 2; idy += bh)
for (idx = 0; idx < 2; idx += bw) {
- MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first;
+ const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
}
}
- write_intra_mode(bc, mi->uv_mode,
- pc->fc.uv_mode_prob[mode]);
+ write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]);
} else {
vp9_prob *mv_ref_p;
-
encode_ref_frame(cpi, bc);
-
mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]];
#ifdef ENTROPY_STATS
if (cpi->common.mcomp_filter_type == SWITCHABLE) {
write_token(bc, vp9_switchable_interp_tree,
- vp9_get_pred_probs(&cpi->common, xd,
- PRED_SWITCHABLE_INTERP),
+ vp9_get_pred_probs(&cpi->common, xd, PRED_SWITCHABLE_INTERP),
vp9_switchable_interp_encodings +
vp9_switchable_interp_map[mi->interp_filter]);
} else {
#ifdef ENTROPY_STATS
active_section = 11;
#endif
- vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+ vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(bc,
+ vp9_encode_mv(cpi, bc,
&cpi->mb.partition_info->bmi[j].second_mv.as_mv,
&mi->best_second_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
#ifdef ENTROPY_STATS
active_section = 5;
#endif
- vp9_encode_mv(bc,
+ vp9_encode_mv(cpi, bc,
&mi->mv[0].as_mv, &mi->best_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(bc,
+ vp9_encode_mv(cpi, bc,
&mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
}
const int ym = m->mbmi.mode;
const int mis = c->mode_info_stride;
const int segment_id = m->mbmi.segment_id;
- int skip_coeff;
if (xd->update_mb_segmentation_map)
- write_mb_segid(bc, &m->mbmi, xd);
+ write_segment_id(bc, xd, m->mbmi.segment_id);
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- skip_coeff = 1;
- } else {
- skip_coeff = m->mbmi.mb_skip_coeff;
- vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
- }
+ write_skip_coeff(cpi, segment_id, m, bc);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
- TX_SIZE sz = m->mbmi.txfm_size;
- const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE);
- vp9_write(bc, sz != TX_4X4, tx_probs[0]);
- if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
- vp9_write(bc, sz != TX_8X8, tx_probs[1]);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
- vp9_write(bc, sz != TX_16X16, tx_probs[2]);
- }
- }
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT)
+ write_selected_txfm_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc);
if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
left_block_mode(m, i) : DC_PRED;
- const int bm = m->bmi[i].as_mode.first;
+ const int bm = m->bmi[i].as_mode;
#ifdef ENTROPY_STATS
++intra_mode_stats[A][L][bm];
#endif
if (bsize >= BLOCK_SIZE_SB8X8) {
int pl;
- int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = cm->above_seg_context + mi_col;
+ const int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
// encode the partition information
if (idx == 0)
m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
- for (mi_row = c->cur_tile_mi_row_start;
- mi_row < c->cur_tile_mi_row_end;
+ for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
mi_row += 8, m_ptr += 8 * mis) {
m = m_ptr;
vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
- for (mi_col = c->cur_tile_mi_col_start;
- mi_col < c->cur_tile_mi_col_end;
- mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE)
+ for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE)
write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
BLOCK_SIZE_SB64X64);
}
if (l >= 3 && k == 0)
continue;
if (t == PIVOT_NODE)
- s = prob_diff_update_savings_search_model(
+ s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
else
- s = prob_diff_update_savings_search(
+ s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
if (s > 0 && newp != oldp)
u = 1;
if (l >= 3 && k == 0)
continue;
if (t == PIVOT_NODE)
- s = prob_diff_update_savings_search_model(
+ s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
else
- s = prob_diff_update_savings_search(
+ s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
*oldp, &newp, upd);
if (s > 0 && newp != *oldp)
#endif
if (u) {
/* send/use new probability */
- write_prob_diff_update(bc, newp, *oldp);
+ vp9_write_prob_diff_update(bc, newp, *oldp);
*oldp = newp;
}
}
bytes_packed = vp9_rb_bytes_written(&wb);
cx_data += bytes_packed;
- compute_update_table();
+ vp9_compute_update_table();
vp9_start_encode(&header_bc, cx_data);
unsigned char *data_ptr = cx_data + header_bc.pos;
TOKENEXTRA *tok[4][1 << 6], *tok_end;
- vpx_memset(cpi->common.above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(&cpi->common));
+ vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+ mi_cols_aligned_to_sb(pc->mi_cols));
tok[0][0] = cpi->tok;
for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
if (tile_row) {
int comp_pred_diff;
int single_pred_diff;
int64_t txfm_rd_diff[NB_TXFM_MODES];
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
// Bit flag for each mode whether it has high error in comparison to others.
unsigned int modes_with_high_error;
// Quantizer setings
int16_t *quant;
- uint8_t *quant_shift;
+ int16_t *quant_shift;
int16_t *zbin;
- int16_t *zrun_zbin_boost;
int16_t *round;
// Zbin Over Quant value
int **mvsadcost;
int mbmode_cost[MB_MODE_COUNT];
+ unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
unsigned char *active_ptr;
// note that token_costs is the cost when eob node is skipped
- vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
- vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
+ vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2];
int optimize;
temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j + i * 8] = temp_out[j] >> 1;
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
}
}
temp_in[j] = input[j * pitch + i] << 2;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
temp_in[j] = input[j * shortpitch + i] << 2;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
+ // TODO(cd): see quality impact of only doing
+ // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
int enc_debug = 0;
#endif
-void vp9_select_interp_filter_type(VP9_COMP *cpi);
-
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
unsigned int tmp;
// Create a list to sort to
- CHECK_MEM_ERROR(sortlist,
- vpx_calloc(sizeof(unsigned int),
- cpi->common.MBs));
+ CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
// Copy map to sort list
vpx_memcpy(sortlist, cpi->mb_activity_map,
MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
-#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
- MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
-#endif
+
int mb_mode_index = ctx->best_mode_index;
const int mis = cpi->common.mode_info_stride;
const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
+ const MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
-#if CONFIG_DEBUG
assert(mb_mode < MB_MODE_COUNT);
assert(mb_mode_index < MAX_MODES);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
-#endif
-
assert(mi->mbmi.sb_type == bsize);
+
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
for (y = 0; y < bh; y++) {
}
}
}
+ // FIXME(rbultje) I'm pretty sure this should go to the end of this block
+ // (i.e. after the output_enabled)
if (bsize < BLOCK_SIZE_SB32X32) {
if (bsize < BLOCK_SIZE_MB16X16)
ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
#endif
} else {
- /*
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
- {
- int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
- cpi->rd_thresh_mult[mb_mode_index] =
- (cpi->rd_thresh_mult[mb_mode_index]
- >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
- MIN_THRESHMULT;
- cpi->rd_threshes[mb_mode_index] =
- (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
- * cpi->rd_thresh_mult[mb_mode_index];
-
- }
- */
// Note how often each mode chosen as best
cpi->mode_chosen_counts[mb_mode_index]++;
if (mbmi->ref_frame[0] != INTRA_FRAME
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- }
-}
-
-static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
- int start_y, int height, int start_x, int width) {
- const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
- const int end_x = MIN(start_x + bw, width);
- const int end_y = MIN(start_y + bh, height);
- int x, y;
- unsigned seg_id = -1;
- buf += width * start_y;
- assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end);
- for (y = start_y; y < end_y; y++, buf += width) {
- for (x = start_x; x < end_x; x++) {
- seg_id = MIN(seg_id, buf[x]);
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
-
- return seg_id;
}
void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
/* segment ID */
if (xd->segmentation_enabled) {
- uint8_t *map =
- xd->update_mb_segmentation_map ?
- cpi->segmentation_map : cm->last_frame_seg_map;
- mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, cm->mi_rows, mi_col,
- cm->mi_cols);
+ uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
- assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
vp9_mb_init_quantizer(cpi, x);
if (xd->segmentation_enabled && cpi->seg0_cnt > 0
vpx_memcpy(cm->above_seg_context + mi_col, sa,
sizeof(PARTITION_CONTEXT) * mw);
vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mh)
- ;}
+ sizeof(PARTITION_CONTEXT) * mh);
+}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0];
YV12_BUFFER_CONFIG *second_ref_fb = NULL;
- setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
+ setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
+ &xd->scale_factor[0], &xd->scale_factor_uv[0]);
+ setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
+ &xd->scale_factor[1], &xd->scale_factor_uv[1]);
xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]],
}
static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
- int *rate, int64_t *dist) {
+ int *rate, int64_t *dist, int do_recon) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
int bwl = b_width_log2(m->mbmi.sb_type);
int bhl = b_height_log2(m->mbmi.sb_type);
int bsl = b_width_log2(bsize);
- int bh = (1 << bhl);
int bs = (1 << bsl);
+ int bh = (1 << bhl);
+ int ms = bs / 2;
+ int mh = bh / 2;
int bss = (1 << bsl) / 4;
int i, pl;
PARTITION_TYPE partition;
BLOCK_SIZE_TYPE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
- int r = 0;
- int64_t d = 0;
+ int last_part_rate = INT_MAX;
+ int64_t last_part_dist = INT_MAX;
+ int split_rate = INT_MAX;
+ int64_t split_dist = INT_MAX;
+ int none_rate = INT_MAX;
+ int64_t none_dist = INT_MAX;
+ int chosen_rate = INT_MAX;
+ int64_t chosen_dist = INT_MAX;
+ BLOCK_SIZE_TYPE sub_subsize = BLOCK_SIZE_AB4X4;
+ int splits_below = 0;
+ BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
} else {
*(get_sb_partitioning(x, bsize)) = subsize;
}
- pl = partition_plane_context(xd, bsize);
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) {
+ sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+ // If partition is not none try none unless each of the 4 splits are split
+ // even further..
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + (ms >> 1) < cm->mi_rows &&
+ mi_col + (ms >> 1) < cm->mi_cols) {
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &none_rate, &none_dist, bsize,
+ get_block_context(x, bsize));
+
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ none_rate += x->partition_cost[pl][PARTITION_NONE];
+
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ m->mbmi.sb_type = bs_type;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ }
+
switch (partition) {
case PARTITION_NONE:
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
- get_block_context(x, bsize));
- r += x->partition_cost[pl][PARTITION_NONE];
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &last_part_rate, &last_part_dist,
+ bsize, get_block_context(x, bsize));
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ last_part_rate += x->partition_cost[pl][PARTITION_NONE];
break;
case PARTITION_HORZ:
*(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- if (mi_row + (bh >> 1) <= cm->mi_rows) {
- int rt;
- int64_t dt;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &last_part_rate, &last_part_dist,
+ subsize, get_block_context(x, subsize));
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
+ int rt = 0;
+ int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &rt, &dt, subsize,
get_block_context(x, subsize));
- r += rt;
- d += dt;
+ last_part_rate += rt;
+ last_part_dist += dt;
}
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_HORZ];
+ last_part_rate += x->partition_cost[pl][PARTITION_HORZ];
break;
case PARTITION_VERT:
*(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- if (mi_col + (bs >> 1) <= cm->mi_cols) {
- int rt;
- int64_t dt;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &last_part_rate, &last_part_dist,
+ subsize, get_block_context(x, subsize));
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ int rt = 0;
+ int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &rt, &dt, subsize,
get_block_context(x, subsize));
- r += rt;
- d += dt;
+ last_part_rate += rt;
+ last_part_dist += dt;
}
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_VERT];
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ last_part_rate += x->partition_cost[pl][PARTITION_VERT];
break;
case PARTITION_SPLIT:
+ // Split partition.
+ last_part_rate = 0;
+ last_part_dist = 0;
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (bs >> 2);
- int y_idx = (i >> 1) * (bs >> 2);
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
int jj = i >> 1, ii = i & 0x01;
int rt;
int64_t dt;
*(get_sb_index(xd, subsize)) = i;
rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &rt, &dt);
- r += rt;
- d += dt;
+ mi_col + x_idx, subsize, &rt, &dt, i != 3);
+ last_part_rate += rt;
+ last_part_dist += dt;
}
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_SPLIT];
+ last_part_rate += x->partition_cost[pl][PARTITION_SPLIT];
break;
default:
assert(0);
}
+ if (cpi->sf.adjust_partitioning_from_last_frame
+ && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8
+ && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
+ && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
+ BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ split_rate = 0;
+ split_dist = 0;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * (bs >> 2);
+ int y_idx = (i >> 1) * (bs >> 2);
+ int rt = 0;
+ int64_t dt = 0;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+
+ if ((mi_row + y_idx >= cm->mi_rows)
+ || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ *(get_sb_index(xd, split_subsize)) = i;
+ *(get_sb_partitioning(x, bsize)) = split_subsize;
+ *(get_sb_partitioning(x, split_subsize)) = split_subsize;
+
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, tp, &rt, &dt,
+ split_subsize, get_block_context(x, split_subsize));
+
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (rt < INT_MAX && dt < INT_MAX && i != 3)
+ encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ split_subsize);
+
+ split_rate += rt;
+ split_dist += dt;
+ set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx);
+ pl = partition_plane_context(xd, bsize);
+ split_rate += x->partition_cost[pl][PARTITION_NONE];
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ split_rate += x->partition_cost[pl][PARTITION_SPLIT];
+
+ chosen_rate = split_rate;
+ chosen_dist = split_dist;
+ }
+
+ // If last_part is better set the partitioning to that...
+ if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
+ < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
+ m->mbmi.sb_type = bsize;
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ chosen_rate = last_part_rate;
+ chosen_dist = last_part_dist;
+ }
+ // If none was better set the partitioning to that...
+ if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
+ > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ chosen_rate = none_rate;
+ chosen_dist = none_dist;
+ }
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (r < INT_MAX && d < INT_MAX)
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
+
+ if (do_recon)
encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
- *rate = r;
- *dist = d;
+ *rate = chosen_rate;
+ *dist = chosen_dist;
}
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
- int64_t *dist) {
+ int64_t *dist, int do_recon) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
*(get_sb_index(xd, subsize)) = i;
rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
- &d);
+ &d, i != 3);
r4 += r;
d4 += d;
if (!cpi->sf.use_partitions_less_than
|| (cpi->sf.use_partitions_less_than
&& bsize <= cpi->sf.less_than_block_size)) {
- // PARTITION_HORZ
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
- int r2, r = 0;
- int64_t d2, d = 0;
- subsize = get_subsize(bsize, PARTITION_HORZ);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
-
- if (mi_row + (ms >> 1) < cm->mi_rows) {
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2)
- < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- }
-
- // PARTITION_VERT
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
- int r2;
- int64_t d2;
- subsize = get_subsize(bsize, PARTITION_VERT);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
- if (mi_col + (ms >> 1) < cm->mi_cols) {
- int r = 0;
- int64_t d = 0;
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
- get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2)
- < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- }
-
+ int larger_is_better = 0;
// PARTITION_NONE
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
< RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
srate = r;
sdist = d;
+ larger_is_better = 1;
if (bsize >= BLOCK_SIZE_SB8X8)
*(get_sb_partitioning(x, bsize)) = bsize;
}
}
+ if (!cpi->sf.use_square_partition_only &&
+ (!cpi->sf.less_rectangular_check ||!larger_is_better)) {
+ // PARTITION_HORZ
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ int r2, r = 0;
+ int64_t d2, d = 0;
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+
+ if (mi_row + (ms >> 1) < cm->mi_rows) {
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_HORZ];
+ if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // PARTITION_VERT
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+ int r2;
+ int64_t d2;
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+ if (mi_col + (ms >> 1) < cm->mi_cols) {
+ int r = 0;
+ int64_t d = 0;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_VERT];
+ if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+ }
}
*rate = srate;
*dist = sdist;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (srate < INT_MAX && sdist < INT_MAX)
+ if (srate < INT_MAX && sdist < INT_MAX && do_recon)
encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
if (bsize == BLOCK_SIZE_SB64X64) {
}
}
+// Examines the 64x64 block at (mi_row, mi_col) and chooses a best reference
+// frame.
+//
+// cpi->ref_frame_mask is cleared first (no reference frame masked out). If
+// the block passes the frame-bounds test below, pick_sb_modes() is run once
+// for BLOCK_SIZE_SB64X64 with cpi->set_ref_frame_mask raised for the
+// duration of the call, and the PARTITION_NONE cost is added to the rate.
+// NOTE(review): presumably pick_sb_modes() fills in cpi->ref_frame_mask while
+// set_ref_frame_mask is set -- confirm against the mode search code.
+//
+// The entropy and partition contexts are saved on entry and restored before
+// returning. Nothing is encoded here.
+//
+// On return *rate / *dist hold the cost of the 64x64 search, or INT_MAX when
+// no search was performed because the block failed the bounds test.
+static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+                                    int mi_col, int *rate, int64_t *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl;
+  int ms = bs / 2;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  int pl;
+  // Start from the "no valid result" sentinel so that the outputs are well
+  // defined even when the search below is skipped.
+  int r = INT_MAX;
+  int64_t d = INT_MAX;
+
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+
+  // Default is no mask (all reference frames allowed).
+  cpi->ref_frame_mask = 0;
+
+  // Do RD search for 64x64.
+  if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+      (mi_col + (ms >> 1) < cm->mi_cols)) {
+    cpi->set_ref_frame_mask = 1;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, BLOCK_SIZE_SB64X64,
+                  get_block_context(x, BLOCK_SIZE_SB64X64));
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+    r += x->partition_cost[pl][PARTITION_NONE];
+
+    *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64;
+    cpi->set_ref_frame_mask = 0;
+  }
+
+  *rate = r;
+  *dist = d;
+
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+}
+
static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
int *totalrate) {
VP9_COMMON * const cm = &cpi->common;
// Code each SB in the row
for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
- mi_col += 64 / MI_SIZE) {
+ mi_col += MI_BLOCK_SIZE) {
int dummy_rate;
int64_t dummy_dist;
+
+ // Initialize a mask of modes that we will not consider;
+ // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden)
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->unused_mode_skip_mask = 0;
+ else
+ cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00;
+
+ if (cpi->sf.reference_masking) {
+ rd_pick_reference_frame(cpi, tp, mi_row, mi_col,
+ &dummy_rate, &dummy_dist);
+ }
+
if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
set_partitioning(cpi, m, cpi->sf.always_this_block_size);
rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ &dummy_rate, &dummy_dist, 1);
} else if (cpi->sf.partition_by_variance) {
choose_partitioning(cpi, cm->mi, mi_row, mi_col);
rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ &dummy_rate, &dummy_dist, 1);
} else {
- if ((cpi->common.current_video_frame & 1) == 0 || cm->prev_mi == 0
+ if ((cpi->common.current_video_frame
+ % cpi->sf.last_partitioning_redo_frequency) == 0
+ || cm->prev_mi == 0
|| cpi->common.show_frame == 0
|| cpi->common.frame_type == KEY_FRAME
|| cpi->is_src_frame_alt_ref) {
rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ &dummy_rate, &dummy_dist, 1);
} else {
copy_partitioning(cpi, m, p);
rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ &dummy_rate, &dummy_dist, 1);
}
}
} else {
rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ &dummy_rate, &dummy_dist, 1);
}
}
}
static void init_encode_frame_mb_context(VP9_COMP *cpi) {
- MACROBLOCK * const x = &cpi->mb;
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCKD * const xd = &x->e_mbd;
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
x->act_zbin_adj = 0;
cpi->seg0_idx = 0;
vp9_setup_src_planes(x, cpi->Source, 0, 0);
// TODO(jkoleszar): are these initializations required?
- setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
- 0, 0, NULL, NULL );
+ setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
+ 0, 0, NULL, NULL);
setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+ setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_context->mbmi.uv_mode = DC_PRED;
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(
- cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
+ vpx_memset(cm->above_context[0], 0,
+ sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * mi_cols_aligned_to_sb(cm));
+ sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
+ // printf("Switching to lossless\n");
cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
cpi->zbin_mode_boost_enabled = 0;
cpi->common.txfm_mode = ONLY_4X4;
} else {
+ // printf("Not lossless\n");
cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
}
}
+// When the transform-size search method is USE_LARGESTALL, any frame
+// transform mode at or above ALLOW_32X32 is folded back to ALLOW_32X32.
+// In every other case the current mode is left untouched.
+static void switch_txfm_mode(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cpi->sf.tx_size_search_method != USE_LARGESTALL)
+    return;
+  if (cm->txfm_mode < ALLOW_32X32)
+    return;
+
+  cm->txfm_mode = ALLOW_32X32;
+}
+
static void encode_frame_internal(VP9_COMP *cpi) {
int mi_row;
MACROBLOCK * const x = &cpi->mb;
cpi->inter_zz_count = 0;
vp9_zero(cm->fc.switchable_interp_count);
- vp9_zero(cpi->best_switchable_interp_count);
+ vp9_zero(cpi->txfm_stepdown_count);
xd->mode_info_context = cm->mi;
xd->prev_mode_info_context = cm->prev_mi;
vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
vp9_initialize_me_consts(cpi, cm->base_qindex);
+ switch_txfm_mode(cpi);
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
// Initialize encode frame context.
init_encode_frame_mb_context(cpi);
vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
+ vp9_zero(cpi->rd_filter_diff);
vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
}
}
+// Classifies the frame being encoded into one of four buckets:
+//   0 - key frame
+//   1 - frame that refreshes the golden or the alt-ref buffer
+//   2 - any other frame
+//   3 - source frame that is an alt-ref and also refreshes golden
+// The checks are ordered, so the first matching rule wins.
+static int get_frame_type(VP9_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME)
+    return 0;
+
+  if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
+    return 3;
+
+  if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+    return 1;
+
+  return 2;
+}
+
+// Chooses cpi->common.txfm_mode for the frame about to be encoded:
+//   - lossless coding            -> ONLY_4X4
+//   - very first video frame     -> TX_MODE_SELECT
+//   - USE_LARGESTALL search      -> ALLOW_32X32
+//   - USE_FULL_RD search         -> ALLOW_32X32 if its RD threshold for this
+//                                   frame type exceeds the TX_MODE_SELECT
+//                                   one, else TX_MODE_SELECT
+//   - any other search method    -> ALLOW_32X32 if more than 90% of the
+//                                   txfm_stepdown_count total sits in
+//                                   bucket 0, else TX_MODE_SELECT; the mode
+//                                   is left unchanged when the total is 0.
+static void select_txfm_mode(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cpi->oxcf.lossless) {
+    cm->txfm_mode = ONLY_4X4;
+  } else if (cm->current_video_frame == 0) {
+    cm->txfm_mode = TX_MODE_SELECT;
+  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+    cm->txfm_mode = ALLOW_32X32;
+  } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+    int frame_type = get_frame_type(cpi);
+
+    if (cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
+        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT])
+      cm->txfm_mode = ALLOW_32X32;
+    else
+      cm->txfm_mode = TX_MODE_SELECT;
+  } else {
+    unsigned int total = 0;
+    int size;
+
+    for (size = 0; size < TX_SIZE_MAX_SB; ++size)
+      total += cpi->txfm_stepdown_count[size];
+
+    // With no samples collected there is nothing to base a decision on,
+    // so the previous mode is kept.
+    if (total) {
+      double fraction = (double)cpi->txfm_stepdown_count[0] / total;
+
+      if (fraction > 0.90)
+        cm->txfm_mode = ALLOW_32X32;
+      else
+        cm->txfm_mode = TX_MODE_SELECT;
+    }
+  }
+}
+
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
// side behaviour is where the ALT ref buffer has oppositie sign bias to
// the other two.
if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
- == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+ == cm->ref_frame_sign_bias[GOLDEN_FRAME])
|| (cm->ref_frame_sign_bias[ALTREF_FRAME]
== cm->ref_frame_sign_bias[LAST_FRAME])) {
cm->allow_comp_inter_inter = 0;
}
if (cpi->sf.RD) {
- int i, frame_type, pred_type;
- TXFM_MODE txfm_type;
-
+ int i, pred_type;
+ INTERPOLATIONFILTERTYPE filter_type;
/*
* This code does a single RD pass over the whole frame assuming
* either compound, single or hybrid prediction as per whatever has
* that for subsequent frames.
* It does the same analysis for transform size selection also.
*/
- if (cpi->common.frame_type == KEY_FRAME)
- frame_type = 0;
- else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
- frame_type = 3;
- else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
- frame_type = 1;
- else
- frame_type = 2;
+ int frame_type = get_frame_type(cpi);
/* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3 || !cm->allow_comp_inter_inter)
pred_type = SINGLE_PREDICTION_ONLY;
else if (cpi->rd_prediction_type_threshes[frame_type][1]
- > cpi->rd_prediction_type_threshes[frame_type][0]
- && cpi->rd_prediction_type_threshes[frame_type][1]
- > cpi->rd_prediction_type_threshes[frame_type][2]
- && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ > cpi->rd_prediction_type_threshes[frame_type][0]
+ && cpi->rd_prediction_type_threshes[frame_type][1]
+ > cpi->rd_prediction_type_threshes[frame_type][2]
+ && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
pred_type = COMP_PREDICTION_ONLY;
else if (cpi->rd_prediction_type_threshes[frame_type][0]
- > cpi->rd_prediction_type_threshes[frame_type][2])
+ > cpi->rd_prediction_type_threshes[frame_type][2])
pred_type = SINGLE_PREDICTION_ONLY;
else
pred_type = HYBRID_PREDICTION;
+ /* filter type selection */
+ // FIXME(rbultje) for some odd reason, we often select smooth_filter
+ // as default filter for ARF overlay frames. This is a REALLY BAD
+ // IDEA so we explicitly disable it here.
+ if (frame_type != 3 &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][0] &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][2] &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[1];
+ } else if (cpi->rd_filter_threshes[frame_type][2] >
+ cpi->rd_filter_threshes[frame_type][0] &&
+ cpi->rd_filter_threshes[frame_type][2] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[2];
+ } else if (cpi->rd_filter_threshes[frame_type][0] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[0];
+ } else {
+ filter_type = SWITCHABLE;
+ }
+
/* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
cpi->mb.e_mbd.lossless = 0;
if (cpi->oxcf.lossless) {
- txfm_type = ONLY_4X4;
cpi->mb.e_mbd.lossless = 1;
- } else
-#if 0
- /* FIXME (rbultje): this code is disabled until we support cost updates
- * while a frame is being encoded; the problem is that each time we
- * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
- * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
- * further behind and not being chosen for subsequent frames either. This
- * is essentially a local minimum problem that we can probably fix by
- * estimating real costs more closely within a frame, perhaps by re-
- * calculating costs on-the-fly as frame encoding progresses. */
- if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = TX_MODE_SELECT;
- } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
- && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
- ) {
- txfm_type = ONLY_4X4;
- } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = ALLOW_16X16;
- } else
- txfm_type = ALLOW_8X8;
-#else
- txfm_type =
- cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
- > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
- ALLOW_32X32 : TX_MODE_SELECT;
-#endif
- cpi->common.txfm_mode = txfm_type;
+ }
+
+ select_txfm_mode(cpi);
cpi->common.comp_pred_mode = pred_type;
+ cpi->common.mcomp_filter_type = filter_type;
encode_frame_internal(cpi);
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
+ cpi->rd_filter_threshes[frame_type][i] =
+ (cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
+ }
+
for (i = 0; i < NB_TXFM_MODES; ++i) {
int64_t pd = cpi->rd_tx_select_diff[i];
int diff;
if (i == TX_MODE_SELECT)
pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
- 2048 * (TX_SIZE_MAX_SB - 1), 0);
+ 2048 * (TX_SIZE_MAX_SB - 1), 0);
diff = (int) (pd / cpi->common.MBs);
cpi->rd_tx_select_threshes[frame_type][i] += diff;
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
cpi->common.txfm_mode = ALLOW_8X8;
reset_skip_txfm_size(cpi, TX_8X8);
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
- && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+ && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
cpi->common.txfm_mode = ONLY_4X4;
reset_skip_txfm_size(cpi, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
reset_skip_txfm_size(cpi, TX_16X16);
}
}
-
- // Update interpolation filter strategy for next frame.
- if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
- vp9_select_interp_filter_type(cpi);
} else {
encode_frame_internal(cpi);
}
}
-void vp9_build_block_offsets(MACROBLOCK *x) {
-}
-
static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
const MACROBLOCKD *xd = &x->e_mbd;
const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
- int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first;
+ int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode;
++cpi->y_mode_count[0][m];
}
}
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- int n;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *mbmi = &mi->mbmi;
unsigned int segment_id = mbmi->segment_id;
assert(cm->frame_type != KEY_FRAME);
- setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
+ setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
+ &xd->scale_factor[0], &xd->scale_factor_uv[0]);
+ setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
+ &xd->scale_factor[1], &xd->scale_factor_uv[1]);
+
vp9_build_inter_predictors_sb(
xd, mi_row, mi_col,
vp9_tokenize_sb(cpi, xd, t, !output_enabled,
(bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
} else {
- // FIXME(rbultje): not tile-aware (mi - 1)
- int mb_skip_context = (mi - 1)->mbmi.mb_skip_coeff
- + (mi - mis)->mbmi.mb_skip_coeff;
+ int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0;
+ mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff;
mbmi->mb_skip_coeff = 1;
if (output_enabled)
// copy skip flag on all mb_mode_info contexts in this SB
// if this was a skip at this txfm size
- for (n = 1; n < bw * bh; n++) {
- const int x_idx = n & (bw - 1), y_idx = n >> bwl;
- if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
- mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
- }
+ vp9_set_pred_flag(xd, bsize, PRED_MBSKIP, mi->mbmi.mb_skip_coeff);
if (output_enabled) {
if (cm->txfm_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_SIZE_SB8X8
struct macroblock;
struct yv12_buffer_config;
-void vp9_build_block_offsets(struct macroblock *x);
-
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
int mb_row, int mb_col);
(void) cpi;
mbmi->mode = DC_PRED;
mbmi->ref_frame[0] = INTRA_FRAME;
- if (use_16x16_pred) {
- mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8;
- vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
- } else {
- mbmi->txfm_size = TX_4X4;
- vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
- }
-
+ mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ?
+ TX_16X16 : TX_8X8) : TX_4X4;
+ vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
return vp9_get_mb_ss(x->plane[0].src_diff);
}
#include "vp9/encoder/vp9_onyx_int.h"
int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
+void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg);
void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
BLOCK_SIZE_TYPE bs);
void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
typedef struct vp9_token_state vp9_token_state;
struct vp9_token_state {
// This function is a place holder for now but may ultimately need
// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int *scan,
- const int *nb,
+static int trellis_get_coeff_context(const int16_t *scan,
+ const int16_t *nb,
int idx, int token,
- uint8_t *token_cache,
- int pad, int l) {
+ uint8_t *token_cache) {
int bak = token_cache[scan[idx]], pt;
token_cache[scan[idx]] = vp9_pt_energy_class[token];
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
+ pt = get_coef_context(nb, token_cache, idx + 1);  // context for idx + 1 with token's class placed at scan[idx]
token_cache[scan[idx]] = bak;
return pt;
}
int best, band, pt;
PLANE_TYPE type = xd->plane[plane].plane_type;
int err_mult = plane_rd_mult[type];
- int default_eob, pad;
- int const *scan, *nb;
+ int default_eob;
+ const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
break;
}
case TX_8X8: {
- const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
+ const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
scan = get_scan_8x8(tx_type);
default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
- const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
+ const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
scan = get_scan_16x16(tx_type);
default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
rdmult = (rdmult * 9) >> 4;
rddiv = mb->rddiv;
- memset(best_index, 0, sizeof(best_index));
/* Initialize the sentinel node of the trellis. */
tokens[eob][0].rate = 0;
tokens[eob][0].error = 0;
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
/* Consider both possible successor states. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
- mb->token_costs_noskip[tx_size][type][ref][band][pt]
- [tokens[next][0].token];
+ mb->token_costs[tx_size][type][ref][0][band][pt]
+ [tokens[next][0].token];
rate1 +=
- mb->token_costs_noskip[tx_size][type][ref][band][pt]
- [tokens[next][1].token];
+ mb->token_costs[tx_size][type][ref][0][band][pt]
+ [tokens[next][1].token];
}
UPDATE_RD_COST();
/* And pick the best. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
if (t0 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
- if (!x)
- rate0 += mb->token_costs[tx_size][type][ref][band][pt][
- tokens[next][0].token];
- else
- rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
- tokens[next][0].token];
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+ rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+ [tokens[next][0].token];
}
if (t1 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
- pad, default_eob);
- if (!x)
- rate1 += mb->token_costs[tx_size][type][ref][band][pt][
- tokens[next][1].token];
- else
- rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
- tokens[next][1].token];
+ pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
+ rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+ [tokens[next][1].token];
}
}
/* Update the cost of each path if we're past the EOB token. */
if (t0 != DCT_EOB_TOKEN) {
tokens[next][0].rate +=
- mb->token_costs[tx_size][type][ref][band][0][t0];
+ mb->token_costs[tx_size][type][ref][1][band][0][t0];
tokens[next][0].token = ZERO_TOKEN;
}
if (t1 != DCT_EOB_TOKEN) {
tokens[next][1].rate +=
- mb->token_costs[tx_size][type][ref][band][0][t1];
+ mb->token_costs[tx_size][type][ref][1][band][0][t1];
tokens[next][1].token = ZERO_TOKEN;
}
+ best_index[i][0] = best_index[i][1] = 0;
/* Don't update next, because we didn't add a new node. */
}
}
error1 = tokens[next][1].error;
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
- rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
- rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
+ rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0];
+ rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1];
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
final_eob = i0 - 1;
foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
}
-struct encode_b_args {
- VP9_COMMON *cm;
- MACROBLOCK *x;
- struct optimize_ctx *ctx;
-};
-
-static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
vp9_short_fdct32x32(src_diff, coeff, bw * 2);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
if (tx_type != DCT_DCT)
vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
else
x->fwd_txm16x16(src_diff, coeff, bw * 2);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
if (tx_type != DCT_DCT)
vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
else
vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
if (tx_type == DCT_DCT)
vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
else
vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
if (tx_type == DCT_DCT)
vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
else
foreach_transformed_block(xd, bsize, encode_block, &arg);
}
-static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
if (plane == 0 &&
mbmi->sb_type < BLOCK_SIZE_SB8X8 &&
mbmi->ref_frame[0] == INTRA_FRAME)
- b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
+ b_mode = xd->mode_info_context->bmi[ib].as_mode;
else
b_mode = mode;
plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+ dst, pd->dst.stride,
dst, pd->dst.stride);
vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
src, p->src.stride, dst, pd->dst.stride);
vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
if (tx_type == DCT_DCT)
vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
else
vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
if (tx_type == DCT_DCT)
vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
else
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
+struct encode_b_args {  // argument bundle received through the void *arg callback parameter
+ VP9_COMMON *cm;  // NOTE(review): not read in the visible hunks - presumably the codec-wide state; confirm
+ MACROBLOCK *x;  // read as args->x by xform_quant() and encode_block_intra()
+ struct optimize_ctx *ctx;  // NOTE(review): not read in the visible hunks - confirm against optimize_block users
+};
+
void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
struct optimize_ctx *ctx);
void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg);
void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
}
}
-static int update_nmv(
- vp9_writer *const bc,
- const unsigned int ct[2],
- vp9_prob *const cur_p,
- const vp9_prob new_p,
- const vp9_prob upd_p) {
+static int update_mv(vp9_writer *bc, const unsigned int ct[2],
+ vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
#ifdef LOW_PRECISION_MV_UPDATE
vp9_prob mod_p = new_p | 1;
}
}
+static void counts_to_nmv_context(
+ nmv_context_counts *nmv_count,  // MV counts; handed to vp9_counts_process() before being read
+ nmv_context *prob,  // out: probabilities derived from the counts
+ int usehp,  // non-zero: also fill the class0_hp / hp entries
+ unsigned int (*branch_ct_joint)[2],  // branch_ct_*: out, branch counts per probability
+ unsigned int (*branch_ct_sign)[2],
+ unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+ unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+ unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+ unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+ unsigned int (*branch_ct_fp)[4 - 1][2],
+ unsigned int (*branch_ct_class0_hp)[2],
+ unsigned int (*branch_ct_hp)[2]) {
+ int i, j, k;
+ vp9_counts_process(nmv_count, usehp);
+ vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
+ prob->joints,
+ branch_ct_joint,
+ nmv_count->joints, 0);
+ for (i = 0; i < 2; ++i) {  // one pass per comps[] entry: sign, classes, class0, bits
+ const uint32_t s0 = nmv_count->comps[i].sign[0];
+ const uint32_t s1 = nmv_count->comps[i].sign[1];
+ // Sign is a two-way decision: probability from the pair of counts.
+ prob->comps[i].sign = get_binary_prob(s0, s1);
+ branch_ct_sign[i][0] = s0;
+ branch_ct_sign[i][1] = s1;
+ vp9_tree_probs_from_distribution(vp9_mv_class_tree,
+ prob->comps[i].classes,
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
+ vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
+ prob->comps[i].class0,
+ branch_ct_class0[i],
+ nmv_count->comps[i].class0, 0);
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ const uint32_t b0 = nmv_count->comps[i].bits[j][0];
+ const uint32_t b1 = nmv_count->comps[i].bits[j][1];
+ // Each offset bit gets its own binary probability and count pair.
+ prob->comps[i].bits[j] = get_binary_prob(b0, b1);
+ branch_ct_bits[i][j][0] = b0;
+ branch_ct_bits[i][j][1] = b1;
+ }
+ }
+ for (i = 0; i < 2; ++i) {  // class0_fp and fp, both built with vp9_mv_fp_tree
+ for (k = 0; k < CLASS0_SIZE; ++k) {
+ vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+ prob->comps[i].class0_fp[k],
+ branch_ct_class0_fp[i][k],
+ nmv_count->comps[i].class0_fp[k], 0);
+ }
+ vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+ prob->comps[i].fp,
+ branch_ct_fp[i],
+ nmv_count->comps[i].fp, 0);
+ }
+ if (usehp) {  // class0_hp / hp outputs are left untouched when usehp is 0
+ for (i = 0; i < 2; ++i) {
+ const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
+ const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
+ const uint32_t hp0 = nmv_count->comps[i].hp[0];
+ const uint32_t hp1 = nmv_count->comps[i].hp[1];
+ // class0_hp: binary probability plus its two branch counts.
+ prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
+ branch_ct_class0_hp[i][0] = c0_hp0;
+ branch_ct_class0_hp[i][1] = c0_hp1;
+ // hp: handled the same way as class0_hp.
+ prob->comps[i].hp = get_binary_prob(hp0, hp1);
+ branch_ct_hp[i][0] = hp0;
+ branch_ct_hp[i][1] = hp1;
+ }
+ }
+}
+
+
void print_nmvcounts(nmv_context_counts tnmvcounts) {
int i, j, k;
printf("\nCounts =\n { ");
unsigned int branch_ct_class0_hp[2][2];
unsigned int branch_ct_hp[2][2];
int i, j, k;
- vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
+ counts_to_nmv_context(&tnmvcounts, &prob, 1,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
printf("\nCounts =\n { ");
for (j = 0; j < MV_JOINTS; ++j)
unsigned int branch_ct_fp[2][4 - 1][2];
unsigned int branch_ct_class0_hp[2][2];
unsigned int branch_ct_hp[2][2];
+ nmv_context *mvc = &cpi->common.fc.nmvc;
+
#ifdef MV_GROUP_UPDATE
int savings = 0;
#endif
if (!cpi->dummy_packing)
add_nmvcount(&tnmvcounts, &cpi->NMVcount);
#endif
- vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
+ counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
/* write updates if they help */
#ifdef MV_GROUP_UPDATE
for (j = 0; j < MV_JOINTS - 1; ++j) {
vp9_write_bit(bc, 1);
#endif
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- update_nmv(bc, branch_ct_joint[j],
- &cpi->common.fc.nmvc.joints[j],
- prob.joints[j],
- VP9_NMV_UPDATE_PROB);
- }
+ for (j = 0; j < MV_JOINTS - 1; ++j)
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
+ VP9_NMV_UPDATE_PROB);
+
for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_sign[i],
- &cpi->common.fc.nmvc.comps[i].sign,
- prob.comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- update_nmv(bc, branch_ct_classes[i][j],
- &cpi->common.fc.nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- update_nmv(bc, branch_ct_class0[i][j],
- &cpi->common.fc.nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- update_nmv(bc, branch_ct_bits[i][j],
- &cpi->common.fc.nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
+ prob.comps[i].sign, VP9_NMV_UPDATE_PROB);
+ for (j = 0; j < MV_CLASSES - 1; ++j)
+ update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
+ prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+
+ for (j = 0; j < CLASS0_SIZE - 1; ++j)
+ update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
+ prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
+ prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB);
}
+
for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j) {
int k;
- for (k = 0; k < 3; ++k) {
- update_nmv(bc, branch_ct_class0_fp[i][j][k],
- &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (j = 0; j < 3; ++j) {
- update_nmv(bc, branch_ct_fp[i][j],
- &cpi->common.fc.nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
+ for (k = 0; k < 3; ++k)
+ update_mv(bc, branch_ct_class0_fp[i][j][k],
+ &mvc->comps[i].class0_fp[j][k],
+ prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
}
+
+ for (j = 0; j < 3; ++j)
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
+ prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB);
}
+
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_class0_hp[i],
- &cpi->common.fc.nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- update_nmv(bc, branch_ct_hp[i],
- &cpi->common.fc.nmvc.comps[i].hp,
- prob.comps[i].hp,
- VP9_NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
+ prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
+ prob.comps[i].hp, VP9_NMV_UPDATE_PROB);
}
}
}
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+ const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp) {
const MV diff = {mv->row - ref->row,
mv->col - ref->col};
if (mv_joint_horizontal(j))
encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled and this is an ARF / non-shown frame,
+ // keep track of the largest motion vector component used.
+ if (cpi->sf.auto_mv_step_size && !cpi->common.show_frame) {
+ cpi->max_mv_magnitude = MAX((MAX(abs(mv->row), abs(mv->col)) >> 3),
+ cpi->max_mv_magnitude);
+ }
}
void vp9_build_nmv_cost_table(int *mvjoint,
if (pi->bmi[i].mode == NEWMV) {
mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+ x->e_mbd.allow_high_precision_mv);
if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
mv.row = pi->bmi[i].second_mv.as_mv.row -
second_best_ref_mv->as_mv.row;
mv.col = pi->bmi[i].second_mv.as_mv.col -
second_best_ref_mv->as_mv.col;
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+ x->e_mbd.allow_high_precision_mv);
}
}
}
}
} else if (mbmi->mode == NEWMV) {
- mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+ mv.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
+ mv.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
+ vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
x->e_mbd.allow_high_precision_mv);
if (mbmi->ref_frame[1] > INTRA_FRAME) {
- mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ mv.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
+ mv.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
+ vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+ x->e_mbd.allow_high_precision_mv);
}
}
}
void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp);
void vp9_build_nmv_cost_table(int *mvjoint,
vp9_clear_system_state(); // __asm emms;
vp9_setup_src_planes(x, cpi->Source, 0, 0);
- setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL);
+ setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
x->partition_info = x->pi;
xd->mode_info_context = cm->mi;
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+ setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_frame_init_quantizer(cpi);
int_mv ref_full;
// Further step/diamond searches as necessary
- int step_param = cpi->sf.first_step +
+ int step_param = cpi->sf.reduce_first_step_size +
(cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
+ step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
vp9_clamp_mv_min_max(x, ref_mv);
int *arf_not_zz;
- CHECK_MEM_ERROR(arf_not_zz,
- vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+ CHECK_MEM_ERROR(cm, arf_not_zz,
+ vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz),
+ 1));
// We are not interested in results beyond the alt ref itself.
if (n_frames > cpi->frames_till_gf_update_due)
x->mv_row_max = row_max;
}
-int vp9_init_search_range(int width, int height) {
+int vp9_init_search_range(VP9_COMP *cpi, int size) {
int sr = 0;
- int frm = MIN(width, height);
- while ((frm << sr) < MAX_FULL_PEL_VAL)
+ // Enforce a minimum search size of 16, whatever value was passed in.
+ size = MAX(16, size);
+ // Find the first shift sr at which (size << sr) reaches MAX_FULL_PEL_VAL.
+ while ((size << sr) < MAX_FULL_PEL_VAL)
sr++;
if (sr)
sr--;
+ sr += cpi->sf.reduce_first_step_size;  // offset taken from the speed features
+ sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));  // upper bound on the result
return sr;
}
int offset;
int usehp = xd->allow_high_precision_mv;
- uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
- vpx_free(comp_pred);
-
if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- /* Compound pred buffer */
- uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
-
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
/* Get compound pred by averaging two pred blocks. */
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-
- bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
+ second_pred, 0x7fffffff) +
mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
best_address;
/* Get compound block and use it to calculate SAD. */
- comp_avg_pred(comp_pred, second_pred, w, h, check_here,
- in_what_stride);
- thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+ thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
+ second_pred, bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = ref_mv->as_mv.col << 3;
if (bestsad < INT_MAX) {
- int besterr;
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
- besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
- (unsigned int *)(&thissad)) +
+ // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
+ // so we don't have to use the subpixel with xoff=0,yoff=0 here.
+ int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
+ what, what_stride, (unsigned int *)(&thissad),
+ second_pred) +
mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
xd->allow_high_precision_mv);
- vpx_free(comp_pred);
return besterr;
} else {
- vpx_free(comp_pred);
return INT_MAX;
}
}
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-int vp9_init_search_range(int width, int height);
-
int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
int *mvcost[2], int weight, int ishp);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
-// Runs sequence of diamond searches in smaller steps for RD
struct VP9_COMP;
+int vp9_init_search_range(struct VP9_COMP *cpi, int size);
+
+// Runs sequence of diamond searches in smaller steps for RD
int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
int_mv *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
for (i = 0; i < MAX_MODES; ++i)
sf->thresh_mult[i] = mode == 0 ? -500 : 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_ZEROMV] = 0;
+ sf->thresh_mult[THR_ZEROG] = 0;
+ sf->thresh_mult[THR_ZEROA] = 0;
sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARESTG] = 0;
+ sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000;
sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_TM] += speed_multiplier * 1000;
sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000;
sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000;
sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500;
sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500;
sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500;
- sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 1500;
sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;
sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEARGA ] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500;
sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000;
sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000;
sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
}
+
+ if (sf->disable_splitmv == 1) {
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+
+ sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
+ }
}
void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->mode_chosen_counts[i] = 0;
}
+ // Initialize cpi->max_mv_magnitude if appropriate.
+ if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only ||
+ (cpi->common.show_frame == 0)) {
+ cpi->max_mv_magnitude = 0;
+ }
+
// best quality defaults
sf->RD = 1;
sf->search_method = NSTEP;
sf->half_pixel_search = 1;
sf->iterative_sub_pixel = 1;
sf->optimize_coefficients = !cpi->oxcf.lossless;
- sf->first_step = 0;
+ sf->reduce_first_step_size = 0;
+ sf->auto_mv_step_size = 0;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
- sf->adpative_rd_thresh = 0;
+ sf->adaptive_rd_thresh = 0;
sf->use_lastframe_partitioning = 0;
- sf->use_largest_txform = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
sf->use_8tap_always = 0;
sf->use_avoid_tested_higherror = 0;
+ sf->reference_masking = 0;
sf->skip_lots_of_modes = 0;
sf->adjust_thresholds_by_speed = 0;
sf->partition_by_variance = 0;
sf->use_one_partition_size_always = 0;
+ sf->less_rectangular_check = 0;
+ sf->use_square_partition_only = 0;
sf->use_partitions_less_than = 0;
sf->less_than_block_size = BLOCK_SIZE_MB16X16;
sf->use_partitions_greater_than = 0;
sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->last_partitioning_redo_frequency = 4;
+ sf->disable_splitmv = 0;
+ sf->mode_search_skip_flags = 0;
+
+ // Skip any mode not chosen at size < X for all sizes > X.
+ // Hence the default of BLOCK_SIZE_SB64X64 means skipping is off.
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64;
#if CONFIG_MULTIPLE_ARF
// Switch segmentation off.
#else
sf->static_segmentation = 0;
#endif
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
- sf->adpative_rd_thresh = 1;
+ sf->auto_mv_step_size = 1;
+ sf->use_avoid_tested_higherror = 1;
+ sf->adaptive_rd_thresh = 1;
+
if (speed == 1) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
- sf->optimize_coefficients = 0;
- sf->first_step = 1;
- sf->use_avoid_tested_higherror = 1;
- sf->adjust_thresholds_by_speed = 1;
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0);
+ sf->disable_splitmv =
+ (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA;
}
if (speed == 2) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->adjust_thresholds_by_speed = 1;
+ sf->less_rectangular_check = 1;
+ sf->use_square_partition_only = 1;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
sf->use_lastframe_partitioning = 1;
- sf->first_step = 0;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32;
+ sf->reduce_first_step_size = 1;
+ sf->optimize_coefficients = 0;
+ // sf->reference_masking = 1;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
}
if (speed == 3) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
sf->partition_by_variance = 1;
- sf->first_step = 0;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->reduce_first_step_size = 1;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
}
if (speed == 4) {
- sf->first_step = 0;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_SIZE_MB16X16;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
}
+ /*
if (speed == 2) {
sf->first_step = 0;
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
sf->use_partitions_greater_than = 1;
sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
}
+ */
- break;
+ break;
}; /* switch */
// Set rd thresholds based on mode and speed setting
- if(cpi->sf.adjust_thresholds_by_speed)
+ if (cpi->sf.adjust_thresholds_by_speed)
set_rd_speed_thresholds(cpi, mode, speed);
else
set_rd_speed_thresholds(cpi, mode, 0);
static int alloc_partition_data(VP9_COMP *cpi) {
vpx_free(cpi->mb.pip);
- cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) *
- (cpi->common.mi_rows + 64 / MI_SIZE),
+ cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
+ (cpi->common.mi_rows + MI_BLOCK_SIZE),
sizeof(PARTITION_INFO));
if (!cpi->mb.pip)
return 1;
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
// Data used for real time vc mode to see if gf needs refreshing
cpi->gf_update_recommended = 0;
vpx_free(cpi->mb_activity_map);
- CHECK_MEM_ERROR(cpi->mb_activity_map,
+ CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
vpx_free(cpi->mb_norm_activity_map);
- CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
+ CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
}
return 0;
}
- cpi->common.error.setjmp = 1;
+ cm->error.setjmp = 1;
- CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+ CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
+ (MAX_MVSEARCH_STEPS * 8) + 1));
- vp9_create_common(&cpi->common);
+ vp9_create_common(cm);
init_config((VP9_PTR)cpi, oxcf);
- cpi->common.current_video_frame = 0;
+ cm->current_video_frame = 0;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
cpi->frames_till_gf_update_due = 0;
cpi->non_gf_bitrate_adjustment = 0;
// Set reference frame sign bias for ALTREF frame to 1 (for now)
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->gold_is_alt = 0;
// Create the encoder segmentation map and set all entries to 0
- CHECK_MEM_ERROR(cpi->segmentation_map,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a copy in common for temporal coding
- CHECK_MEM_ERROR(cm->last_frame_seg_map,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cm->last_frame_seg_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a place holder structure is the coding context
// for use if we want to save and restore it
- CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
- CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
- vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+ CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1));
+ vpx_memset(cpi->active_map, 1, cm->MBs);
cpi->active_map_enabled = 0;
for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0])); i++) {
- CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
- vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
- sizeof(*cpi->mbgraph_stats[i].mb_stats),
- 1));
+ CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+ vpx_calloc(cm->MBs *
+ sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
#ifdef ENTROPY_STATS
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
-#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
+ SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
- BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+ BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+ vp9_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
- BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+ BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+ vp9_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
- BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+ BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+ vp9_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
- BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+ BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+ vp9_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+ BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+ vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
- BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+ BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+ vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+ BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+ vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+ BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+ vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+ BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+ vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+ BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+ vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
- BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+ BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+ vp9_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, NULL,
NULL, NULL, vp9_sad8x4x8,
vp9_sad8x4x4d)
- BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+ BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+ vp9_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, NULL,
NULL, NULL, vp9_sad4x8x8,
vp9_sad4x8x4d)
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+ BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+ vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
struct vpx_codec_cx_pkt pkt;
uint64_t sse;
int i;
- unsigned int width = cpi->common.width;
- unsigned int height = cpi->common.height;
+ unsigned int width = orig->y_crop_width;
+ unsigned int height = orig->y_crop_height;
pkt.kind = VPX_CODEC_PSNR_PKT;
sse = calc_plane_error(orig->y_buffer, orig->y_stride,
pkt.data.psnr.samples[0] = width * height;
pkt.data.psnr.samples[1] = width * height;
- width = orig->uv_width;
- height = orig->uv_height;
+ width = orig->uv_crop_width;
+ height = orig->uv_crop_height;
sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
recon->u_buffer, recon->uv_stride,
}
-void vp9_select_interp_filter_type(VP9_COMP *cpi) {
- int i;
- int high_filter_index = 0;
- unsigned int thresh;
- unsigned int high_count = 0;
- unsigned int count_sum = 0;
- unsigned int *hist = cpi->best_switchable_interp_count;
-
- if (DEFAULT_INTERP_FILTER != SWITCHABLE) {
- cpi->common.mcomp_filter_type = DEFAULT_INTERP_FILTER;
- return;
- }
-
- // TODO(agrange): Look at using RD criteria to select the interpolation
- // filter to use for the next frame rather than this simpler counting scheme.
-
- // Select the interpolation filter mode for the next frame
- // based on the selection frequency seen in the current frame.
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- unsigned int count = hist[i];
- count_sum += count;
- if (count > high_count) {
- high_count = count;
- high_filter_index = i;
- }
- }
-
- thresh = (unsigned int)(0.80 * count_sum);
-
- if (high_count > thresh) {
- // One filter accounts for 80+% of cases so force the next
- // frame to use this filter exclusively using frame-level flag.
- cpi->common.mcomp_filter_type = vp9_switchable_interp[high_filter_index];
- } else {
- // Use a MB-level switchable filter selection strategy.
- cpi->common.mcomp_filter_type = SWITCHABLE;
- }
-}
-
static void scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i;
cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
}
+static void full_to_model_count(unsigned int *model_count,
+ unsigned int *full_count) {
+ int n;
+ model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+ model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+ model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+ for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
+ model_count[TWO_TOKEN] += full_count[n];
+ model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
+}
+
+static void full_to_model_counts(
+ vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+ int i, j, k, l;
+ for (i = 0; i < BLOCK_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ if (l >= 3 && k == 0)
+ continue;
+ full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
+ }
+}
+
+
static void encode_frame_to_data_rate(VP9_COMP *cpi,
unsigned long *size,
unsigned char *dest,
update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- vp9_full_to_model_counts(cpi->common.fc.coef_counts[t],
- cpi->coef_counts[t]);
+ full_to_model_counts(cpi->common.fc.coef_counts[t],
+ cpi->coef_counts[t]);
if (!cpi->common.error_resilient_mode &&
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(&cpi->common);
if (cm->show_frame) {
vpx_memcpy(cm->prev_mip, cm->mip,
- cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+ cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) *
sizeof(MODE_INFO));
} else {
vpx_memset(cm->prev_mip, 0,
- cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+ cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) *
sizeof(MODE_INFO));
}
// restore prev_mi
vp9_second_pass(cpi);
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
- //vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
+ // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
#ifdef DISABLE_RC_LONG_TERM_MEM
cpi->twopass.bits_left -= cpi->this_frame_target;
#else
double sq_error;
ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride, orig->y_width,
- orig->y_height);
+ recon->y_buffer, recon->y_stride,
+ orig->y_crop_width, orig->y_crop_height);
ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
+ recon->u_buffer, recon->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
+ recon->v_buffer, recon->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
sq_error = ye + ue + ve;
vp9_clear_system_state();
ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
- pp->y_buffer, pp->y_stride, orig->y_width,
- orig->y_height);
+ pp->y_buffer, pp->y_stride,
+ orig->y_crop_width, orig->y_crop_height);
ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
- pp->u_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
+ pp->u_buffer, pp->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
- pp->v_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
+ pp->v_buffer, pp->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
sq_error = ye + ue + ve;
HEX = 2
} SEARCH_METHODS;
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_LARGESTINTRA,
+ USE_LARGESTINTRA_MODELINTER,
+ USE_LARGESTALL
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ // Values should be powers of 2 so that they can be selected as bits of
+ // an integer flags field
+
+ // terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1,
+
+ // skips comp inter modes if the best so far is an intra mode
+ FLAG_SKIP_COMP_BESTINTRA = 2,
+
+ // skips comp inter modes if the best single intermode so far does
+ // not have the same reference as one of the two references being
+ // tested
+ FLAG_SKIP_COMP_REFMISMATCH = 4,
+
+ // skips oblique intra modes if the best so far is an inter mode
+ FLAG_SKIP_INTRA_BESTINTER = 8,
+
+ // skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions
+ FLAG_SKIP_INTRA_DIRMISMATCH = 16,
+} MODE_SEARCH_SKIP_LOGIC;
+
typedef struct {
int RD;
SEARCH_METHODS search_method;
int quarter_pixel_search;
int thresh_mult[MAX_MODES];
int max_step_search_steps;
- int first_step;
+ int reduce_first_step_size;
+ int auto_mv_step_size;
int optimize_coefficients;
int search_best_filter;
int static_segmentation;
int comp_inter_joint_search_thresh;
- int adpative_rd_thresh;
+ int adaptive_rd_thresh;
+ int skip_encode_sb;
int use_lastframe_partitioning;
- int use_largest_txform;
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
int use_8tap_always;
int use_avoid_tested_higherror;
int skip_lots_of_modes;
int adjust_thresholds_by_speed;
int partition_by_variance;
int use_one_partition_size_always;
+ int less_rectangular_check;
+ int use_square_partition_only;
+ int unused_mode_skip_lvl;
+ int reference_masking;
BLOCK_SIZE_TYPE always_this_block_size;
int use_partitions_greater_than;
BLOCK_SIZE_TYPE greater_than_block_size;
int use_partitions_less_than;
BLOCK_SIZE_TYPE less_than_block_size;
+ int adjust_partitioning_from_last_frame;
+ int last_partitioning_redo_frequency;
+ int disable_splitmv;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_LOGIC enum
+ unsigned int mode_search_skip_flags;
} SPEED_FEATURES;
enum BlockSize {
typedef struct VP9_COMP {
- DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
- DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
#if CONFIG_ALPHA
- DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
#endif
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
MACROBLOCK mb;
VP9_COMMON common;
unsigned int mode_check_freq[MAX_MODES];
unsigned int mode_test_hit_counts[MAX_MODES];
unsigned int mode_chosen_counts[MAX_MODES];
+ int64_t unused_mode_skip_mask;
+ int ref_frame_mask;
+ int set_ref_frame_mask;
int rd_thresh_mult[MAX_MODES];
int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES];
int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES];
int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
+ // FIXME(rbultje) int64_t?
int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
unsigned int comp_ref_count[REF_CONTEXTS][2];
- // FIXME contextualize
-
int64_t rd_tx_select_diff[NB_TXFM_MODES];
+ // FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][NB_TXFM_MODES];
+ int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1];
+
int RDMULT;
int RDDIV;
SPEED_FEATURES sf;
int error_bins[1024];
+ unsigned int max_mv_magnitude;
+
// Data used for real time conferencing mode to help determine if it would be good to update the gf
int inter_zz_count;
int gf_bad_count;
unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS];
- unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
+
+ unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB];
int initial_width;
int initial_height;
extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
-
#endif // VP9_ENCODER_VP9_ONYX_INT_H_
extern int enc_debug;
#endif
-static INLINE int plane_idx(int plane) {
- return plane == 0 ? 0 :
- plane == 1 ? 16 : 20;
-}
-
-static void quantize(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- uint8_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr,
- const int *scan, int mul) {
+void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+ int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int zero_flag = n_coeffs;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
// Pre-scan pass
for (i = n_coeffs - 1; i >= 0; i--) {
rc = scan[i];
- z = coeff_ptr[rc] * mul;
+ z = coeff_ptr[rc];
if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
zero_flag--;
// skippable. Note: zero_flag can be zero.
for (i = 0; i < zero_flag; i++) {
rc = scan[i];
- z = coeff_ptr[rc] * mul;
+ z = coeff_ptr[rc];
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
- zero_run += (zero_run < 15);
+ zbin = (zbins[rc != 0]);
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz;
if (x >= zbin) {
x += (round_ptr[rc != 0]);
- y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
- >> quant_shift_ptr[rc != 0]; // quantize (x)
+ y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
if (y) {
eob = i; // last nonzero coeffs
- zero_run = 0; // set zero_run
}
}
}
}
// This function works well for large transform size.
-static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
- int16_t *quant_ptr, uint8_t *quant_shift_ptr,
+ int16_t *quant_ptr, int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr, const int *scan, int mul,
- int *idx_arr) {
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
int i, rc, eob;
- int zbins[2], pzbins[2], nzbins[2], zbin;
+ int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int idx = 0;
- int pre_idx = 0;
+ int idx_arr[1024];
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
// Base ZBIN
zbins[0] = zbin_ptr[0] + zbin_oq_value;
zbins[1] = zbin_ptr[1] + zbin_oq_value;
- // Positive and negative ZBIN
- pzbins[0] = zbins[0]/mul;
- pzbins[1] = zbins[1]/mul;
- nzbins[0] = pzbins[0] * -1;
- nzbins[1] = pzbins[1] * -1;
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
rc = scan[i];
- z = coeff_ptr[rc];
+ z = coeff_ptr[rc] * 2;
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
- if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+ if (z >= zbins[rc != 0] || z <= nzbins[rc != 0])
idx_arr[idx++] = i;
}
rc = scan[idx_arr[i]];
// Calculate ZBIN
- zero_run += idx_arr[i] - pre_idx;
- if(zero_run > 15) zero_run = 15;
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+ zbin = (zbins[rc != 0]);
- pre_idx = idx_arr[i];
- z = coeff_ptr[rc] * mul;
+ z = coeff_ptr[rc] * 2;
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
if (x >= zbin) {
x += (round_ptr[rc != 0]);
- y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
- >> quant_shift_ptr[rc != 0]; // quantize (x)
+ y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value
if (y) {
eob = idx_arr[i]; // last nonzero coeffs
- zero_run = -1; // set zero_run
}
}
}
}
*eob_ptr = eob + 1;
}
-#if 0
-// Original quantize function
-static void quantize(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- uint8_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr,
- const int *scan, int mul) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
-
- vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
- eob = -1;
-
- if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- rc = scan[i];
- z = coeff_ptr[rc] * mul;
-
- zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
- zero_run += (zero_run < 15);
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += (round_ptr[rc != 0]);
- y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
- >> quant_shift_ptr[rc != 0]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zero_run = 0;
- }
- }
- }
- }
-
- *eob_ptr = eob + 1;
-}
-#endif
void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
TX_TYPE tx_type) {
MACROBLOCKD *const xd = &mb->e_mbd;
- const int mul = n_coeffs == 1024 ? 2 : 1;
- const int *scan;
+ const int16_t *scan, *iscan;
// These contexts may be available in the caller
switch (n_coeffs) {
case 4 * 4:
scan = get_scan_4x4(tx_type);
+ iscan = get_iscan_4x4(tx_type);
break;
case 8 * 8:
scan = get_scan_8x8(tx_type);
+ iscan = get_iscan_8x8(tx_type);
break;
case 16 * 16:
scan = get_scan_16x16(tx_type);
+ iscan = get_iscan_16x16(tx_type);
break;
default:
scan = vp9_default_scan_32x32;
+ iscan = vp9_default_iscan_32x32;
break;
}
// Call different quantization for different transform size.
if (n_coeffs >= 1024) {
// Save index of picked coefficient in pre-scan pass.
- int idx_arr[1024];
-
- quantize_sparse(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, mul, idx_arr);
+ vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ n_coeffs, mb->skip_block,
+ mb->plane[plane].zbin,
+ mb->plane[plane].round,
+ mb->plane[plane].quant,
+ mb->plane[plane].quant_shift,
+ BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+ BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+ xd->plane[plane].dequant,
+ mb->plane[plane].zbin_extra,
+ &xd->plane[plane].eobs[block],
+ scan, iscan);
}
else {
- quantize(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, mul);
+ vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ n_coeffs, mb->skip_block,
+ mb->plane[plane].zbin,
+ mb->plane[plane].round,
+ mb->plane[plane].quant,
+ mb->plane[plane].quant_shift,
+ BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+ BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+ xd->plane[plane].dequant,
+ mb->plane[plane].zbin_extra,
+ &xd->plane[plane].eobs[block],
+ scan, iscan);
}
}
int y_blocks) {
MACROBLOCKD *const xd = &mb->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int *pt_scan = get_scan_4x4(tx_type);
+ const int16_t *scan = get_scan_4x4(tx_type);
+ const int16_t *iscan = get_iscan_4x4(tx_type);
- quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+ vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
16, mb->skip_block,
mb->plane[pb_idx.plane].zbin,
mb->plane[pb_idx.plane].round,
xd->plane[pb_idx.plane].dequant,
mb->plane[pb_idx.plane].zbin_extra,
&xd->plane[pb_idx.plane].eobs[pb_idx.block],
- pt_scan, 1);
+ scan, iscan);
}
-static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
unsigned t;
int l;
t = d;
t >>= 1;
t = 1 + (1 << (16 + l)) / d;
*quant = (int16_t)(t - (1 << 16));
- *shift = l;
+ *shift = 1 << (16 - l);
}
void vp9_init_quantizer(VP9_COMP *cpi) {
#endif
int q;
- static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
- 14, 16, 20, 24, 28, 32, 36, 40 };
-
for (q = 0; q < QINDEX_RANGE; q++) {
int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
int qrounding_factor = 48;
qzbin_factor = 64;
qrounding_factor = 64;
}
+
// dc values
quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.uv_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
#if CONFIG_ALPHA
quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.a_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
#endif
quant_val = vp9_ac_quant(q, 0);
+ invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val);
+ cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+ cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][1] = quant_val;
+
quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+ invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1,
+ quant_uv_val);
+ cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+ cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7;
cpi->common.uv_dequant[q][1] = quant_uv_val;
+
#if CONFIG_ALPHA
quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q);
+ invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1,
+ quant_alpha_val);
+ cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
+ cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7;
cpi->common.a_dequant[q][1] = quant_alpha_val;
#endif
- // all the 4x4 ac values =;
- for (i = 1; i < 16; i++) {
- int rc = vp9_default_scan_4x4[i];
-
- invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
- cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->zrun_zbin_boost_y[q][i] =
- ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
-
- invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
- quant_uv_val);
- cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
- cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
- cpi->zrun_zbin_boost_uv[q][i] =
- ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
+
+ for (i = 2; i < 8; i++) {
+ cpi->y_quant[q][i] = cpi->y_quant[q][1];
+ cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1];
+ cpi->y_zbin[q][i] = cpi->y_zbin[q][1];
+ cpi->y_round[q][i] = cpi->y_round[q][1];
+ cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1];
+
+ cpi->uv_quant[q][i] = cpi->uv_quant[q][1];
+ cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1];
+ cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1];
+ cpi->uv_round[q][i] = cpi->uv_round[q][1];
+ cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1];
#if CONFIG_ALPHA
- invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
- quant_alpha_val);
- cpi->a_zbin[q][rc] =
- ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
- cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
- cpi->zrun_zbin_boost_a[q][i] =
- ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
+ cpi->a_quant[q][i] = cpi->a_quant[q][1];
+ cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1];
+ cpi->a_zbin[q][i] = cpi->a_zbin[q][1];
+ cpi->a_round[q][i] = cpi->a_round[q][1];
+ cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1];
#endif
}
}
x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
x->plane[0].zbin = cpi->y_zbin[qindex];
x->plane[0].round = cpi->y_round[qindex];
- x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
x->plane[0].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
x->plane[i].zbin = cpi->uv_zbin[qindex];
x->plane[i].round = cpi->uv_round[qindex];
- x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
x->plane[i].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
}
x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
x->plane[3].zbin = cpi->a_zbin[qindex];
x->plane[3].round = cpi->a_round[qindex];
- x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
x->plane[3].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
#endif
#include <math.h>
#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_modecont.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/common/vp9_entropymode.h"
#define MAX_RD_THRESH_FREQ_FACT 32
#define MAX_RD_THRESH_FREQ_INC 1
-static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
- vp9_coeff_count (*cnoskip)[BLOCK_TYPES],
+static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
int i, j, k, l;
TX_SIZE t;
for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
vp9_prob probs[ENTROPY_NODES];
vp9_model_to_full_probs(p[t][i][j][k][l], probs);
- vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs,
+ vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs,
vp9_coef_tree);
#if CONFIG_BALANCED_COEFTREE
// Replace the eob node prob with a very small value so that the
// cost approximately equals the cost without the eob node
probs[1] = 1;
- vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree);
+ vp9_cost_tokens((int *)c[t][i][j][1][k][l], probs, vp9_coef_tree);
#else
- vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs,
+ vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs,
vp9_coef_tree);
- assert(c[t][i][j][k][l][DCT_EOB_TOKEN] ==
- cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]);
+ assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] ==
+ c[t][i][j][1][k][l][DCT_EOB_TOKEN]);
#endif
}
}
cpi->rd_threshes[bsize][i] = INT_MAX;
}
cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+ if (cpi->sf.adaptive_rd_thresh)
+ cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+ else
+ cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
} else {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+ if (cpi->sf.adaptive_rd_thresh)
+ cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+ else
+ cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
}
- fill_token_costs(cpi->mb.token_costs,
- cpi->mb.token_costs_noskip,
- cpi->common.fc.coef_probs);
+ fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
vp9_cost_tokens(cpi->mb.partition_cost[i],
cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
&cpi->common.fc.nmvc,
cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+ MB_PREDICTION_MODE m;
+
+ for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
+ cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+ cost_token(vp9_sb_mv_ref_tree,
+ cpi->common.fc.inter_mode_probs[i],
+ vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+ }
+ }
+}
+
+// Maps a block width/height pair to the matching BlockSize enumerator.
+// Only the thirteen pairs handled below are recognised; any other pair trips
+// the assert and yields -1.
+static enum BlockSize get_block_size(int bw, int bh) {
+  switch (bw) {
+    case 4:
+      if (bh == 4) return BLOCK_4X4;
+      if (bh == 8) return BLOCK_4X8;
+      break;
+    case 8:
+      if (bh == 4) return BLOCK_8X4;
+      if (bh == 8) return BLOCK_8X8;
+      if (bh == 16) return BLOCK_8X16;
+      break;
+    case 16:
+      if (bh == 8) return BLOCK_16X8;
+      if (bh == 16) return BLOCK_16X16;
+      if (bh == 32) return BLOCK_16X32;
+      break;
+    case 32:
+      if (bh == 16) return BLOCK_32X16;
+      if (bh == 32) return BLOCK_32X32;
+      if (bh == 64) return BLOCK_32X64;
+      break;
+    case 64:
+      if (bh == 32) return BLOCK_64X32;
+      if (bh == 64) return BLOCK_64X64;
+      break;
+    default:
+      break;
+  }
+
+  // No supported block size has these dimensions.
+  assert(0);
+  return -1;
+}
+
+// Returns the BlockSize whose dimensions equal the width and height that
+// plane `pd` reports for `bsize`.
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  const int plane_bw = plane_block_width(bsize, pd);
+  const int plane_bh = plane_block_height(bsize, pd);
+
+  return get_block_size(plane_bw, plane_bh);
+}
+
+// Piecewise-linear lookup into `tab`, which holds `ntab` samples.
+//   x        - query position; sample i sits at x = i / inv_step.
+//   ntab     - number of entries in tab (must be at least 1).
+//   inv_step - number of table entries per unit of x.
+// Returns the interpolated value. Positions at or beyond the last sample
+// return the last entry; positions at or below zero (and NaN) return the
+// first entry instead of indexing outside the table.
+static double linear_interpolate(double x, int ntab, int inv_step,
+                                 const double *tab) {
+  const double y = x * inv_step;
+
+  // Written as !(y > 0) so that NaN is caught as well as negative values.
+  if (!(y > 0.0))
+    return tab[0];
+
+  // Range-check in floating point before converting to int, so an
+  // out-of-range y is never cast.
+  if (y >= ntab - 1) {
+    return tab[ntab - 1];
+  } else {
+    const int d = (int) y;
+    const double a = y - d;
+    return tab[d] * (1 - a) + tab[d + 1] * a;
+  }
+}
+
+static double model_rate_norm(double x) {
+  // Normalized rate
+  // This function models the rate for a Laplacian source with given
+  // variance when quantized with a uniform quantizer with given stepsize.
+  // The closed form expression is:
+  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+  // and H(x) is the binary entropy function.
+  //
+  // The value is read from rate_tab by linear interpolation at position
+  // x * inv_rate_tab_step; x past the end of the table returns the last entry.
+  // NOTE(review): the table presumably samples Rn(x) at steps of
+  // 1 / inv_rate_tab_step - confirm against the formula above.
+  static const int inv_rate_tab_step = 8;
+  static const double rate_tab[] = {
+    64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
+    2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
+    1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
+    0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
+    0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
+    0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
+    0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
+    0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
+    0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
+    0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
+    0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
+    0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
+    0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
+    0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
+    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
+  };
+  const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
+  // Callers must pass a non-negative ratio.
+  assert(x >= 0.0);
+  return linear_interpolate(x, rate_tab_num, inv_rate_tab_step, rate_tab);
+}
+
+static double model_dist_norm(double x) {
+  // Normalized distortion
+  // This function models the normalized distortion for a Laplacian source
+  // with given variance when quantized with a uniform quantizer with given
+  // stepsize. The closed form expression is:
+  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+  // where x = qpstep / sqrt(variance)
+  // Note the actual distortion is Dn * variance.
+  //
+  // The value is read from dist_tab by linear interpolation at position
+  // x * inv_dist_tab_step; x past the end of the table returns the last entry.
+  // NOTE(review): the table presumably samples Dn(x) at steps of
+  // 1 / inv_dist_tab_step - confirm against the formula above.
+  static const int inv_dist_tab_step = 8;
+  static const double dist_tab[] = {
+    0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
+    0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
+    0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
+    0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
+    0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
+    0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
+    0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
+    0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
+    0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
+    0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
+    0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
+    0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
+    0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
+    0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
+    0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
+  };
+  const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
+  // Callers must pass a non-negative ratio.
+  assert(x >= 0.0);
+  return linear_interpolate(x, dist_tab_num, inv_dist_tab_step, dist_tab);
+}
+
+// Models rate and distortion for a Laplacian source of given variance when
+// quantized with a uniform quantizer of given stepsize. The closed form
+// expressions are in:
+// Hang and Chen, "Source Model for transform video coder and its
+// application - Part I: Fundamental Theory", IEEE Trans. Circ.
+// Sys. for Video Tech., April 1997.
+//   var   - total energy over the n samples (var / n is used as variance).
+//   n     - number of samples.
+//   qstep - quantizer step size.
+//   rate  - receives the modelled rate, scaled by 256.
+//   dist  - receives the modelled distortion.
+// Both outputs are zero when var or n is zero.
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+                                     int *rate, int64_t *dist) {
+  vp9_clear_system_state();
+  if (var != 0 && n != 0) {
+    const double variance = (double) var / n;
+    const double step_ratio = qstep / sqrt(variance);
+    double norm_dist = model_dist_norm(step_ratio);
+    double norm_rate = model_rate_norm(step_ratio);
+
+    // A negative modelled rate is treated as "no bits spent".
+    if (norm_rate < 0) {
+      norm_rate = 0;
+      norm_dist = var;
+    }
+    // Adding 0.5 rounds to nearest when the doubles convert to integers.
+    *rate = (n * norm_rate * 256 + 0.5);
+    *dist = (n * norm_dist * variance + 0.5);
+  } else {
+    *rate = 0;
+    *dist = 0;
+  }
+  vp9_clear_system_state();
+}
+
+// Estimates rate and distortion for block size `bsize` by feeding each plane's
+// prediction SSE into the Laplacian model and summing over all planes.
+//   out_rate_sum - receives the summed modelled rate.
+//   out_dist_sum - receives the summed modelled distortion, shifted left by 4.
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int i, rate_sum = 0;
+  // Accumulate distortion in 64 bits: the per-plane values are int64_t and the
+  // total is shifted left by 4 below, which an int accumulator could truncate
+  // or overflow.
+  int64_t dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+    // Only the SSE output is needed; the variance return value is discarded.
+    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, &sse);
+    // sse works better than var, since there is no dc prediction used
+    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}
+
+// Models plane-0 rate and distortion for `bsize` when it is split into
+// transform blocks of size `tx_size`, running the Laplacian model on the SSE
+// of each transform block.
+//   out_rate_sum - receives the summed modelled rate.
+//   out_dist_sum - receives the summed modelled distortion, shifted left by 4.
+//   out_skip     - set to 1 only if every transform block's modelled rate is
+//                  below 1024, otherwise 0.
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                                 TX_SIZE tx_size,
+                                 MACROBLOCK *x, MACROBLOCKD *xd,
+                                 int *out_rate_sum, int64_t *out_dist_sum,
+                                 int *out_skip) {
+  int t, j, k;
+  enum BlockSize bs;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  int rate_sum = 0;
+  int64_t dist_sum = 0;
+
+  // Pick the variance function and the step (t) matching the transform.
+  switch (tx_size) {
+    case TX_4X4:
+      bs = BLOCK_4X4;
+      t = 4;
+      break;
+    case TX_8X8:
+      bs = BLOCK_8X8;
+      t = 8;
+      break;
+    case TX_16X16:
+      bs = BLOCK_16X16;
+      t = 16;
+      break;
+    case TX_32X32:
+      bs = BLOCK_32X32;
+      t = 32;
+      break;
+    default:
+      // Unknown transform size: flag it in debug builds, and fall back to 4x4
+      // so bs and t are never read uninitialized when asserts are compiled
+      // out.
+      assert(0);
+      bs = BLOCK_4X4;
+      t = 4;
+      break;
+  }
+  assert(bs <= get_block_size(bw, bh));
+  *out_skip = 1;
+  for (j = 0; j < bh; j+=t) {
+    for (k = 0; k < bw; k+=t) {
+      int rate;
+      int64_t dist;
+      unsigned int sse;
+      // Only the SSE output is needed; the variance return value is discarded.
+      (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
+                                p->src.stride,
+                                pd->dst.buf + j * pd->dst.stride + k,
+                                pd->dst.stride, &sse);
+      // sse works better than var, since there is no dc prediction used
+      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+                               &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+      *out_skip &= (rate < 1024);
+    }
   }
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = (dist_sum << 4);
 }
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
- intptr_t block_size) {
+ intptr_t block_size, int64_t *ssz) {
int i;
- int64_t error = 0;
+ int64_t error = 0, sqcoeff = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
error += (unsigned)this_diff * this_diff;
+ sqcoeff += (unsigned) coeff[i] * coeff[i];
}
+ *ssz = sqcoeff;
return error;
}
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int pt;
int c = 0;
- int cost = 0, pad;
- const int *scan, *nb;
+ int cost = 0;
+ const int16_t *scan, *nb;
const int eob = xd->plane[plane].eobs[block];
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
- block, 16);
+ const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
- unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
+ unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
ENTROPY_CONTEXT above_ec, left_ec;
TX_TYPE tx_type = DCT_DCT;
-
const int segment_id = xd->mode_info_context->mbmi.segment_id;
- unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs_noskip[tx_size][type][ref];
-
- int seg_eob, default_eob;
+ int seg_eob;
uint8_t token_cache[1024];
const uint8_t * band_translate;
break;
}
case TX_8X8: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_8x8(xd) : DCT_DCT;
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
scan = get_scan_8x8(tx_type);
break;
}
case TX_16X16: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_16x16(xd) : DCT_DCT;
scan = get_scan_16x16(tx_type);
seg_eob = 256;
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
band_translate = vp9_coefband_trans_8x8plus;
break;
default:
- abort();
+ assert(0);
break;
}
assert(eob <= seg_eob);
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
+ nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
if (eob < seg_eob)
assert(qcoeff_ptr[scan[eob]] == 0);
- {
- for (c = 0; c < eob; c++) {
- int v = qcoeff_ptr[scan[c]];
- int t = vp9_dct_value_tokens_ptr[v].token;
- int band = get_coef_band(band_translate, c);
- if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
-
- if (!c || token_cache[scan[c - 1]]) // do not skip eob
- cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
- else
- cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
- token_cache[scan[c]] = vp9_pt_energy_class[t];
+ if (eob == 0) {
+ // single eob token
+ cost += token_costs[0][0][pt][DCT_EOB_TOKEN];
+ } else {
+ int v, prev_t;
+
+ // dc token
+ v = qcoeff_ptr[0];
+ prev_t = vp9_dct_value_tokens_ptr[v].token;
+ cost += token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+ token_cache[0] = vp9_pt_energy_class[prev_t];
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ const int band = get_coef_band(band_translate, c);
+ int t;
+
+ v = qcoeff_ptr[rc];
+ t = vp9_dct_value_tokens_ptr[v].token;
+ pt = get_coef_context(nb, token_cache, c);
+ cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
+ token_cache[rc] = vp9_pt_energy_class[t];
+ prev_t = t;
}
+
+ // eob token
if (c < seg_eob) {
- if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
- cost += mb->token_costs_noskip[tx_size][type][ref]
- [get_coef_band(band_translate, c)]
- [pt][DCT_EOB_TOKEN];
+ pt = get_coef_context(nb, token_cache, c);
+ cost += token_costs[0][get_coef_band(band_translate, c)][pt]
+ [DCT_EOB_TOKEN];
}
}
return cost;
}
+// Accumulator passed through foreach_transformed_block_in_plane() while the
+// rate/distortion of one plane is evaluated block by block.
+struct rdcost_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  // Working copies of the plane's above/left entropy contexts.
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  // Plane width/height, computed as 1 << (log2 size - subsampling).
+  // NOTE(review): presumably in units of 4x4 blocks - confirm.
+  int bw;
+  int bh;
+  // Running totals: coefficient cost, coefficient error and source energy.
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  // RD cost above which block_yrd_txfm() stops evaluating further blocks.
+  int64_t best_rd;
+  // Set once best_rd has been exceeded; later blocks are then ignored.
+  int skip;
+};
+
+// Per-transform-block callback: adds this block's coefficient error to
+// args->dist and its sum of squared source coefficients to args->sse.
+// Both are scaled down by 4 unless the transform is 32x32.
+//   plane         - plane the block belongs to.
+//   block         - block index used to locate the coefficients.
+//   bsize         - unused here; part of the callback signature.
+//   ss_txfrm_size - log2 scale of the block; 16 << ss_txfrm_size coefficients
+//                   are compared.
+//   arg           - struct rdcost_block_args accumulator.
+static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  // Index by the plane being visited, as rate_block() does, rather than a
+  // hard-coded 0. The result is unchanged when called for plane 0.
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int64_t this_sse;
+  int shift = args->tx_size == TX_32X32 ? 0 : 2;
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                &this_sse) >> shift;
+  args->sse += this_sse >> shift;
+}
+
+// Per-transform-block callback: adds the coefficient cost of `block` in
+// `plane` to args->rate, using the entropy contexts at the block's raster
+// position within the working copies held in `arg`.
+static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args *const args = arg;
+  MACROBLOCKD *const mbd = &args->x->e_mbd;
+  int col, row;
+
+  txfrm_block_to_raster_xy(mbd, bsize, plane, block, args->tx_size * 2, &col,
+                           &row);
+
+  {
+    ENTROPY_CONTEXT *const above = args->t_above + col;
+    ENTROPY_CONTEXT *const left = args->t_left + row;
+
+    args->rate += cost_coeffs(args->cm, args->x, plane, block,
+                              mbd->plane[plane].plane_type, above, left,
+                              args->tx_size, args->bw * args->bh);
+  }
+}
+
+// FIXME(jingning): need to make the rd test of chroma components consistent
+// with that of luma component. this function should be deprecated afterwards.
+//
+// Returns the summed coefficient cost of every transform block of size
+// `tx_size` in `plane`. The plane's entropy contexts are copied first, so the
+// originals are left untouched.
+static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
+                        BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD * const mbd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &mbd->plane[plane];
+  const int bw = 1 << (b_width_log2(bsize) - pd->subsampling_x);
+  const int bh = 1 << (b_height_log2(bsize) - pd->subsampling_y);
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+                                    0, 0, 0, INT64_MAX, 0 };
+
+  vpx_memcpy(&args.t_above, pd->above_context, sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, pd->left_context, sizeof(ENTROPY_CONTEXT) * bh);
+
+  foreach_transformed_block_in_plane(mbd, bsize, plane, rate_block, &args);
+
+  return args.rate;
+}
+
+// Returns the combined coefficient cost of every plane after plane 0.
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int total = 0;
+  int plane = 1;
+
+  while (plane < MAX_MB_PLANE) {
+    total += rdcost_plane(cm, x, plane, bsize, tx_size);
+    plane++;
+  }
+
+  return total;
+}
+
+// Returns the sum of squared differences between the first `block_size`
+// entries of `coeff` and `dqcoeff`, shifted right by `shift` and saturated at
+// INT_MAX.
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
+  int64_t sum = 0;
+  int idx;
+
+  for (idx = 0; idx < block_size; ++idx) {
+    const int delta = coeff[idx] - dqcoeff[idx];
+    // Square in unsigned arithmetic so the product cannot overflow an int.
+    sum += (unsigned)delta * delta;
+  }
+  sum >>= shift;
+
+  if (sum > INT_MAX)
+    return INT_MAX;
+  return (int)sum;
+}
+
+// Returns the plane-0 coefficient error for `bsize`, shifted right by
+// `shift` and saturated at INT_MAX.
+//   sse - receives the sum of squared source coefficients, shifted the same
+//         way.
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                           int shift, int64_t *sse) {
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
+  const int64_t e = vp9_block_error(x->plane[0].coeff,
+                                    x->e_mbd.plane[0].dqcoeff,
+                                    bw * bh, sse) >> shift;
+  *sse >>= shift;
+  // The error is 64-bit but the return type is int: clamp, as block_error()
+  // does, instead of silently truncating large values.
+  return e > INT_MAX ? INT_MAX : (int)e;
+}
+
+// Returns the coefficient error summed over every plane after plane 0,
+// shifted right by `shift`.
+//   sse - receives the summed squared source coefficients of those planes,
+//         shifted the same way.
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                                int shift, int64_t *sse) {
+  int64_t total = 0;
+  int64_t plane_sse;
+  int plane;
+
+  *sse = 0;
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    struct macroblockd_plane *pd = &x->e_mbd.plane[plane];
+    const int plane_bw = plane_block_width(bsize, pd);
+    const int plane_bh = plane_block_height(bsize, pd);
+
+    total += vp9_block_error(x->plane[plane].coeff, pd->dqcoeff,
+                             plane_bw * plane_bh, &plane_sse);
+    *sse += plane_sse;
+  }
+
+  // Scale both totals once, after accumulation.
+  *sse >>= shift;
+  return total >> shift;
+}
+
+// Per-transform-block callback: encodes one block and adds its distortion
+// and rate to `arg`. Once the accumulated RD cost exceeds args->best_rd the
+// totals are set to their maximum values, args->skip is raised and every
+// later block returns immediately.
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args *const args = arg;
+  MACROBLOCK *const mb = args->x;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  struct encode_b_args enc = {args->cm, mb, NULL};
+
+  // A previous block already exceeded the budget.
+  if (args->skip)
+    return;
+
+  if (RDCOST(mb->rdmult, mb->rddiv, args->rate, args->dist) > args->best_rd) {
+    args->skip = 1;
+    args->rate = INT_MAX;
+    args->dist = INT64_MAX;
+    args->sse = INT64_MAX;
+    return;
+  }
+
+  if (mbd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME)
+    xform_quant(plane, block, bsize, ss_txfrm_size, &enc);
+  else
+    encode_block_intra(plane, block, bsize, ss_txfrm_size, &enc);
+
+  dist_block(plane, block, bsize, ss_txfrm_size, args);
+  rate_block(plane, block, bsize, ss_txfrm_size, args);
+}
+
+// Evaluates plane 0 of `bsize` with transform size `tx_size`, visiting every
+// transform block and stopping early once the RD cost exceeds `ref_best_rd`.
+//   rate, distortion, sse - receive the accumulated totals.
+//   skippable             - non-zero when the plane is skippable and the
+//                           evaluation was not cut short.
+// Side effect: stores `tx_size` in the current block's mode info.
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int64_t *distortion,
+                                     int *skippable, int64_t *sse,
+                                     int64_t ref_best_rd,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  struct macroblockd_plane *const luma = &mbd->plane[0];
+  const int bw = 1 << (b_width_log2(bsize) - luma->subsampling_x);
+  const int bh = 1 << (b_height_log2(bsize) - luma->subsampling_y);
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+                                    0, 0, 0, ref_best_rd, 0 };
+
+  mbd->mode_info_context->mbmi.txfm_size = tx_size;
+
+  // Work on copies of the entropy contexts so the originals are preserved.
+  vpx_memcpy(&args.t_above, luma->above_context, sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, luma->left_context, sizeof(ENTROPY_CONTEXT) * bh);
+
+  foreach_transformed_block_in_plane(mbd, bsize, 0, block_yrd_txfm, &args);
+
+  *distortion = args.dist;
+  *rate = args.rate;
+  *sse = args.sse;
+  *skippable = vp9_sby_is_skippable(mbd, bsize) && (!args.skip);
+}
+
+// Selects the largest transform size that both the block size `bs` and the
+// frame's transform mode allow, evaluates plane 0 with it, and stores the
+// resulting SSE in sse[chosen size]. No other transform size is tried.
+static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int *rate, int64_t *distortion,
+                                     int *skip, int64_t *sse,
+                                     BLOCK_SIZE_TYPE bs) {
+  // Largest size the block can hold: one step below 32x32 for each of the
+  // two block-size thresholds that `bs` falls under.
+  const TX_SIZE max_txfm_size = TX_32X32
+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+  VP9_COMMON *const cm = &cpi->common;
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mode_info_context->mbmi;
+  const int mode_allows_32 = cm->txfm_mode == ALLOW_32X32 ||
+                             cm->txfm_mode == TX_MODE_SELECT;
+  const int mode_allows_16 = cm->txfm_mode == ALLOW_16X16 || mode_allows_32;
+  TX_SIZE chosen;
+
+  if (max_txfm_size == TX_32X32 && mode_allows_32)
+    chosen = TX_32X32;
+  else if (max_txfm_size >= TX_16X16 && mode_allows_16)
+    chosen = TX_16X16;
+  else if (cm->txfm_mode != ONLY_4X4)
+    chosen = TX_8X8;
+  else
+    chosen = TX_4X4;
+
+  mbmi->txfm_size = chosen;
+  super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+                           &sse[chosen], INT64_MAX, bs, chosen);
+  cpi->txfm_stepdown_count[0]++;
+}
+
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int (*r)[2], int *rate,
int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t txfm_cache[NB_TXFM_MODES],
- TX_SIZE max_txfm_size) {
+ BLOCK_SIZE_TYPE bs) {
+ const TX_SIZE max_txfm_size = TX_32X32
+ - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
for (n = TX_4X4; n <= max_txfm_size; n++) {
r[n][1] = r[n][0];
+ if (r[n][0] == INT_MAX)
+ continue;
for (m = 0; m <= n - (n == max_txfm_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(tx_probs[m]);
s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_txfm_size; n++) {
+ if (d[n] == INT64_MAX) {
+ rd[n][0] = rd[n][1] = INT64_MAX;
+ continue;
+ }
if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
else
txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
rd[TX_4X4][1] : rd[TX_8X8][1];
-}
-
-static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
- 16 << (bwl + bhl)) >> shift;
-}
-static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- int64_t sum = 0;
- int plane;
-
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- const int subsampling = x->e_mbd.plane[plane].subsampling_x +
- x->e_mbd.plane[plane].subsampling_y;
- sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- 16 << (bwl + bhl - subsampling));
+ if (max_txfm_size == TX_32X32 &&
+ rd[TX_32X32][1] < rd[TX_16X16][1] &&
+ rd[TX_32X32][1] < rd[TX_8X8][1] &&
+ rd[TX_32X32][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[0]++;
+ } else if (max_txfm_size >= TX_16X16 &&
+ rd[TX_16X16][1] < rd[TX_8X8][1] &&
+ rd[TX_16X16][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ } else {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
}
- return sum >> shift;
-}
-
-struct rdcost_block_args {
- VP9_COMMON *cm;
- MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
- TX_SIZE tx_size;
- int bw;
- int bh;
- int cost;
-};
-
-static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
- struct rdcost_block_args* args = arg;
- int x_idx, y_idx;
- MACROBLOCKD * const xd = &args->x->e_mbd;
-
- txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
- &y_idx);
-
- args->cost += cost_coeffs(args->cm, args->x, plane, block,
- xd->plane[plane].plane_type, args->t_above + x_idx,
- args->t_left + y_idx, args->tx_size,
- args->bw * args->bh);
}
-static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- MACROBLOCKD * const xd = &x->e_mbd;
- const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
- const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- const int bw = 1 << bwl, bh = 1 << bhl;
- struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 };
+static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
+ int (*r)[2], int *rate,
+ int64_t *d, int64_t *distortion,
+ int *s, int *skip, int64_t *sse,
+ BLOCK_SIZE_TYPE bs,
+ int *model_used) {
+ const TX_SIZE max_txfm_size = TX_32X32
+ - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+ int64_t rd[TX_SIZE_MAX_SB][2];
+ int n, m;
+ int s0, s1;
+ double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};
+ // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};
- vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
- sizeof(ENTROPY_CONTEXT) * bw);
- vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
- sizeof(ENTROPY_CONTEXT) * bh);
+ const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
- foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args);
+ // for (n = TX_4X4; n <= max_txfm_size; n++)
+ // r[n][0] = (r[n][0] * scale_r[n]);
- return args.cost;
-}
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ r[n][1] = r[n][0];
+ for (m = 0; m <= n - (n == max_txfm_size); m++) {
+ if (m == n)
+ r[n][1] += vp9_cost_zero(tx_probs[m]);
+ else
+ r[n][1] += vp9_cost_one(tx_probs[m]);
+ }
+ }
-static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- int cost = 0, plane;
+ assert(skip_prob > 0);
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ if (s[n]) {
+ rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+ } else {
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+ }
+ }
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ rd[n][0] = (scale_rd[n] * rd[n][0]);
+ rd[n][1] = (scale_rd[n] * rd[n][1]);
}
- return cost;
-}
-static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skippable,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- xd->mode_info_context->mbmi.txfm_size = tx_size;
+ if (max_txfm_size == TX_32X32 &&
+ (cm->txfm_mode == ALLOW_32X32 ||
+ (cm->txfm_mode == TX_MODE_SELECT &&
+ rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+ rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+ rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
+ mbmi->txfm_size = TX_32X32;
+ } else if (max_txfm_size >= TX_16X16 &&
+ (cm->txfm_mode == ALLOW_16X16 ||
+ cm->txfm_mode == ALLOW_32X32 ||
+ (cm->txfm_mode == TX_MODE_SELECT &&
+ rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+ rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
+ mbmi->txfm_size = TX_16X16;
+ } else if (cm->txfm_mode == ALLOW_8X8 ||
+ cm->txfm_mode == ALLOW_16X16 ||
+ cm->txfm_mode == ALLOW_32X32 ||
+ (cm->txfm_mode == TX_MODE_SELECT &&
+ rd[TX_8X8][1] <= rd[TX_4X4][1])) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
- vp9_encode_intra_block_y(cm, x, bsize);
- else
- vp9_xform_quant_sby(cm, x, bsize);
+ if (model_used[mbmi->txfm_size]) {
+ // Actually encode using the chosen mode if a model was used, but do not
+ // update the r, d costs
+ super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+ &sse[mbmi->txfm_size], INT64_MAX,
+ bs, mbmi->txfm_size);
+ } else {
+ *distortion = d[mbmi->txfm_size];
+ *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
+ *skip = s[mbmi->txfm_size];
+ }
- *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
- *rate = rdcost_plane(cm, x, 0, bsize, tx_size);
- *skippable = vp9_sby_is_skippable(xd, bsize);
+ if (max_txfm_size == TX_32X32 &&
+ rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+ rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+ rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[0]++;
+ } else if (max_txfm_size >= TX_16X16 &&
+ rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+ rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ } else {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
+ }
}
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int64_t *distortion,
- int *skip, BLOCK_SIZE_TYPE bs,
- int64_t txfm_cache[NB_TXFM_MODES]) {
+ int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,  // psse: optional out, may be NULL
+ int64_t txfm_cache[NB_TXFM_MODES],
+ int64_t ref_best_rd) {  // only forwarded on the full-RD path (path 3)
VP9_COMMON *const cm = &cpi->common;
int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
- int64_t d[TX_SIZE_MAX_SB];
+ int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];  // sse[] is indexed by transform size
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sby(x, bs);
- if (cpi->sf.use_largest_txform) {
- if (bs >= BLOCK_SIZE_SB32X32) {
- mbmi->txfm_size = TX_32X32;
- } else if (bs >= BLOCK_SIZE_MB16X16) {
- mbmi->txfm_size = TX_16X16;
- } else if (bs >= BLOCK_SIZE_SB8X8) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||  // path 1: delegated to choose_largest_txfm_size()
+ (cpi->sf.tx_size_search_method != USE_FULL_RD &&
+ mbmi->ref_frame[0] == INTRA_FRAME)) {  // intra blocks take path 1 unless full RD is requested
vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
- mbmi->txfm_size);
+ choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, bs);
+ if (psse)
+ *psse = sse[mbmi->txfm_size];  // sse entry for mbmi->txfm_size
return;
}
- if (bs >= BLOCK_SIZE_SB32X32)
- super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- bs, TX_32X32);
- if (bs >= BLOCK_SIZE_MB16X16)
- super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- bs, TX_16X16);
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
- TX_8X8);
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
- TX_4X4);
-
- choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
- skip, txfm_cache,
- TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
- - (bs < BLOCK_SIZE_MB16X16));
+
+ if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&  // path 2: model_rd_for_sb_y_tx() per size
+ mbmi->ref_frame[0] > INTRA_FRAME) {  // reached for inter blocks only
+ int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1};  // all 1: the else-arms below never run as written
+ if (bs >= BLOCK_SIZE_SB32X32) {  // 32x32 is tried only for bs >= SB32X32
+ if (model_used[TX_32X32]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
+ &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32], INT64_MAX,
+ bs, TX_32X32);
+ }
+ }
+ if (bs >= BLOCK_SIZE_MB16X16) {  // 16x16 is tried only for bs >= MB16X16
+ if (model_used[TX_16X16]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
+ &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16], INT64_MAX,
+ bs, TX_16X16);
+ }
+ }
+ if (model_used[TX_8X8]) {  // 8x8 and 4x4 are tried for every bs
+ model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
+ &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], INT64_MAX, bs, TX_8X8);
+ }
+ if (model_used[TX_4X4]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
+ &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], INT64_MAX, bs, TX_4X4);
+ }
+ choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
+ skip, sse, bs, model_used);
+ } else {  // path 3: super_block_yrd_for_txfm() per size, given ref_best_rd
+ if (bs >= BLOCK_SIZE_SB32X32)
+ super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32], ref_best_rd,
+ bs, TX_32X32);
+ if (bs >= BLOCK_SIZE_MB16X16)
+ super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16], ref_best_rd,
+ bs, TX_16X16);
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], ref_best_rd, bs, TX_8X8);
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], ref_best_rd, bs, TX_4X4);
+ choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+ skip, txfm_cache, bs);  // only this path hands txfm_cache to a chooser
+ }
+ if (psse)  // shared exit for paths 2 and 3
+ *psse = sse[mbmi->txfm_size];
+}
+
+static int conditional_skipintra(MB_PREDICTION_MODE mode,  // mode: candidate about to be tried
+ MB_PREDICTION_MODE best_intra_mode) {  // returns 1 = skip mode, 0 = try it
+ if (mode == D117_PRED &&  // D117 is tried only when best is V or D135
+ best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D63_PRED &&  // D63 is tried only when best is V or D45
+ best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D27_PRED &&  // D27 is tried only when best is H or D45
+ best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D153_PRED &&  // D153 is tried only when best is H or D135
+ best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;  // any other mode is never skipped by this test
}
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
int rate = 0;
int64_t distortion;
VP9_COMMON *const cm = &cpi->common;
- const int src_stride = x->plane[0].src.stride;
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
uint8_t *src, *dst;
int16_t *src_diff, *coeff;
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode))
+ continue;
+ }
rate = bmode_costs[mode];
distortion = 0;
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
+ int64_t ssz;
+
block = ib + idy * 2 + idx;
- xd->mode_info_context->bmi[block].as_mode.first = mode;
+ xd->mode_info_context->bmi[block].as_mode = mode;
src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- x->plane[0].src.buf, src_stride);
+ p->src.buf, src_stride);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
- x->plane[0].src_diff);
+ p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
- vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
- dst, xd->plane[0].dst.stride);
+ pd->dst.buf,
+ pd->dst.stride);
+ vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
+ TX_4X4, mode,
+ dst, pd->dst.stride,
+ dst, pd->dst.stride);
vp9_subtract_block(4, 4, src_diff, 8,
src, src_stride,
- dst, xd->plane[0].dst.stride);
+ dst, pd->dst.stride);
tx_type = get_tx_type_4x4(xd, block);
if (tx_type != DCT_DCT) {
ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
tempa + idx, templ + idy, TX_4X4, 16);
- distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
- block, 16), 16) >> 2;
+ distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
+ block, 16),
+ 16, &ssz) >> 2;
if (best_tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
- dst, xd->plane[0].dst.stride, best_tx_type);
+ vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ dst, pd->dst.stride, best_tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
- dst, xd->plane[0].dst.stride);
+ xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ dst, pd->dst.stride);
}
}
for (idx = 0; idx < bw; ++idx) {
block = ib + idy * 2 + idx;
vpx_memcpy(best_dqcoeff[idy * 2 + idx],
- BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+ BLOCK_OFFSET(pd->dqcoeff, block, 16),
sizeof(best_dqcoeff[0]));
}
}
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
block = ib + idy * 2 + idx;
- xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
+ xd->mode_info_context->bmi[block].as_mode = *best_mode;
dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
+ pd->dst.buf,
+ pd->dst.stride);
- vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
- dst, xd->plane[0].dst.stride);
+ vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
+ *best_mode, dst, pd->dst.stride,
+ dst, pd->dst.stride);
// inverse transform
if (best_tx_type != DCT_DCT)
vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
- xd->plane[0].dst.stride, best_tx_type);
+ pd->dst.stride, best_tx_type);
else
xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
- xd->plane[0].dst.stride);
+ pd->dst.stride);
}
}
distortion += d;
tot_rate_y += ry;
- mic->bmi[i].as_mode.first = best_mode;
+ mic->bmi[i].as_mode = best_mode;
for (j = 1; j < bh; ++j)
- mic->bmi[i + j * 2].as_mode.first = best_mode;
+ mic->bmi[i + j * 2].as_mode = best_mode;
for (j = 1; j < bw; ++j)
- mic->bmi[i + j].as_mode.first = best_mode;
+ mic->bmi[i + j].as_mode = best_mode;
if (total_rd >= best_rd)
break;
*Rate = cost;
*rate_y = tot_rate_y;
*Distortion = distortion;
- xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
+ xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode;
return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
}
return best_rd;
}
- for (i = 0; i < NB_TXFM_MODES; i++)
- txfm_cache[i] = INT64_MAX;
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ txfm_cache[i] = INT64_MAX;
+ }
/* Y Search for 32x32 intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
}
x->e_mbd.mode_info_context->mbmi.mode = mode;
- super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
- bsize, local_txfm_cache);
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
+ bsize, local_txfm_cache, best_rd);
+
+ if (this_rate_tokenonly == INT_MAX)
+ continue;
this_rate = this_rate_tokenonly + bmode_costs[mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
*skippable = s;
}
- for (i = 0; i < NB_TXFM_MODES; i++) {
- int64_t adj_rd = this_rd + local_txfm_cache[i] -
- local_txfm_cache[cpi->common.txfm_mode];
- if (adj_rd < txfm_cache[i]) {
- txfm_cache[i] = adj_rd;
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd = this_rd + local_txfm_cache[i] -
+ local_txfm_cache[cpi->common.txfm_mode];
+ if (adj_rd < txfm_cache[i]) {
+ txfm_cache[i] = adj_rd;
+ }
}
}
}
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion,
- int *skippable, BLOCK_SIZE_TYPE bsize,
+ int *skippable, int64_t *sse,  // sse: optional out, may be NULL
+ BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t dummy;  // sink for block_error_sbuv()'s last out-param when sse is NULL
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
vp9_encode_intra_block_uv(cm, x, bsize);
else
vp9_xform_quant_sbuv(cm, x, bsize);
- *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+ sse ? sse : &dummy);  // always pass a valid pointer, even if the caller gave NULL
*rate = rdcost_uv(cm, x, bsize, uv_tx_size);
*skippable = vp9_sbuv_is_skippable(xd, bsize);
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ int64_t *sse, BLOCK_SIZE_TYPE bsize) {  // sse is forwarded unchanged; NULL is allowed
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);  // uv transform size comes from one helper lookup
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sbuv(x, bsize);
- if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_32X32);
- } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_16X16);
- } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_8X8);
- } else {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_4X4);
- }
+ super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
+ uv_txfm_size);  // single call: rate, distortion, skippable (and sse) are filled by the helper
}
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
- &this_distortion, &s, bsize);
+ &this_distortion, &s, NULL, bsize);
this_rate = this_rate_tokenonly +
x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
return best_rd;
}
-int vp9_cost_mv_ref(VP9_COMP *cpi,
- MB_PREDICTION_MODE m,
- const int mode_context) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+static int cost_mv_ref(VP9_COMP *cpi,  // returns the cost of inter mode m, or 0 under segment skip
+ MB_PREDICTION_MODE m,  // asserted below to lie in [NEARESTMV, NEWMV]
+ const int mode_context) {  // first index into x->inter_mode_cost
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
int segment_id = xd->mode_info_context->mbmi.segment_id;
// Dont account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- VP9_COMMON *pc = &cpi->common;
assert(NEARESTMV <= m && m <= NEWMV);
- return cost_token(vp9_sb_mv_ref_tree,
- pc->fc.inter_mode_probs[mode_context],
- vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+ return x->inter_mode_cost[mode_context][m - NEARESTMV];  // table lookup; the table is filled outside this function
} else
return 0;
}
break;
}
- cost = vp9_cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ cost = cost_mv_ref(cpi, this_mode,
+ mbmi->mb_mode_context[mbmi->ref_frame[0]]);
mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
if (mbmi->ref_frame[1] > 0)
int k;
MACROBLOCKD *xd = &x->e_mbd;
BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- int bwl = b_width_log2(bsize), bw = 1 << bwl;
- int bhl = b_height_log2(bsize), bh = 1 << bhl;
+ const int bw = plane_block_width(bsize, &xd->plane[0]);
+ const int bh = plane_block_height(bsize, &xd->plane[0]);
int idx, idy;
const int src_stride = x->plane[0].src.stride;
- uint8_t* const src =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- x->plane[0].src.buf, src_stride);
- int16_t* src_diff =
- raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
- x->plane[0].src_diff);
+ uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ x->plane[0].src.buf,
+ src_stride);
+ int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+ x->plane[0].src_diff);
int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
- uint8_t* const pre =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- xd->plane[0].pre[0].buf,
- xd->plane[0].pre[0].stride);
- uint8_t* const dst =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
+ uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride);
int64_t thisdistortion = 0;
int thisrate = 0;
xd->plane[0].dst.stride,
&xd->mode_info_context->bmi[i].as_mv[0],
&xd->scale_factor[0],
- 4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
+ bw, bh, 0 /* no avg */, &xd->subpix,
+ MV_PRECISION_Q3);
// TODO(debargha): Make this work properly with the
// implicit-compoundinter-weight experiment when implicit
vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
dst, xd->plane[0].dst.stride,
&xd->mode_info_context->bmi[i].as_mv[1],
- &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
- &xd->subpix);
+ &xd->scale_factor[1], bw, bh, 1,
+ &xd->subpix, MV_PRECISION_Q3);
}
- vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+ vp9_subtract_block(bh, bw, src_diff, 8,
src, src_stride,
dst, xd->plane[0].dst.stride);
k = i;
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ for (idy = 0; idy < bh / 4; ++idy) {
+ for (idx = 0; idx < bw / 4; ++idx) {
+ int64_t ssz;
+
k += (idy * 2 + idx);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
x->plane[0].src_diff);
x->quantize_b_4x4(x, k, DCT_DCT, 16);
thisdistortion += vp9_block_error(coeff,
BLOCK_OFFSET(xd->plane[0].dqcoeff,
- k, 16), 16);
+ k, 16), 16, &ssz);
thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
ta + (k & 1),
tl + (k >> 1), TX_4X4, 16);
return r;
}
-static enum BlockSize get_block_size(int bw, int bh) {
- if (bw == 4 && bh == 4)
- return BLOCK_4X4;
-
- if (bw == 4 && bh == 8)
- return BLOCK_4X8;
-
- if (bw == 8 && bh == 4)
- return BLOCK_8X4;
-
- if (bw == 8 && bh == 8)
- return BLOCK_8X8;
-
- if (bw == 8 && bh == 16)
- return BLOCK_8X16;
-
- if (bw == 16 && bh == 8)
- return BLOCK_16X8;
-
- if (bw == 16 && bh == 16)
- return BLOCK_16X16;
-
- if (bw == 32 && bh == 32)
- return BLOCK_32X32;
-
- if (bw == 32 && bh == 16)
- return BLOCK_32X16;
-
- if (bw == 16 && bh == 32)
- return BLOCK_16X32;
-
- if (bw == 64 && bh == 32)
- return BLOCK_64X32;
-
- if (bw == 32 && bh == 64)
- return BLOCK_32X64;
-
- if (bw == 64 && bh == 64)
- return BLOCK_64X64;
-
- assert(0);
- return -1;
-}
-
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
x->plane[0].src.buf =
// adjust src pointers
mi_buf_shift(x, i);
- if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
int rate_mv;
joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
mi_row, mi_col, seg_mvs[i],
}
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
- int mode_index,
- PARTITION_INFO *partition,
- int_mv *ref_mv,
- int_mv *second_ref_mv,
- int64_t comp_pred_diff[NB_PREDICTION_TYPES],
- int64_t txfm_size_diff[NB_TXFM_MODES]) {
+ int mode_index,
+ PARTITION_INFO *partition,
+ int_mv *ref_mv,
+ int_mv *second_ref_mv,
+ int64_t comp_pred_diff[NB_PREDICTION_TYPES],
+ int64_t txfm_size_diff[NB_TXFM_MODES],
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
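+ // FIXME(rbultje) does this memcpy the whole array? sizeof() is applied to
+ // the destination ctx->txfm_rd_diff, not to the txfm_size_diff parameter
+ // (which is a pointer), so it covers the whole array only if that member
+ // is declared as an array -- confirm against its declaration.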
memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+ memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1));
}
static void setup_pred_block(const MACROBLOCKD *xd,
return scaled_ref_frame;
}
-static double linear_interpolate(double x, int ntab, double step,
- const double *tab) {
- double y = x / step;
- int d = (int) y;
- double a = y - d;
- if (d >= ntab - 1)
- return tab[ntab - 1];
- else
- return tab[d] * (1 - a) + tab[d + 1] * a;
-}
-
-static double model_rate_norm(double x) {
- // Normalized rate
- // This function models the rate for a Laplacian source
- // source with given variance when quantized with a uniform quantizer
- // with given stepsize. The closed form expressions are in:
- // Hang and Chen, "Source Model for transform video coder and its
- // application - Part I: Fundamental Theory", IEEE Trans. Circ.
- // Sys. for Video Tech., April 1997.
- static const double rate_tab_step = 0.125;
- static const double rate_tab[] = {
- 256.0000, 4.944453, 3.949276, 3.371593,
- 2.965771, 2.654550, 2.403348, 2.193612,
- 2.014208, 1.857921, 1.719813, 1.596364,
- 1.484979, 1.383702, 1.291025, 1.205767,
- 1.126990, 1.053937, 0.985991, 0.922644,
- 0.863472, 0.808114, 0.756265, 0.707661,
- 0.662070, 0.619287, 0.579129, 0.541431,
- 0.506043, 0.472828, 0.441656, 0.412411,
- 0.384980, 0.359260, 0.335152, 0.312563,
- 0.291407, 0.271600, 0.253064, 0.235723,
- 0.219508, 0.204351, 0.190189, 0.176961,
- 0.164611, 0.153083, 0.142329, 0.132298,
- 0.122945, 0.114228, 0.106106, 0.098541,
- 0.091496, 0.084937, 0.078833, 0.073154,
- 0.067872, 0.062959, 0.058392, 0.054147,
- 0.050202, 0.046537, 0.043133, 0.039971,
- 0.037036, 0.034312, 0.031783, 0.029436,
- 0.027259, 0.025240, 0.023367, 0.021631,
- 0.020021, 0.018528, 0.017145, 0.015863,
- 0.014676, 0.013575, 0.012556, 0.011612,
- 0.010738, 0.009929, 0.009180, 0.008487,
- 0.007845, 0.007251, 0.006701, 0.006193,
- 0.005722, 0.005287, 0.004884, 0.004512,
- 0.004168, 0.003850, 0.003556, 0.003284,
- 0.003032, 0.002800, 0.002585, 0.002386,
- 0.002203, 0.002034, 0.001877, 0.001732,
- 0.001599, 0.001476, 0.001362, 0.001256,
- 0.001159, 0.001069, 0.000987, 0.000910,
- 0.000840, 0.000774, 0.000714, 0.000659,
- 0.000608, 0.000560, 0.000517, 0.000476,
- 0.000439, 0.000405, 0.000373, 0.000344,
- 0.000317, 0.000292, 0.000270, 0.000248,
- 0.000229, 0.000211, 0.000195, 0.000179,
- 0.000165, 0.000152, 0.000140, 0.000129,
- 0.000119, 0.000110, 0.000101, 0.000093,
- 0.000086, 0.000079, 0.000073, 0.000067,
- 0.000062, 0.000057, 0.000052, 0.000048,
- 0.000044, 0.000041, 0.000038, 0.000035,
- 0.000032, 0.000029, 0.000027, 0.000025,
- 0.000023, 0.000021, 0.000019, 0.000018,
- 0.000016, 0.000015, 0.000014, 0.000013,
- 0.000012, 0.000011, 0.000010, 0.000009,
- 0.000008, 0.000008, 0.000007, 0.000007,
- 0.000006, 0.000006, 0.000005, 0.000005,
- 0.000004, 0.000004, 0.000004, 0.000003,
- 0.000003, 0.000003, 0.000003, 0.000002,
- 0.000002, 0.000002, 0.000002, 0.000002,
- 0.000002, 0.000001, 0.000001, 0.000001,
- 0.000001, 0.000001, 0.000001, 0.000001,
- 0.000001, 0.000001, 0.000001, 0.000001,
- 0.000001, 0.000001, 0.000000, 0.000000,
- };
- const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
- assert(x >= 0.0);
- return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab);
-}
-
-static double model_dist_norm(double x) {
- // Normalized distortion
- // This function models the normalized distortion for a Laplacian source
- // source with given variance when quantized with a uniform quantizer
- // with given stepsize. The closed form expression is:
- // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
- // where x = qpstep / sqrt(variance)
- // Note the actual distortion is Dn * variance.
- static const double dist_tab_step = 0.25;
- static const double dist_tab[] = {
- 0.000000, 0.005189, 0.020533, 0.045381,
- 0.078716, 0.119246, 0.165508, 0.215979,
- 0.269166, 0.323686, 0.378318, 0.432034,
- 0.484006, 0.533607, 0.580389, 0.624063,
- 0.664475, 0.701581, 0.735418, 0.766092,
- 0.793751, 0.818575, 0.840761, 0.860515,
- 0.878045, 0.893554, 0.907238, 0.919281,
- 0.929857, 0.939124, 0.947229, 0.954306,
- 0.960475, 0.965845, 0.970512, 0.974563,
- 0.978076, 0.981118, 0.983750, 0.986024,
- 0.987989, 0.989683, 0.991144, 0.992402,
- 0.993485, 0.994417, 0.995218, 0.995905,
- 0.996496, 0.997002, 0.997437, 0.997809,
- 0.998128, 0.998401, 0.998635, 0.998835,
- 0.999006, 0.999152, 0.999277, 0.999384,
- 0.999475, 0.999553, 0.999619, 0.999676,
- 0.999724, 0.999765, 0.999800, 0.999830,
- 0.999855, 0.999877, 0.999895, 0.999911,
- 0.999924, 0.999936, 0.999945, 0.999954,
- 0.999961, 0.999967, 0.999972, 0.999976,
- 0.999980, 0.999983, 0.999985, 0.999988,
- 0.999989, 0.999991, 0.999992, 0.999994,
- 0.999995, 0.999995, 0.999996, 0.999997,
- 0.999997, 0.999998, 0.999998, 0.999998,
- 0.999999, 0.999999, 0.999999, 0.999999,
- 0.999999, 0.999999, 0.999999, 1.000000,
- };
- const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
- assert(x >= 0.0);
- return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab);
-}
-
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
- int *rate, int64_t *dist) {
- // This function models the rate and distortion for a Laplacian
- // source with given variance when quantized with a uniform quantizer
- // with given stepsize. The closed form expression is:
- // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
- // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance)
- vp9_clear_system_state();
- if (var == 0 || n == 0) {
- *rate = 0;
- *dist = 0;
- } else {
- double D, R;
- double s2 = (double) var / n;
- double x = qstep / sqrt(s2);
- // TODO(debargha): Make the modeling functions take (qstep^2 / s2)
- // as argument rather than qstep / sqrt(s2) to obviate the need for
- // the sqrt() operation.
- D = model_dist_norm(x);
- R = model_rate_norm(x);
- if (R < 0) {
- R = 0;
- D = var;
- }
- *rate = (n * R * 256 + 0.5);
- *dist = (n * D * s2 + 0.5);
- }
- vp9_clear_system_state();
-}
-
-static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
- struct macroblockd_plane *pd) {
- return get_block_size(plane_block_width(bsize, pd),
- plane_block_height(bsize, pd));
-}
-
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd,
- int *out_rate_sum, int64_t *out_dist_sum) {
- // Note our transform coeffs are 8 times an orthogonal transform.
- // Hence quantizer step is also 8 times. To get effective quantizer
- // we need to divide by 8 before sending to modeling function.
- unsigned int sse;
- int i, rate_sum = 0;
- int64_t dist_sum = 0;
-
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- struct macroblock_plane *const p = &x->plane[i];
- struct macroblockd_plane *const pd = &xd->plane[i];
-
- // TODO(dkovalev) the same code in get_plane_block_size
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- const enum BlockSize bs = get_block_size(bw, bh);
- int rate;
- int64_t dist;
- cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride, &sse);
- model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
-
- rate_sum += rate;
- dist_sum += dist;
- }
-
- *out_rate_sum = rate_sum;
- *out_dist_sum = dist_sum << 4;
-}
-
static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
int bestsme = INT_MAX;
- int further_steps, step_param = cpi->sf.first_step;
+ int further_steps, step_param;
int sadpb = x->sadperbit16;
int_mv mvp_full;
int ref = mbmi->ref_frame[0];
int_mv ref_mv = mbmi->ref_mvs[ref][0];
- int sr = 0;
const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
int tmp_col_min = x->mv_col_min;
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, NULL);
}
vp9_clamp_mv_min_max(x, &ref_mv);
- sr = vp9_init_search_range(cpi->common.width, cpi->common.height);
+ // Work out the size of the first step in the mv step search.
+ // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+ if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+ step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude);
+ } else {
+ step_param = vp9_init_search_range(
+ cpi, MIN(cpi->common.width, cpi->common.height));
+ }
// mvp_full.as_int = ref_mv[0].as_int;
mvp_full.as_int =
mvp_full.as_mv.col >>= 3;
mvp_full.as_mv.row >>= 3;
- // adjust search range according to sr from mv prediction
- step_param = MAX(step_param, sr);
-
// Further step/diamond searches as necessary
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
// motion search code to be used without additional modifications.
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL, NULL);
}
if (scaled_ref_frame[1]) {
for (i = 0; i < MAX_MB_PLANE; i++)
backup_second_yv12[i] = xd->plane[i].pre[1];
- setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL, NULL);
}
xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
&frame_mv[refs[!id]],
&xd->scale_factor[!id],
pw, ph, 0,
- &xd->subpix);
+ &xd->subpix, MV_PRECISION_Q3);
// Compound motion search on first ref frame.
if (id)
INTERPOLATIONFILTERTYPE *best_filter,
int_mv *frame_mv,
int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES]) {
- const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int64_t *psse, int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
- const enum BlockSize uv_block_size = get_plane_block_size(bsize,
- &xd->plane[1]);
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const int is_comp_pred = (mbmi->ref_frame[1] > 0);
const int num_refs = is_comp_pred ? 2 : 1;
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
int64_t this_rd = 0;
- unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
int interpolating_intpel_seen = 0;
int intpel_mv;
int64_t rd, best_rd = INT64_MAX;
+ int best_needs_copy = 0;
+ uint8_t *orig_dst[MAX_MB_PLANE];
+ int orig_dst_stride[MAX_MB_PLANE];
switch (this_mode) {
int rate_mv;
frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
- if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col, single_newmv, &rate_mv);
} else {
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
+ // Do the first prediction into the destination buffer and the next
+ // prediction into a temporary buffer. Then keep track of which of the
+ // two currently holds the best predictor, and use the other one for
+ // future predictions. At the end, copy from tmp_buf to dst if
+ // necessary.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst[i] = xd->plane[i].dst.buf;
+ orig_dst_stride[i] = xd->plane[i].dst.stride;
+ }
+
/* We don't include the cost of the second reference here, because there
* are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
* words if you present them in that order, the second one is always known
* if the first is known */
- *rate2 += vp9_cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ *rate2 += cost_mv_ref(cpi, this_mode,
+ mbmi->mb_mode_context[mbmi->ref_frame[0]]);
pred_exists = 0;
interpolating_intpel_seen = 0;
// pred error irrespective of whether the filter will be used
if (cpi->sf.use_8tap_always) {
*best_filter = EIGHTTAP;
+ vp9_zero(cpi->rd_filter_cache);
} else {
int i, newbest;
int tmp_rate_sum = 0;
int64_t tmp_dist_sum = 0;
+
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- int rs = 0;
+ int rs, j;
+ int64_t rs_rd;
const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
const int is_intpel_interp = intpel_mv &&
vp9_is_interpolating_filter[filter];
mbmi->interp_filter = filter;
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- rs = get_switchable_rate(cm, x);
+ rs = get_switchable_rate(cm, x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
if (interpolating_intpel_seen && is_intpel_interp) {
- rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ tmp_rate_sum, tmp_dist_sum);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
+ if ((cm->mcomp_filter_type == SWITCHABLE &&
+ i && !best_needs_copy) ||
+ (cm->mcomp_filter_type != SWITCHABLE &&
+ cm->mcomp_filter_type != mbmi->interp_filter)) {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+ xd->plane[j].dst.stride = 64;
+ }
+ } else {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = orig_dst[j];
+ xd->plane[j].dst.stride = orig_dst_stride[j];
+ }
+ }
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
- rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ rate_sum, dist_sum);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
if (!interpolating_intpel_seen && is_intpel_interp) {
tmp_rate_sum = rate_sum;
tmp_dist_sum = dist_sum;
if (newbest) {
best_rd = rd;
*best_filter = mbmi->interp_filter;
+ if (cm->mcomp_filter_type == SWITCHABLE && i &&
+ !(interpolating_intpel_seen && is_intpel_interp))
+ best_needs_copy = !best_needs_copy;
}
if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
(cm->mcomp_filter_type != SWITCHABLE &&
cm->mcomp_filter_type == mbmi->interp_filter)) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
- const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
- int i;
-
- for (i = 0; i < y; i++)
- vpx_memcpy(&tmp_buf[p][64 * i],
- xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
- }
pred_exists = 1;
}
interpolating_intpel_seen |= is_intpel_interp;
}
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
}
// Set the appripriate filter
cm->mcomp_filter_type : *best_filter;
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
if (pred_exists) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
- const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
- int i;
-
- for (i = 0; i < y; i++)
- vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
- &tmp_buf[p][64 * i], x);
+ if (best_needs_copy) {
+ // again temporarily set the buffers to local memory to prevent a memcpy
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+ xd->plane[i].dst.stride = 64;
+ }
}
} else {
// Handles the special case when a filter that is not in the
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
x->skip = 1;
else if (x->encode_breakout) {
+ const enum BlockSize y_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const enum BlockSize uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+
unsigned int var, sse;
- int threshold = (xd->plane[0].dequant[1]
- * xd->plane[0].dequant[1] >> 4);
+ int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4);
+
if (threshold < x->encode_breakout)
threshold = x->encode_breakout;
- var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
- x->plane[0].src.stride,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride,
- &sse);
+ var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+ &sse);
if ((int)sse < threshold) {
unsigned int q2dc = xd->plane[0].dequant[0];
- /* If there is no codeable 2nd order dc
- or a very small uniform pixel change change */
+ // If there is no codeable 2nd order dc
+ // or a very small uniform pixel change
if ((sse - var < q2dc * q2dc >> 4) ||
(sse / 2 > var && sse - var < 64)) {
// Check u and v to make sure skip is ok
int sse2;
unsigned int sse2u, sse2v;
- var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
- x->plane[1].src.stride,
- xd->plane[1].dst.buf,
- xd->plane[1].dst.stride, &sse2u);
- var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
- x->plane[1].src.stride,
- xd->plane[2].dst.buf,
- xd->plane[1].dst.stride, &sse2v);
+ var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+ x->plane[1].src.stride,
+ xd->plane[1].dst.buf,
+ xd->plane[1].dst.stride, &sse2u);
+ var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+ x->plane[2].src.stride,
+ xd->plane[2].dst.buf,
+ xd->plane[2].dst.stride, &sse2v);
sse2 = sse2u + sse2v;
if (sse2 * 2 < threshold) {
*distortion = sse + sse2;
*rate2 = 500;
- /* for best_yrd calculation */
+ // for best_yrd calculation
*rate_uv = 0;
*distortion_uv = sse2;
if (!x->skip) {
int skippable_y, skippable_uv;
+ int64_t sseuv = INT_MAX;
// Y cost and distortion
- super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
- bsize, txfm_cache);
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+ bsize, txfm_cache, ref_best_rd);
+
+ if (*rate_y == INT_MAX) {
+ *rate2 = INT_MAX;
+ *distortion = INT64_MAX;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
+ }
*rate2 += *rate_y;
*distortion += *distortion_y;
super_block_uvrd(cm, x, rate_uv, distortion_uv,
- &skippable_uv, bsize);
+ &skippable_uv, &sseuv, bsize);
+ *psse += sseuv;
*rate2 += *rate_uv;
*distortion += *distortion_uv;
*skippable = skippable_y && skippable_uv;
}
}
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+
return this_rd; // if 0, this will be re-calculated by caller
}
int rate4x4_y, rate4x4_y_tokenonly;
int64_t dist4x4_y;
int64_t err4x4 = INT64_MAX;
- int i;
vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
ctx->skip = 0;
vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
} else {
+ int i;
*returnrate = rate_y + rate_uv +
vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
*returndist = dist_y + (dist_uv >> 2);
- for (i = 0; i < NB_TXFM_MODES; i++) {
- ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ }
}
xd->mode_info_context->mbmi.txfm_size = txfm_size;
xd->mode_info_context->mbmi.mode = mode;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
MB_PREDICTION_MODE this_mode;
- MB_PREDICTION_MODE best_mode = DC_PRED;
MV_REFERENCE_FRAME ref_frame;
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred, i;
int64_t best_txfm_diff[NB_TXFM_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
MB_MODE_INFO best_mbmode;
int j;
int mode_index, best_mode_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
- int64_t best_overall_rd = INT64_MAX;
- INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
+ int64_t best_intra_rd = INT64_MAX;
+ int64_t best_inter_rd = INT64_MAX;
+ MB_PREDICTION_MODE best_intra_mode = DC_PRED;
+ // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
+ MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
int64_t dist_uv[TX_SIZE_MAX_SB];
int bws = (1 << bwsl) / 4; // mode_info step for subsize
int bhsl = b_height_log2(bsize);
int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ int best_skip2 = 0;
for (i = 0; i < 4; i++) {
int j;
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < NB_TXFM_MODES; i++)
best_txfm_rd[i] = INT64_MAX;
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ best_filter_rd[i] = INT64_MAX;
// Create a mask set to 1 for each frame used by a smaller resolution.
if (cpi->sf.use_avoid_tested_higherror) {
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
+ int early_term = 0;
for (i = 0; i < NB_TXFM_MODES; ++i)
txfm_cache[i] = INT64_MAX;
+ this_mode = vp9_mode_order[mode_index].mode;
+ ref_frame = vp9_mode_order[mode_index].ref_frame;
+
+ // Skip modes that have been masked off but always consider first mode.
+ if ( mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) &&
+ (cpi->unused_mode_skip_mask & (1 << mode_index)) )
+ continue;
+
+ // Skip if the current reference frame has been masked off
+ if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
+ (cpi->ref_frame_mask & (1 << ref_frame)))
+ continue;
+
// Test best rd so far against threshold for trying this mode.
if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
continue;
x->skip = 0;
- this_mode = vp9_mode_order[mode_index].mode;
- ref_frame = vp9_mode_order[mode_index].ref_frame;
if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
if (!(ref_frame_mask & (1 << ref_frame))) {
continue;
}
+ comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame &&
+ vp9_mode_order[mode_index].second_ref_frame != best_inter_ref_frame)
+ continue;
+ }
// TODO(jingning, jkoleszar): scaling reference frame not supported for
// SPLITMV.
if (mbmi->ref_frame[0] > 0 &&
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
- comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
mbmi->mode = this_mode;
mbmi->uv_mode = DC_PRED;
if (this_mode == I4X4_PRED) {
int rate;
+ /*
+ if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
+ continue;
+ */
+
mbmi->txfm_size = TX_4X4;
rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
&distortion_y, INT64_MAX);
txfm_cache[i] = txfm_cache[ONLY_4X4];
} else if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
- bsize, txfm_cache);
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
+ continue;
+ }
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mbmi->mode, best_intra_mode))
+ continue;
+ }
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+ bsize, txfm_cache, best_rd);
+
+ if (rate_y == INT_MAX)
+ continue;
uv_tx = mbmi->txfm_size;
if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
PARTITION_INFO tmp_best_partition;
int pred_exists = 0;
int uv_skippable;
+ if (is_comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame &&
+ vp9_mode_order[mode_index].second_ref_frame !=
+ best_inter_ref_frame)
+ continue;
+ }
this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ?
cpi->rd_threshes[bsize][THR_NEWMV] :
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
for (switchable_filter_index = 0;
switchable_filter_index < VP9_SWITCHABLE_FILTERS;
++switchable_filter_index) {
- int newbest;
+ int newbest, rs;
+ int64_t rs_rd;
mbmi->interp_filter =
vp9_switchable_interp[switchable_filter_index];
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
&skippable,
(int)this_rd_thresh, seg_mvs,
mi_row, mi_col);
- if (cpi->common.mcomp_filter_type == SWITCHABLE) {
- const int rs = get_switchable_rate(cm, x);
- tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
- }
+ cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
+ rs = get_switchable_rate(cm, x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd);
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ tmp_rd += rs_rd;
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
tmp_best_filter = mbmi->interp_filter;
BLOCK_SIZE_SB8X8);
vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+ &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
&mode_excluded, &disable_skip,
&tmp_best_filter, frame_mv[this_mode],
mi_row, mi_col,
- single_newmv);
+ single_newmv, &total_sse, best_rd);
if (this_rd == INT64_MAX)
continue;
}
rate2 += prob_skip_cost;
}
}
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+ this_mode != SPLITMV) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+ PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+ PRED_MBSKIP), 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
- PRED_MBSKIP), 0);
+ PRED_MBSKIP), 0);
rate2 += prob_skip_cost;
}
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
-#if 0
- // Keep record of best intra distortion
- if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) &&
- (this_rd < best_intra_rd)) {
+ // Keep record of best intra rd
+ if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME &&
+ xd->mode_info_context->mbmi.mode <= TM_PRED &&
+ this_rd < best_intra_rd) {
best_intra_rd = this_rd;
- *returnintra = distortion2;
+ best_intra_mode = xd->mode_info_context->mbmi.mode;
+ }
+ // Keep record of best inter rd with single reference
+ if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME &&
+ xd->mode_info_context->mbmi.ref_frame[1] == NONE &&
+ !mode_excluded &&
+ this_rd < best_inter_rd) {
+ best_inter_rd = this_rd;
+ best_inter_ref_frame = ref_frame;
+ // best_inter_mode = xd->mode_info_context->mbmi.mode;
}
-#endif
- if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME)
+ if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
- if (this_rd < best_overall_rd) {
- best_overall_rd = this_rd;
- best_filter = tmp_best_filter;
- best_mode = this_mode;
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
// Note index of best mode so far
+ const int qstep = xd->plane[0].dequant[1];
+
best_mode_index = mode_index;
if (ref_frame == INTRA_FRAME) {
*returndistortion = distortion2;
best_rd = this_rd;
best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
best_partition = *x->partition_info;
if (this_mode == I4X4_PRED || this_mode == SPLITMV)
for (i = 0; i < 4; i++)
best_bmodes[i] = xd->mode_info_context->bmi[i];
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE)
+ if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep)
+ early_term = 1;
}
#if 0
// Testing this mode gave rise to an improvement in best error score.
best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
}
+ /* keep record of best filter type */
+ if (!mode_excluded && !disable_skip && mbmi->ref_frame[0] != INTRA_FRAME &&
+ cm->mcomp_filter_type != BILINEAR) {
+ int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
+ VP9_SWITCHABLE_FILTERS :
+ vp9_switchable_interp_map[cm->mcomp_filter_type]];
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ int64_t adj_rd;
+ // In cases of poor prediction, filter_cache[] can contain really big
+ // values, which actually are bigger than this_rd itself. This can
+ // cause negative best_filter_rd[] values, which is obviously silly.
+ // Therefore, if filter_cache < ref, we do an adjusted calculation.
+ if (cpi->rd_filter_cache[i] >= ref)
+ adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
+ else // FIXME(rbultje) do this for comppred also
+ adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
+ best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+ }
+ }
+
/* keep record of best txfm size */
if (bsize < BLOCK_SIZE_SB32X32) {
if (bsize < BLOCK_SIZE_MB16X16) {
}
}
+ if (early_term)
+ break;
+
if (x->skip && !mode_excluded)
break;
}
+
+ // If indicated then mark the index of the chosen mode to be inspected at
+ // other block sizes.
+ if (bsize <= cpi->sf.unused_mode_skip_lvl) {
+ cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask &
+ (~((int64_t)1 << best_mode_index));
+ }
+
+ // If we are using reference masking and the set mask flag is set then
+ // create the reference frame mask.
+ if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
+ cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
+
// Flag all modes that have a distortion thats > 2x the best we found at
// this level.
for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
(cm->mcomp_filter_type == best_mbmode.interp_filter) ||
(best_mbmode.ref_frame[0] == INTRA_FRAME));
- // Accumulate filter usage stats
- // TODO(agrange): Use RD criteria to select interpolation filter mode.
- if (is_inter_mode(best_mode))
- ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
// Updating rd_thresh_freq_fact[] here means that the differnt
// partition/block sizes are handled independently based on the best
// choice for the current partition. It may well be better to keep a scaled
// best rd so far value and update rd_thresh_freq_fact based on the mode/size
// combination that wins out.
- if (cpi->sf.adpative_rd_thresh) {
+ if (cpi->sf.adaptive_rd_thresh) {
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
if (mode_index == best_mode_index) {
cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
} else {
cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+ cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
}
}
}
vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+ vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
goto end;
}
// macroblock modes
*mbmi = best_mbmode;
+ x->skip |= best_skip2;
if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
for (i = 0; i < 4; i++)
}
if (!x->skip) {
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0);
+ } else {
+ vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
+ }
+
+ if (!x->skip) {
for (i = 0; i < NB_TXFM_MODES; i++) {
if (best_txfm_rd[i] == INT64_MAX)
best_txfm_diff[i] = 0;
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
- best_pred_diff, best_txfm_diff);
+ best_pred_diff, best_txfm_diff, best_filter_diff);
return best_rd;
}
#include <stdlib.h>
#include "vp9/common/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vp9_rtcd.h"
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
-}
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
-}
+/* Generates the two plain-C SAD entry points for one m x n block size:
+ *   vp9_sad<m>x<n>_c     - forwards src/ref straight to sad_mx_n_c().
+ *   vp9_sad<m>x<n>_avg_c - fills a local m*n buffer through comp_avg_pred()
+ *                          from second_pred and ref_ptr, then calls
+ *                          sad_mx_n_c() on that buffer using m as its stride.
+ * Both generated functions accept max_sad but never read it.
+ * NOTE(review): comp_avg_pred() presumably averages second_pred with the
+ * reference block - confirm against vp9/encoder/vp9_variance.h. */
+#define sad_mxn_func(m, n) \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int max_sad) { \
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred, \
+ unsigned int max_sad) { \
+ uint8_t comp_pred[m * n]; \
+ comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+ return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
+}
+
+/* One expansion per supported block size; each line defines both the plain
+ * and the _avg variant for that size. */
+sad_mxn_func(64, 64)
+sad_mxn_func(64, 32)
+sad_mxn_func(32, 64)
+sad_mxn_func(32, 32)
+sad_mxn_func(32, 16)
+sad_mxn_func(16, 32)
+sad_mxn_func(16, 16)
+sad_mxn_func(16, 8)
+sad_mxn_func(8, 16)
+sad_mxn_func(8, 8)
+sad_mxn_func(8, 4)
+sad_mxn_func(4, 8)
+sad_mxn_func(4, 4)
void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
int src_stride,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
-}
-
void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
-}
-
void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
-}
-
void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
-}
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
-}
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
void vp9_sad64x64x3_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
return cost;
}
-static void count_segs(VP9_COMP *cpi,
- MODE_INFO *mi,
+static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
- const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type,
- mi_row, mi_col);
- const int seg_predicted = (segment_id == pred_seg_id);
-
- // Get the segment id prediction context
+ const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
+ const int pred_flag = pred_segment_id == segment_id;
const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);
// Store the prediction status for this mb and update counts
// as appropriate
- vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
- temporal_predictor_count[pred_context][seg_predicted]++;
+ vp9_set_pred_flag(xd, bsize, PRED_SEG_ID, pred_flag);
+ temporal_predictor_count[pred_context][pred_flag]++;
- if (!seg_predicted)
+ if (!pred_flag)
// Update the "unpredicted" segment count
t_unpred_seg_counts[segment_id]++;
}
double ssim_total = 0;
// sample point start with each 4x4 location
- for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j < width - 8; j += 4) {
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
ssim_total += v;
samples++;
double ssimv;
a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height);
b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
ssimv = a * .8 + .1 * (b + c);
double a, b, c;
a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height);
b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
*ssim_y = a;
*ssim_u = b;
*ssim_v = c;
--- /dev/null
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/encoder/vp9_boolhuff.h"
+#include "vp9/encoder/vp9_treewriter.h"
+
+/* Both macros expand to an expression over an identifier literally named
+ * `upd`, so they may only be used where a variable of that name is in scope.
+ * vp9_cost_upd256 is vp9_cost_one(upd) - vp9_cost_zero(upd) as an int;
+ * vp9_cost_upd is the same difference shifted right by 8. */
+#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
+#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+/* Per-index bit counts written by vp9_compute_update_table() and read by
+ * prob_diff_update_cost(), indexed by the value remap_prob() returns. */
+static int update_bits[255];
+
+/* Returns the number of bits encode_uniform() writes for value v when there
+ * are n possible symbols: 0 when get_unsigned_bits(n) is 0, otherwise one bit
+ * fewer than that width for the low values and the full width for the rest. */
+static int count_uniform(int v, int n) {
+  const int nbits = get_unsigned_bits(n);
+  int short_codes;
+
+  if (nbits == 0)
+    return 0;
+
+  /* Values below this threshold are coded with nbits - 1 bits. */
+  short_codes = (1 << nbits) - n;
+  return (v < short_codes) ? nbits - 1 : nbits;
+}
+
+/* Reorders index i within [0, n): indices whose remainder modulo `modulus`
+ * equals modulus / 2 are packed first (as i / modulus); every other index is
+ * placed after them, keeping relative order. The table inside remap_prob()
+ * is documented as having been generated with this function. */
+static int split_index(int i, int n, int modulus) {
+  const int half = modulus / 2;
+  /* Number of indices in [0, n) that land in the leading group. */
+  const int leading = (n - 1 - half) / modulus + 1;
+
+  if (i % modulus == half)
+    return i / modulus;
+
+  return leading + i - (i + modulus - half) / modulus;
+}
+
+/* Folds v around the centre m into a non-negative code: values beyond 2 * m
+ * pass through unchanged, values in [m, 2 * m] map to the even number
+ * 2 * (v - m), and values below m map to the odd number 2 * (m - v) - 1. */
+static int recenter_nonneg(int v, int m) {
+  if (v > (m << 1))
+    return v;
+
+  if (v < m)
+    return ((m - v) << 1) - 1;
+
+  return (v - m) << 1;
+}
+
+/* Maps a (new probability v, old probability m) pair to the index that is
+ * coded in the bitstream and used to look up update_bits[].
+ *
+ * Both inputs are first shifted down by one. The difference is then recentred
+ * around m with recenter_nonneg(), measured from whichever end of the range
+ * the (m << 1) <= MAX_PROB test selects, and finally permuted through
+ * map_table.
+ *
+ * NOTE(review): when v == m, recenter_nonneg() yields 0 and the table index
+ * becomes -1. The callers visible in this file only reach here with
+ * newp != oldp; any new caller must preserve that. */
+static int remap_prob(int v, int m) {
+  int i;
+  /* Read-only permutation table, so it is declared const. */
+  static const int map_table[MAX_PROB - 1] = {
+    // generated by:
+    //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+    20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33,
+    34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74,
+    75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88,
+    89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102,
+    103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116,
+    117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+    131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+    145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
+    158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171,
+    172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199,
+    200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213,
+    214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+    228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+    18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+  };
+  v--;
+  m--;
+  if ((m << 1) <= MAX_PROB)
+    i = recenter_nonneg(v, m) - 1;
+  else
+    i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+
+  i = map_table[i];
+  return i;
+}
+
+/* Returns the number of bits encode_term_subexp() emits for `word` with
+ * sub-exponential parameter k over an alphabet of num_syms symbols. The walk
+ * mirrors the encoder: one bit per bucket decision, then either the bucket's
+ * literal width or the count_uniform() cost of the terminating tail. */
+static int count_term_subexp(int word, int k, int num_syms) {
+  int bits = 0;
+  int level = 0;
+  int base = 0;
+
+  for (;;) {
+    const int width = level ? k + level - 1 : k;
+    const int span = 1 << width;
+
+    /* Remaining symbols fit in the tail: finish with the uniform code. */
+    if (num_syms <= base + 3 * span)
+      return bits + count_uniform(word - base, num_syms - base);
+
+    /* One bit selects "this bucket" versus "a later bucket". */
+    ++bits;
+    if (word < base + span)
+      return bits + width;
+
+    ++level;
+    base += span;
+  }
+}
+
+/* Cost of signalling the change oldp -> newp: the precomputed bit count for
+ * the remapped delta, multiplied by 256. */
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+  return update_bits[remap_prob(newp, oldp)] * 256;
+}
+
+/* Writes v, one of n possible symbols, to w. Nothing is written when
+ * get_unsigned_bits(n) is 0. Low values take one bit fewer than that width;
+ * the remaining values take the same shorter literal plus one extra bit. */
+static void encode_uniform(vp9_writer *w, int v, int n) {
+  const int nbits = get_unsigned_bits(n);
+  int short_codes;
+
+  if (nbits == 0)
+    return;
+
+  short_codes = (1 << nbits) - n;
+  if (v >= short_codes) {
+    const int excess = v - short_codes;
+    vp9_write_literal(w, short_codes + (excess >> 1), nbits - 1);
+    vp9_write_literal(w, excess & 1, 1);
+    return;
+  }
+
+  vp9_write_literal(w, v, nbits - 1);
+}
+
+/* Writes `word` to w with a terminated sub-exponential code of parameter k
+ * over num_syms symbols. Each step either emits a 1 bit and advances to the
+ * next, wider bucket, emits a 0 bit followed by the offset inside the current
+ * bucket, or - once the remaining symbols fit in the tail - hands the offset
+ * to encode_uniform(). */
+static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
+  int level = 0;
+  int base = 0;
+
+  for (;;) {
+    const int width = level ? k + level - 1 : k;
+    const int span = 1 << width;
+
+    if (num_syms <= base + 3 * span) {
+      encode_uniform(w, word - base, num_syms - base);
+      return;
+    }
+
+    if (word < base + span) {
+      /* Word lives in this bucket: flag bit 0, then its offset. */
+      vp9_write_literal(w, 0, 1);
+      vp9_write_literal(w, word - base, width);
+      return;
+    }
+
+    /* Word lives in a later bucket: flag bit 1 and move on. */
+    vp9_write_literal(w, 1, 1);
+    ++level;
+    base += span;
+  }
+}
+
+// Writes the remapped difference between newp and oldp to the bitstream
+// using the terminated sub-exponential code over 255 symbols.
+void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
+  encode_term_subexp(w, remap_prob(newp, oldp), SUBEXP_PARAM, 255);
+}
+
+// Fills update_bits[0..253] with the number of bits the sub-exponential code
+// needs for each remapped probability delta. prob_diff_update_cost() reads
+// this table, so it has to be populated before any savings search runs.
+// Declared with an explicit (void) parameter list so that the definition is
+// a real prototype in C rather than an old-style empty list.
+void vp9_compute_update_table(void) {
+  int i;
+  for (i = 0; i < 254; i++)
+    update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
+}
+
+// Searches for the probability that saves the most bits versus keeping oldp.
+// Candidates are visited one step at a time from *bestp towards oldp
+// (oldp itself is not evaluated). For each candidate the saving is
+//   cost(ct, oldp) - cost(ct, candidate) - cost of signalling the update.
+// On return *bestp holds the winning probability (oldp if nothing beats a
+// saving of 0) and the return value is that best saving.
+// `upd` is not used by this function.
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vp9_prob oldp, vp9_prob *bestp,
+                                        vp9_prob upd) {
+  const int baseline_cost = cost_branch256(ct, oldp);
+  const int direction = *bestp > oldp ? -1 : 1;
+  vp9_prob candidate = *bestp;
+  vp9_prob winner = oldp;
+  int max_savings = 0;
+
+  while (candidate != oldp) {
+    const int candidate_cost = cost_branch256(ct, candidate);
+    const int signal_cost =
+        prob_diff_update_cost(candidate, oldp) + vp9_cost_upd256;
+    const int gain = baseline_cost - candidate_cost - signal_cost;
+    if (gain > max_savings) {
+      max_savings = gain;
+      winner = candidate;
+    }
+    candidate += direction;
+  }
+  *bestp = winner;
+  return max_savings;
+}
+
+// Model-based variant of the savings search: only the PIVOT_NODE probability
+// is varied, and each candidate is expanded to a full probability list with
+// vp9_model_to_full_probs() before its cost is measured.
+// The cost of a list is the sum of cost_branch256() over nodes
+// UNCONSTRAINED_NODES..ENTROPY_NODES-1 plus the PIVOT_NODE itself, using the
+// count pair at ct + 2 * node.
+// Candidates run from *bestp towards oldp[PIVOT_NODE] (exclusive). On return
+// *bestp holds the best pivot probability (oldp[PIVOT_NODE] if no candidate
+// saves more than 0) and the return value is that saving.
+// `upd`, `b` and `r` are not referenced in the body.
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const vp9_prob *oldp,
+ vp9_prob *bestp,
+ vp9_prob upd,
+ int b, int r) {
+ int i, old_b, new_b, update_b, savings, bestsavings, step;
+ int newp;
+ vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+ // Baseline: cost of the current (old) model expanded to full probabilities.
+ vp9_model_to_full_probs(oldp, oldplist);
+ // Seed the candidate list with the old unconstrained nodes; only the pivot
+ // entry is overwritten inside the loop.
+ vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+ old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+ old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+
+ bestsavings = 0;
+ bestnewp = oldp[PIVOT_NODE];
+
+ // Walk from the proposed probability back towards the old one.
+ step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
+
+ for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) {
+ // Skip values outside the 1..255 range.
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ // In-place expansion: source and destination are the same array.
+ // NOTE(review): presumably the helper leaves the first UNCONSTRAINED_NODES
+ // entries intact when called this way -- confirm against its definition.
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+// Conditionally updates *oldp from the branch counts ct[0]/ct[1].
+// A candidate probability is derived from the counts and refined by the
+// savings search. If the search reports a positive saving, a 1 flag and the
+// coded delta are written and *oldp is replaced; otherwise only a 0 flag is
+// written. The flag is coded with probability `upd`.
+void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+                               vp9_prob upd, unsigned int *ct) {
+  vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+  const int gain =
+      vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+  assert(newp >= 1);
+  if (gain <= 0) {
+    // Not worth it: signal "no update" and keep the old probability.
+    vp9_write(w, 0, upd);
+    return;
+  }
+  vp9_write(w, 1, upd);
+  vp9_write_prob_diff_update(w, newp, *oldp);
+  *oldp = newp;
+}
--- /dev/null
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// NOTE(review): the guard says DECODER although every function here takes a
+// vp9_writer -- confirm which directory this header lives in before renaming.
+#ifndef VP9_DECODER_VP9_SUBEXP_H_
+#define VP9_DECODER_VP9_SUBEXP_H_
+
+// Precomputes the bit-cost table used when pricing probability updates.
+// Should be called before either savings search below is used.
+// Declared with (void) so that this is a proper prototype in C.
+void vp9_compute_update_table(void);
+
+// Writes the coded difference between newp and oldp to the bitstream.
+void vp9_write_prob_diff_update(vp9_writer *w,
+                                vp9_prob newp, vp9_prob oldp);
+
+// Derives a new probability from the counts ct[0]/ct[1]; when updating saves
+// bits, writes a 1 flag plus the delta and stores the new value in *oldp,
+// otherwise writes a 0 flag. The flag is coded with probability upd.
+void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+                               vp9_prob upd, unsigned int *ct);
+
+// Searches between *bestp and oldp for the probability with the largest
+// saving. Returns that saving and stores the chosen probability in *bestp
+// (oldp when no candidate saves more than 0).
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vp9_prob oldp, vp9_prob *bestp,
+                                        vp9_prob upd);
+
+// Same search for model-based coefficient probabilities: only the pivot node
+// of oldp is varied. Returns the best saving and stores the chosen pivot
+// probability in *bestp.
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                              const vp9_prob *oldp,
+                                              vp9_prob *bestp,
+                                              vp9_prob upd,
+                                              int b, int r);
+
+#endif  // VP9_DECODER_VP9_SUBEXP_H_
&xd->scale_factor[which_mv],
16, 16,
which_mv,
- &xd->subpix);
+ &xd->subpix, MV_PRECISION_Q3);
stride = (stride + 1) >> 1;
- vp9_build_inter_predictor_q4(u_mb_ptr, stride,
- &pred[256], 8,
- &mv,
- &xd->scale_factor_uv[which_mv],
- 8, 8,
- which_mv,
- &xd->subpix);
-
- vp9_build_inter_predictor_q4(v_mb_ptr, stride,
- &pred[320], 8,
- &mv,
- &xd->scale_factor_uv[which_mv],
- 8, 8,
- which_mv,
- &xd->subpix);
+ vp9_build_inter_predictor(u_mb_ptr, stride,
+ &pred[256], 8,
+ &mv,
+ &xd->scale_factor_uv[which_mv],
+ 8, 8,
+ which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
+
+ vp9_build_inter_predictor(v_mb_ptr, stride,
+ &pred[320], 8,
+ &mv,
+ &xd->scale_factor_uv[which_mv],
+ 8, 8,
+ which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
}
void vp9_temporal_filter_apply_c(uint8_t *frame1,
// Further step/diamond searches as necessary
if (cpi->speed < 8)
- step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+ step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0);
else
- step_param = cpi->sf.first_step + 2;
+ step_param = cpi->sf.reduce_first_step_size + 2;
+ step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
/*cpi->sf.search_method == HEX*/
// TODO Check that the 16x16 vf & sdf are selected here
vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
}
-extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
-
struct tokenize_b_args {
VP9_COMP *cpi;
MACROBLOCKD *xd;
const int loff = (off >> mod) << tx_size;
ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
- int seg_eob, default_eob, pad;
+ int seg_eob;
const int segment_id = mbmi->segment_id;
- const int *scan, *nb;
+ const int16_t *scan, *nb;
vp9_coeff_count *counts;
vp9_coeff_probs_model *coef_probs;
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
ENTROPY_CONTEXT above_ec, left_ec;
uint8_t token_cache[1024];
- TX_TYPE tx_type = DCT_DCT;
- const uint8_t * band_translate;
+ const uint8_t *band_translate;
assert((!type && !plane) || (type && plane));
counts = cpi->coef_counts[tx_size];
switch (tx_size) {
default:
case TX_4X4: {
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_4x4(xd, block) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_4x4(xd, block) : DCT_DCT;
above_ec = A[0] != 0;
left_ec = L[0] != 0;
seg_eob = 16;
break;
}
case TX_8X8: {
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_8x8(xd) : DCT_DCT;
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
seg_eob = 64;
break;
}
case TX_16X16: {
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+ const TX_TYPE tx_type = type == PLANE_TYPE_Y_WITH_DC ?
+ get_tx_type_16x16(xd) : DCT_DCT;
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
seg_eob = 256;
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
+ nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
int v = 0;
rc = scan[c];
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
if (c < eob) {
v = qcoeff_ptr[rc];
assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
int result = 1;
struct is_skippable_args args = {xd, &result};
- foreach_transformed_block_in_plane(xd, bsize, 0,
- is_skippable, &args);
+ foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args);
return result;
}
int ref_stride,
unsigned int max_sad);
+typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred,
+ unsigned int max_sad);
+
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_subp_avg_variance_fn_t svaf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
+ vp9_sad_fn_t sdf;
+ vp9_sad_avg_fn_t sdaf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_subp_avg_variance_fn_t svaf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, uint8_t *ref, int ref_stride) {
+ int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
// The 2D transform is done with two passes which are actually pretty
vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
+// Loads a 4x4 block of int16 samples (rows `stride` elements apart) into the
+// low 64 bits of in[0..3] and scales every sample by 16 (<< 4).
+// Afterwards the first sample of row 0 is incremented by one unless it is
+// zero; all other lanes are left untouched.
+static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i match;
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    const __m128i raw =
+        _mm_loadl_epi64((const __m128i *)(input + row * stride));
+    in[row] = _mm_slli_epi16(raw, 4);
+  }
+
+  // `match` is -1 in lanes equal to bias_a. Lane 0 matches only when it is
+  // zero, which cancels the +1 from bias_b; shifted lanes 1..7 are multiples
+  // of 16 and therefore never equal 1.
+  match = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+  in[0] = _mm_add_epi16(_mm_add_epi16(in[0], match), k__nonzero_bias_b);
+}
+
+// Packs the low halves of res[0..3] into two registers, applies the final
+// (x + 1) >> 2 rounding and stores the 16 results contiguously with aligned
+// stores; `output` must therefore be 16-byte aligned.
+static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  const __m128i rows01 = _mm_unpacklo_epi64(res[0], res[1]);
+  const __m128i rows23 = _mm_unpacklo_epi64(res[2], res[3]);
+  _mm_store_si128((__m128i *)(output + 0 * 8),
+                  _mm_srai_epi16(_mm_add_epi16(rows01, kOne), 2));
+  _mm_store_si128((__m128i *)(output + 1 * 8),
+                  _mm_srai_epi16(_mm_add_epi16(rows23, kOne), 2));
+}
+
+// Transposes a 4x4 block of 16-bit values.
+// Input layout:  res[0] = 00 01 02 03 20 21 22 23
+//                res[1] = 10 11 12 13 30 31 32 33
+// Output: res[i] carries transposed row i in its low four lanes; the upper
+// four lanes hold leftover data and must be ignored by the caller.
+static INLINE void transpose_4x4(__m128i *res) {
+  // lo = 00 10 01 11 02 12 03 13
+  // hi = 20 30 21 31 22 32 23 33
+  const __m128i lo = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i hi = _mm_unpackhi_epi16(res[0], res[1]);
+
+  // cols01 = 00 10 20 30 01 11 21 31
+  // cols23 = 02 12 22 32 03 13 23 33
+  const __m128i cols01 = _mm_unpacklo_epi32(lo, hi);
+  const __m128i cols23 = _mm_unpackhi_epi32(lo, hi);
+
+  res[0] = cols01;
+  res[1] = _mm_unpackhi_epi64(cols01, cols01);
+  res[2] = cols23;
+  res[3] = _mm_unpackhi_epi64(cols23, cols23);
+}
+
+// One 1-D pass of the 4-point forward DCT over the four columns held in the
+// low four 16-bit lanes of in[0..3], followed by a 4x4 transpose so that a
+// second call processes the other dimension. Works in place on `in`.
+// NOTE(review): has external linkage but looks like a file-local helper --
+// confirm no other translation unit calls it before making it static.
+void fdct4_1d_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+ // Butterfly: sums and differences of the mirrored input rows.
+ u[0] = _mm_add_epi16(in[0], in[3]);
+ u[1] = _mm_add_epi16(in[1], in[2]);
+ u[2] = _mm_sub_epi16(in[1], in[2]);
+ u[3] = _mm_sub_epi16(in[0], in[3]);
+
+ // Interleave pairs so each madd yields a 32-bit a*c0 + b*c1 per column.
+ // Only the low four lanes of each register are consumed here.
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpacklo_epi16(u[2], u[3]);
+ // The trailing numbers are the output row index each product belongs to.
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
+
+ // Round and scale the 32-bit products back down by DCT_CONST_BITS.
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ // Saturating pack to 16 bits: in[0] = rows 0|2, in[1] = rows 1|3, which is
+ // the layout transpose_4x4() expects.
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4(in);
+}
+
+// One 1-D pass of the 4-point forward ADST over the four columns held in the
+// low four 16-bit lanes of in[0..3], followed by a 4x4 transpose so that a
+// second call processes the other dimension. Works in place on `in`.
+// NOTE(review): has external linkage but looks like a file-local helper --
+// confirm no other translation unit calls it before making it static.
+void fadst4_1d_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Only u[0..6] and v[0..5] are used below.
+ __m128i u[8], v[8];
+ // in7 = in[0] + in[1] - in[3]
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+ in7 = _mm_sub_epi16(in7, in[3]);
+
+ // Interleave operand pairs (zero-padded where only one operand is needed)
+ // so each madd produces one 32-bit sum of products per column.
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+
+ // Combine the partial sums into the four outputs (u[0], u[1], u[2], u[6]).
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = v[2];
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ // u[5] = 4 * v[5] - v[5] = 3 * v[5]
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ // Round and scale back down by DCT_CONST_BITS.
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ // Saturating pack to 16 bits in the two-rows-per-register layout that
+ // transpose_4x4() consumes.
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);
+}
+
+// 4x4 forward hybrid transform (SSE2). Loads and pre-scales the block, runs
+// two 1-D passes selected by tx_type (each pass ends with a transpose), then
+// rounds and stores 16 coefficients to `output`.
+// tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST; any other
+// value trips the assert.
+void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+                           int stride, int tx_type) {
+  __m128i in[4];
+  load_buffer_4x4(input, in, stride);
+  if (tx_type == 0) {  // DCT_DCT
+    fdct4_1d_sse2(in);
+    fdct4_1d_sse2(in);
+  } else if (tx_type == 1) {  // ADST_DCT
+    fadst4_1d_sse2(in);
+    fdct4_1d_sse2(in);
+  } else if (tx_type == 2) {  // DCT_ADST
+    fdct4_1d_sse2(in);
+    fadst4_1d_sse2(in);
+  } else if (tx_type == 3) {  // ADST_ADST
+    fadst4_1d_sse2(in);
+    fadst4_1d_sse2(in);
+  } else {
+    assert(0);
+  }
+  write_buffer_4x4(output, in);
+}
+
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const int stride = pitch >> 1;
int pass;
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
// Load input
- __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
// Pre-condition input (shift by two)
in0 = _mm_slli_epi16(in0, 2);
in1 = _mm_slli_epi16(in1, 2);
in6 = _mm_srai_epi16(in6, 1);
in7 = _mm_srai_epi16(in7, 1);
// store results
- _mm_storeu_si128((__m128i *)(output + 0 * 8), in0);
- _mm_storeu_si128((__m128i *)(output + 1 * 8), in1);
- _mm_storeu_si128((__m128i *)(output + 2 * 8), in2);
- _mm_storeu_si128((__m128i *)(output + 3 * 8), in3);
- _mm_storeu_si128((__m128i *)(output + 4 * 8), in4);
- _mm_storeu_si128((__m128i *)(output + 5 * 8), in5);
- _mm_storeu_si128((__m128i *)(output + 6 * 8), in6);
- _mm_storeu_si128((__m128i *)(output + 7 * 8), in7);
+ _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+ _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+ _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+ _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+ _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+ _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+ _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+ _mm_store_si128((__m128i *)(output + 7 * 8), in7);
}
}
+// Loads eight rows of eight int16 samples (rows `stride` elements apart)
+// into in[0..7] and scales every sample by 4 (<< 2). Uses aligned loads, so
+// every row address must be 16-byte aligned.
+static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
+  int row;
+  for (row = 0; row < 8; ++row) {
+    const __m128i raw = _mm_load_si128((__m128i *)(input + row * stride));
+    in[row] = _mm_slli_epi16(raw, 2);
+  }
+}
+
+// Arithmetic right shift of all 64 values by `bit`, with 1 added to negative
+// values first (x - (x >> 15) adds one exactly when x < 0). Operates in
+// place on res[0..7].
+static INLINE void right_shift_8x8(__m128i *res, int const bit) {
+  int row;
+  for (row = 0; row < 8; ++row) {
+    const __m128i sign = _mm_srai_epi16(res[row], 15);  // -1 where negative
+    res[row] = _mm_srai_epi16(_mm_sub_epi16(res[row], sign), bit);
+  }
+}
+
+// Applies the final one-bit right shift to res[0..7] (modifying `res`) and
+// stores the eight rows to `output`, `stride` elements apart. Uses aligned
+// stores, so every row address must be 16-byte aligned.
+static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
+  int row;
+  right_shift_8x8(res, 1);
+  for (row = 0; row < 8; ++row)
+    _mm_store_si128((__m128i *)(output + row * stride), res[row]);
+}
+
+// Transposes an 8x8 block of 16-bit values from in[0..7] into res[0..7].
+// Every read of `in` happens before the first write to `res`, so the two may
+// be the same array (in-place transpose).
+// The layout comments below list the register contents after each stage,
+// using row/column digits of the original matrix.
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ // Stage 1: interleave 16-bit elements of adjacent row pairs.
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ // Stage 2: interleave 32-bit pairs to gather four rows per column.
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ // Stage 3: join 64-bit halves into complete transposed rows.
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+
+// One 1-D pass of the 8-point forward DCT over the eight columns held in
+// in[0..7] (one row per register), followed by an 8x8 transpose so that a
+// second call processes the other dimension. Works in place on `in`.
+// Even-indexed outputs come from the sums s0..s3 (stage 1); odd-indexed
+// outputs come from the differences s4..s7 (stages 2-4).
+// NOTE(review): has external linkage but looks like a file-local helper --
+// confirm no other translation unit calls it before making it static.
+void fdct8_1d_sse2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 1
+ // Sums (s0..s3) and differences (s4..s7) of mirrored input rows.
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ // Even half: second butterfly on the sums.
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // Saturating pack to 16 bits; these are the even-indexed outputs.
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ // Odd half: rotate the middle differences s6/s5 by cospi_16_64.
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ // Combine the rotated terms with the outer differences s4 and s7.
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ // Final rotations producing the odd-indexed outputs.
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ // Source and destination are the same array here.
+ array_transpose_8x8(in, in);
+}
+
+void fadst8_1d_sse2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ array_transpose_8x8(in, in);
+}
+
+// Forward 8x8 hybrid transform (SSE2). Loads the block from input, applies
+// the pair of 1-D transforms selected by tx_type, and stores the result in
+// output.
+//   input   - source block; read through load_buffer_8x8() using stride.
+//   output  - destination written by write_buffer_8x8().
+//   stride  - forwarded unchanged to load_buffer_8x8() (presumably the input
+//             row pitch in int16_t units - TODO confirm against the helper).
+//   tx_type - 0: DCT_DCT, 1: ADST_DCT, 2: DCT_ADST, 3: ADST_ADST.
+void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[8];
+ load_buffer_8x8(input, in, stride);
+ // Each case runs two 1-D transforms back to back on the same eight
+ // registers. NOTE(review): the 1-D helpers presumably transpose their
+ // result so that the second call works along the other dimension (the
+ // routine immediately above ends with array_transpose_8x8()) - confirm.
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct8_1d_sse2(in);
+ fdct8_1d_sse2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst8_1d_sse2(in);
+ fdct8_1d_sse2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct8_1d_sse2(in);
+ fadst8_1d_sse2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst8_1d_sse2(in);
+ fadst8_1d_sse2(in);
+ break;
+ default:
+ // Unsupported tx_type is treated as a programming error. When asserts
+ // are compiled out, no 1-D transform runs and whatever
+ // load_buffer_8x8() produced is written to output as-is.
+ assert(0);
+ break;
+ }
+ // NOTE(review): the literal 8 is presumably the output stride, i.e. the
+ // result is stored as a packed 8x8 block - confirm in write_buffer_8x8().
+ write_buffer_8x8(output, in, 8);
+}
+
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[256];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
int16_t *in = input;
int16_t *out = intermediate;
// Constants
__m128i res08, res09, res10, res11, res12, res13, res14, res15;
// Load and pre-condition input.
if (0 == pass) {
- in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
// x = x << 2
in00 = _mm_slli_epi16(in00, 2);
in01 = _mm_slli_epi16(in01, 2);
in14 = _mm_slli_epi16(in14, 2);
in15 = _mm_slli_epi16(in15, 2);
} else {
- in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
// x = (x + 1) >> 2
in00 = _mm_add_epi16(in00, kOne);
in01 = _mm_add_epi16(in01, kOne);
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
// Store results
- _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
}
out += 8*16;
}
out = output;
}
}
+
+void vp9_short_fdct32x32_rd_sse2(int16_t *input,
+ int16_t *output_org, int pitch) {
+ // Calculate pre-multiplied strides
+ const int str1 = pitch >> 1;
+ const int str2 = pitch;
+ const int str3 = pitch + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ int16_t *ina = in + 0 * str1;
+ int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[ 0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 4 * str1;
+ int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[ 4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 8 * str1;
+ int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[ 8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 12 * str1;
+ int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[ 0] = _mm_add_epi16(in00, in31);
+ step1[ 1] = _mm_add_epi16(in01, in30);
+ step1[ 2] = _mm_add_epi16(in02, in29);
+ step1[ 3] = _mm_add_epi16(in03, in28);
+ step1[28] = _mm_sub_epi16(in03, in28);
+ step1[29] = _mm_sub_epi16(in02, in29);
+ step1[30] = _mm_sub_epi16(in01, in30);
+ step1[31] = _mm_sub_epi16(in00, in31);
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[ 4] = _mm_add_epi16(in04, in27);
+ step1[ 5] = _mm_add_epi16(in05, in26);
+ step1[ 6] = _mm_add_epi16(in06, in25);
+ step1[ 7] = _mm_add_epi16(in07, in24);
+ step1[24] = _mm_sub_epi16(in07, in24);
+ step1[25] = _mm_sub_epi16(in06, in25);
+ step1[26] = _mm_sub_epi16(in05, in26);
+ step1[27] = _mm_sub_epi16(in04, in27);
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[ 8] = _mm_add_epi16(in08, in23);
+ step1[ 9] = _mm_add_epi16(in09, in22);
+ step1[10] = _mm_add_epi16(in10, in21);
+ step1[11] = _mm_add_epi16(in11, in20);
+ step1[20] = _mm_sub_epi16(in11, in20);
+ step1[21] = _mm_sub_epi16(in10, in21);
+ step1[22] = _mm_sub_epi16(in09, in22);
+ step1[23] = _mm_sub_epi16(in08, in23);
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = _mm_add_epi16(in12, in19);
+ step1[13] = _mm_add_epi16(in13, in18);
+ step1[14] = _mm_add_epi16(in14, in17);
+ step1[15] = _mm_add_epi16(in15, in16);
+ step1[16] = _mm_sub_epi16(in15, in16);
+ step1[17] = _mm_sub_epi16(in14, in17);
+ step1[18] = _mm_sub_epi16(in13, in18);
+ step1[19] = _mm_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
+ step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
+ step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
+ step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
+ step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
+ step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
+ step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
+ step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
+ step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
+ step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
+ step2[10] = _mm_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+ }
+ // Stage 3
+ {
+ step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ }
+      // Damp the magnitude with a rounded arithmetic right shift by 2, so that
+      // the intermediate values stay within the range of 16 bits.
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
+ step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
+ step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
+ step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
+ step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
+ step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
+ step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
+ step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
+ step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
+ step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+ step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
+ step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
+ step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
+ step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
+ step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+ step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
+ step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
+ step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
+ step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
+ step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
+ step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
+ step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
+ step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
+ step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
+ step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
+ step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
+ step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
+ step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
+ step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
+ step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
+ step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
+ step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
+ step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
+ step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
+ step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
+ step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
+ step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
+ step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
+ step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
+ step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step3[10] = _mm_add_epi16(step3[10], kOne);
+ step3[11] = _mm_add_epi16(step3[11], kOne);
+ step3[12] = _mm_add_epi16(step3[12], kOne);
+ step3[13] = _mm_add_epi16(step3[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step3[16] = _mm_add_epi16(step3[16], kOne);
+ step3[17] = _mm_add_epi16(step3[17], kOne);
+ step3[18] = _mm_add_epi16(step3[18], kOne);
+ step3[19] = _mm_add_epi16(step3[19], kOne);
+ step3[20] = _mm_add_epi16(step3[20], kOne);
+ step3[21] = _mm_add_epi16(step3[21], kOne);
+ step3[22] = _mm_add_epi16(step3[22], kOne);
+ step3[23] = _mm_add_epi16(step3[23], kOne);
+ step3[24] = _mm_add_epi16(step3[24], kOne);
+ step3[25] = _mm_add_epi16(step3[25], kOne);
+ step3[26] = _mm_add_epi16(step3[26], kOne);
+ step3[27] = _mm_add_epi16(step3[27], kOne);
+ step3[28] = _mm_add_epi16(step3[28], kOne);
+ step3[29] = _mm_add_epi16(step3[29], kOne);
+ step3[30] = _mm_add_epi16(step3[30], kOne);
+ step3[31] = _mm_add_epi16(step3[31], kOne);
+ step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
+ step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
+ step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
+ step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
+ step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
+ step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
+ step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
+ step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
+ step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step3[10] = _mm_srai_epi16(step3[10], 2);
+ step3[11] = _mm_srai_epi16(step3[11], 2);
+ step3[12] = _mm_srai_epi16(step3[12], 2);
+ step3[13] = _mm_srai_epi16(step3[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step3[16] = _mm_srai_epi16(step3[16], 2);
+ step3[17] = _mm_srai_epi16(step3[17], 2);
+ step3[18] = _mm_srai_epi16(step3[18], 2);
+ step3[19] = _mm_srai_epi16(step3[19], 2);
+ step3[20] = _mm_srai_epi16(step3[20], 2);
+ step3[21] = _mm_srai_epi16(step3[21], 2);
+ step3[22] = _mm_srai_epi16(step3[22], 2);
+ step3[23] = _mm_srai_epi16(step3[23], 2);
+ step3[24] = _mm_srai_epi16(step3[24], 2);
+ step3[25] = _mm_srai_epi16(step3[25], 2);
+ step3[26] = _mm_srai_epi16(step3[26], 2);
+ step3[27] = _mm_srai_epi16(step3[27], 2);
+ step3[28] = _mm_srai_epi16(step3[28], 2);
+ step3[29] = _mm_srai_epi16(step3[29], 2);
+ step3[30] = _mm_srai_epi16(step3[30], 2);
+ step3[31] = _mm_srai_epi16(step3[31], 2);
+ }
+ // Stage 4
+ {
+ step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
+ step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
+ step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
+ step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
+ step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
+ step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
+ step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
+ step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
+ step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
+ step3[10] = _mm_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ }
+      // Final stage --- output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output;
+ if (0 == pass) {
+ output = &intermediate[column_start * 32];
+ } else {
+ output = &output_org[column_start * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
+ // Process next 8x8
+ output += 8;
+ }
+ }
+ }
+ }
+}
SECTION .text
-; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
INIT_XMM sse2
-cglobal block_error, 3, 3, 6, uqc, dqc, size
- pxor m4, m4 ; accumulator
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop:
- mova m0, [uqcq+sizeq*2]
- mova m2, [dqcq+sizeq*2]
- mova m1, [uqcq+sizeq*2+mmsize]
- mova m3, [dqcq+sizeq*2+mmsize]
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
; accumulate in 64bit
- punpckldq m2, m0, m5
+ punpckldq m7, m0, m5
punpckhdq m0, m5
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- paddq m4, m2
+ paddq m4, m7
+ punpckldq m7, m1, m5
paddq m4, m0
- paddq m4, m3
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
add sizeq, mmsize
jl .loop
; accumulate horizontally and store in return value
movhlps m5, m4
+ movhlps m7, m6
paddq m4, m5
+ paddq m6, m7
%if ARCH_X86_64
movq rax, m4
+ movq [sszq], m6
%else
+ mov eax, sszm
pshufd m5, m4, 0x1
+ movq [eax], m6
movd eax, m4
movd edx, m5
%endif
--- /dev/null
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; QUANTIZE_FN suffix, num_gprs
+;   %1: function-name suffix (instantiated below as b and b_32x32). The
+;       b_32x32 variant doubles |coeff| before the zbin/round steps, halves
+;       the dequantized magnitude, and zero-fills iterations in which no
+;       coefficient clears the zero bin.
+;   %2: number of general-purpose registers requested from cglobal.
+;
+; Emits quantize_%1, which handles 16 coefficients (two xmm registers) per
+; iteration, writes qcoeff/dqcoeff, and stores the end-of-block position as
+; a 16-bit value through the eob pointer. If skip is non-zero the outputs
+; are simply zeroed. The zbin/round/quant/shift/dequant tables are read as
+; one 16-byte vector each; the low half is used for the first 8 coefficients
+; (which include DC) and the high half is then broadcast for everything else.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movd m4, dword zbin_oqm ; m4 = zbin_oq
+ mova m0, [zbinq] ; m0 = zbin
+ punpcklwd m4, m4
+ mova m1, [roundq] ; m1 = round
+ pshufd m4, m4, 0
+ mova m2, [quantq] ; m2 = quant
+ paddw m0, m4 ; m0 = zbin + zbin_oq
+ mova m3, [r2q] ; m3 = dequant
+ psubw m0, [pw_1] ; zbin - 1, so the pcmpgtw below acts as >= zbin
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ ; point every array at its end and count ncoeff up from -n to 0
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0 ; broadcast the high (AC) half from here on
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+ mova [qcoeffq+ncoeffq*2+ 0], m8
+ mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+ mova [dqcoeffq+ncoeffq*2+ 0], m8
+ mova [dqcoeffq+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = dqc[i] == 0
+ pcmpeqw m13, m5 ; m13 = dqc[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = iscan[i]
+ psubw m6, m7 ; m6 = iscan[i] + 1 where the zbin mask is set
+ psubw m11, m12 ; m11 = iscan[i] + 1 where the zbin mask is set
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ ; nothing in these 16 coefficients clears the zero bin: just store zeroes
+ pmovmskb r6, m7
+ pmovmskb r2, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddw m6, m1 ; m6 += round
+ paddw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+ mova [qcoeffq+ncoeffq*2+ 0], m14
+ mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [dqcoeffq+ncoeffq*2+ 0], m14
+ mova [dqcoeffq+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = dqc[i] == 0
+ pcmpeqw m13, m5 ; m13 = dqc[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = iscan[i]
+ psubw m6, m7 ; m6 = iscan[i] + 1 where the zbin mask is set
+ psubw m11, m12 ; m11 = iscan[i] + 1 where the zbin mask is set
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*2+ 0], m5
+ mova [qcoeffq+ncoeffq*2+16], m5
+ mova [dqcoeffq+ncoeffq*2+ 0], m5
+ mova [dqcoeffq+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ ; pextrw with a memory destination is an SSE4.1 encoding, which must not
+ ; appear in an SSSE3 function. Extract into a GPR (the SSE2 form) and
+ ; store the low word instead; r3 held qcoeff, which is dead by now.
+ pextrw r3d, m8, 0
+ mov [r2], r3w
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [dqcoeffq+ncoeffq*2+ 0], m7
+ mova [dqcoeffq+ncoeffq*2+16], m7
+ mova [qcoeffq+ncoeffq*2+ 0], m7
+ mova [qcoeffq+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [eobq], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 6
+QUANTIZE_FN b_32x32, 7
SECTION .text
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD64XN 1
-cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+; SAD_FN width, height, num_gprs, avg
+; Shared prologue for the SAD<w>XN macros below.
+;   %1, %2: block width and height, used only to build the function name.
+;   %3:     5 or 7; the 7 form also declares src_stride3/ref_stride3 and
+;           fills them with 3*stride.
+;   %4:     0 emits sad%1x%2; anything else emits sad%1x%2_avg, which takes
+;           an extra second_pred argument.
+; After the cglobal declaration both strides are sign-extended to pointer
+; width.
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+; The seven named arguments above use up r0-r6, so the row counter is
+; defined by hand: an eighth register (r7) on x86-64, or r0m on x86-32
+; (presumably the stack slot of argument 0 per x86inc naming - confirm).
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
.loop:
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+32]
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD32XN 1
-cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
-
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq]
movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+src_strideq]
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
movu m2, [refq+ref_strideq]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+src_strideq]
psadbw m3, [srcq+src_strideq*2]
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
movhps m1, [refq+ref_strideq]
movh m2, [refq+ref_strideq*2]
movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movh m3, [srcq]
movhps m3, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD4XN 1
-cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
movd m4, [refq+ref_stride3q]
punpckldq m1, m2
punpckldq m3, m4
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m3, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movd m2, [srcq]
movd m5, [srcq+src_strideq]
movd m4, [srcq+src_strideq*2]
INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
+SAD4XN 8, 1 ; sad4x8_avg_sse
+SAD4XN 4, 1 ; sad4x4_avg_sse
VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
-VP9_COMMON_SRCS-yes += common/vp9_modecont.h
VP9_COMMON_SRCS-yes += common/vp9_mv.h
VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
VP9_COMMON_SRCS-yes += common/vp9_treecoder.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
-VP9_COMMON_SRCS-yes += common/vp9_mbpitch.c
-VP9_COMMON_SRCS-yes += common/vp9_modecont.c
VP9_COMMON_SRCS-yes += common/vp9_modecontext.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
-VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+
$(eval $(call asm_offsets_template,\
vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
img->fmt = VPX_IMG_FMT_I420;
}
img->w = yv12->y_stride;
- img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS);
+ img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3);
img->d_w = yv12->y_crop_width;
img->d_h = yv12->y_crop_height;
img->x_chroma_shift = yv12->uv_width < yv12->y_width;
VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
+VP9_CX_SRCS-yes += encoder/vp9_subexp.c
+VP9_CX_SRCS-yes += encoder/vp9_subexp.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
+ifeq ($(ARCH_X86_64),yes)
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
+endif
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
+VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)
$(eval $(call asm_offsets_template,\
vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
ybf->y_height = aligned_height;
ybf->y_stride = y_stride;
+ ybf->uv_crop_width = (width + ss_x) >> ss_x;
+ ybf->uv_crop_height = (height + ss_y) >> ss_y;
ybf->uv_width = uv_width;
ybf->uv_height = uv_height;
ybf->uv_stride = uv_stride;
int uv_width;
int uv_height;
+ int uv_crop_width;
+ int uv_crop_height;
int uv_stride;
/* int uvinternal_width; */
exec_name);
fprintf(stderr, "\nOptions:\n");
- arg_show_usage(stdout, main_args);
+ arg_show_usage(stderr, main_args);
fprintf(stderr, "\nEncoder Global Options:\n");
- arg_show_usage(stdout, global_args);
+ arg_show_usage(stderr, global_args);
fprintf(stderr, "\nRate Control Options:\n");
- arg_show_usage(stdout, rc_args);
+ arg_show_usage(stderr, rc_args);
fprintf(stderr, "\nTwopass Rate Control Options:\n");
- arg_show_usage(stdout, rc_twopass_args);
+ arg_show_usage(stderr, rc_twopass_args);
fprintf(stderr, "\nKeyframe Placement Options:\n");
- arg_show_usage(stdout, kf_args);
+ arg_show_usage(stderr, kf_args);
#if CONFIG_VP8_ENCODER
fprintf(stderr, "\nVP8 Specific Options:\n");
- arg_show_usage(stdout, vp8_args);
+ arg_show_usage(stderr, vp8_args);
#endif
#if CONFIG_VP9_ENCODER
fprintf(stderr, "\nVP9 Specific Options:\n");
- arg_show_usage(stdout, vp9_args);
+ arg_show_usage(stderr, vp9_args);
#endif
fprintf(stderr, "\nStream timebase (--timebase):\n"
" The desired precision of timestamps in the output, expressed\n"