From 1eeca88691ad0fd232f110f3a389ebb494c0a6dc Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Fri, 11 Mar 2011 13:43:10 -0800
Subject: [PATCH] VP8: optimize VP8Context struct ordering

Shaves at least 3KB off code size on x86 and should improve cache utilization.
This would probably be useful to do for other decoders/encoders as well.
---
 libavcodec/vp8.c |   2 +-
 libavcodec/vp8.h | 156 +++++++++++++++++++++++++++----------------------------
 2 files changed, 79 insertions(+), 79 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index de077c8..42f401d 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -474,7 +474,7 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
     enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
     int idx = CNT_ZERO;
     int cur_sign_bias = s->sign_bias[mb->ref_frame];
-    int *sign_bias = s->sign_bias;
+    int8_t *sign_bias = s->sign_bias;
     VP56mv near_mv[4];
     uint8_t cnt[4] = { 0 };
     VP56RangeCoder *c = &s->c;
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index b94d453..2db056f 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -85,83 +85,24 @@ typedef struct {
 
 typedef struct {
     AVCodecContext *avctx;
-    DSPContext dsp;
-    VP8DSPContext vp8dsp;
-    H264PredContext hpc;
-    vp8_mc_func put_pixels_tab[3][3][3];
-    AVFrame frames[4];
     AVFrame *framep[4];
     uint8_t *edge_emu_buffer;
-    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
-    int profile;
 
-    int mb_width;   /* number of horizontal MB */
-    int mb_height;  /* number of vertical MB */
+    uint16_t mb_width;   /* number of horizontal MB */
+    uint16_t mb_height;  /* number of vertical MB */
     int linesize;
     int uvlinesize;
 
-    int keyframe;
-    int invisible;
-    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
-    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
-    int update_altref;
-    int deblock_filter;
-
-    /**
-     * If this flag is not set, all the probability updates
-     * are discarded after this frame is decoded.
-     */
-    int update_probabilities;
-
-    /**
-     * All coefficients are contained in separate arith coding contexts.
-     * There can be 1, 2, 4, or 8 of these after the header context.
-     */
-    int num_coeff_partitions;
-    VP56RangeCoder coeff_partition[8];
-
-    VP8Macroblock *macroblocks;
-    VP8Macroblock *macroblocks_base;
-    VP8FilterStrength *filter_strength;
-
-    uint8_t *intra4x4_pred_mode_top;
-    uint8_t intra4x4_pred_mode_left[4];
-    uint8_t *segmentation_map;
-
-    /**
-     * Cache of the top row needed for intra prediction
-     * 16 for luma, 8 for each chroma plane
-     */
-    uint8_t (*top_border)[16+8+8];
-
-    /**
-     * For coeff decode, we need to know whether the above block had non-zero
-     * coefficients. This means for each macroblock, we need data for 4 luma
-     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
-     * per macroblock. We keep the last row in top_nnz.
-     */
-    uint8_t (*top_nnz)[9];
-    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
-
-    /**
-     * This is the index plus one of the last non-zero coeff
-     * for each of the blocks in the current macroblock.
-     * So, 0 -> no coeffs
-     *     1 -> dc-only (special transform)
-     *     2+-> full transform
-     */
-    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
-    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
-    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
-    uint8_t intra4x4_pred_mode_mb[16];
-
-    int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
-    int segment;             ///< segment of the current macroblock
+    uint8_t keyframe;
+    uint8_t deblock_filter;
+    uint8_t mbskip_enabled;
+    uint8_t segment;             ///< segment of the current macroblock
+    uint8_t chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
+    uint8_t profile;
     VP56mv mv_min;
     VP56mv mv_max;
 
-    int mbskip_enabled;
-    int sign_bias[4]; ///< one state [0, 1] per ref frame type
+    int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type
     int ref_count[3];
 
     /**
@@ -170,13 +111,26 @@ typedef struct {
      * a frame, since the values persist between interframes.
      */
     struct {
-        int enabled;
-        int absolute_vals;
-        int update_map;
+        uint8_t enabled;
+        uint8_t absolute_vals;
+        uint8_t update_map;
         int8_t base_quant[4];
         int8_t filter_level[4];     ///< base loop filter level
     } segmentation;
 
+    struct {
+        uint8_t simple;
+        uint8_t level;
+        uint8_t sharpness;
+    } filter;
+
+    VP8Macroblock *macroblocks;
+    VP8FilterStrength *filter_strength;
+
+    uint8_t *intra4x4_pred_mode_top;
+    uint8_t intra4x4_pred_mode_left[4];
+    uint8_t *segmentation_map;
+
     /**
      * Macroblocks can have one of 4 different quants in a frame when
      * segmentation is enabled.
@@ -190,13 +144,7 @@ typedef struct {
     } qmat[4];
 
     struct {
-        int simple;
-        int level;
-        int sharpness;
-    } filter;
-
-    struct {
-        int enabled;    ///< whether each mb can have a different strength based on mode/ref
+        uint8_t enabled;    ///< whether each mb can have a different strength based on mode/ref
 
         /**
          * filter strength adjustment for the following macroblock modes:
@@ -220,6 +168,34 @@ typedef struct {
     } lf_delta;
 
     /**
+     * Cache of the top row needed for intra prediction
+     * 16 for luma, 8 for each chroma plane
+     */
+    uint8_t (*top_border)[16+8+8];
+
+    /**
+     * For coeff decode, we need to know whether the above block had non-zero
+     * coefficients. This means for each macroblock, we need data for 4 luma
+     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
+     * per macroblock. We keep the last row in top_nnz.
+     */
+    uint8_t (*top_nnz)[9];
+    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
+
+    /**
+     * This is the index plus one of the last non-zero coeff
+     * for each of the blocks in the current macroblock.
+     * So, 0 -> no coeffs
+     *     1 -> dc-only (special transform)
+     *     2+-> full transform
+     */
+    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
+    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
+    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
+    uint8_t intra4x4_pred_mode_mb[16];
+
+    /**
      * These are all of the updatable probabilities for binary decisions.
      * They are only implictly reset on keyframes, making it quite likely
      * for an interframe to desync if a prior frame's header was corrupt
@@ -236,6 +212,30 @@ typedef struct {
         uint8_t token[4][16][3][NUM_DCT_TOKENS-1];
         uint8_t mvc[2][19];
     } prob[2];
+
+    VP8Macroblock *macroblocks_base;
+    int invisible;
+    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
+    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
+    int update_altref;
+
+    /**
+     * If this flag is not set, all the probability updates
+     * are discarded after this frame is decoded.
+     */
+    int update_probabilities;
+
+    /**
+     * All coefficients are contained in separate arith coding contexts.
+     * There can be 1, 2, 4, or 8 of these after the header context.
+     */
+    int num_coeff_partitions;
+    VP56RangeCoder coeff_partition[8];
+    DSPContext dsp;
+    VP8DSPContext vp8dsp;
+    H264PredContext hpc;
+    vp8_mc_func put_pixels_tab[3][3][3];
+    AVFrame frames[4];
 } VP8Context;
 
 #endif
-- 
2.7.4