Use GPU to construct MFX command buffer for H264 encoding on Haswell
author Zhao Yakui <yakui.zhao@intel.com>
Fri, 8 Nov 2013 07:36:36 +0000 (15:36 +0800)
committer Xiang, Haihao <haihao.xiang@intel.com>
Wed, 13 Nov 2013 07:34:07 +0000 (15:34 +0800)
This is to optimize the performance of H264 encoding. The GPU can
accelerate the construction of the MFX command buffer for H264 encoding.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
(cherry picked from commit 87bc38d4300212dea51b5635f184aa1ae37fa71c)

src/gen75_mfc.c
src/shaders/utils/Makefile.am
src/shaders/utils/mfc_batchbuffer_hsw.asm [new file with mode: 0644]
src/shaders/utils/mfc_batchbuffer_hsw.g75a [new file with mode: 0644]
src/shaders/utils/mfc_batchbuffer_hsw.g75b [new file with mode: 0644]
src/shaders/utils/mfc_batchbuffer_hsw.inc [new file with mode: 0644]

index c2b26d6..784a5e2 100644 (file)
 #include "gen6_vme.h"
 #include "intel_media.h"
 
-#define MFC_SOFTWARE_HASWELL   1
+#define        AVC_INTRA_RDO_OFFSET    4
+#define        AVC_INTER_RDO_OFFSET    10
+#define        AVC_INTER_MSG_OFFSET    8
+#define        AVC_INTER_MV_OFFSET     48
+#define        AVC_RDO_MASK            0xFFFF
+
+#define        MFC_SOFTWARE_HASWELL    0
 
 #define B0_STEP_REV            2
 #define IS_STEPPING_BPLUS(i965)        ((i965->intel.revision) >= B0_STEP_REV)
 
-static const uint32_t gen75_mfc_batchbuffer_avc_intra[][4] = {
-#include "shaders/utils/mfc_batchbuffer_avc_intra.g7b"
-};
-
-static const uint32_t gen75_mfc_batchbuffer_avc_inter[][4] = {
-#include "shaders/utils/mfc_batchbuffer_avc_inter.g7b"
+static const uint32_t gen75_mfc_batchbuffer_avc[][4] = {
+#include "shaders/utils/mfc_batchbuffer_hsw.g75b"
 };
 
 static struct i965_kernel gen75_mfc_kernels[] = {
     {
         "MFC AVC INTRA BATCHBUFFER ",
         MFC_BATCHBUFFER_AVC_INTRA,
-        gen75_mfc_batchbuffer_avc_intra,
-        sizeof(gen75_mfc_batchbuffer_avc_intra),
-        NULL
-    },
-
-    {
-        "MFC AVC INTER BATCHBUFFER ",
-        MFC_BATCHBUFFER_AVC_INTER,
-        gen75_mfc_batchbuffer_avc_inter,
-        sizeof(gen75_mfc_batchbuffer_avc_inter),
+        gen75_mfc_batchbuffer_avc,
+        sizeof(gen75_mfc_batchbuffer_avc),
         NULL
     },
 };
@@ -996,7 +990,7 @@ gen75_mfc_avc_slice_state(VADriverContextP ctx,
 }
 
 
-#ifdef MFC_SOFTWARE_HASWELL
+#if MFC_SOFTWARE_HASWELL
 
 static int
 gen75_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
@@ -1147,12 +1141,6 @@ gen75_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, i
     return len_in_dwords;
 }
 
-#define                AVC_INTRA_RDO_OFFSET    4
-#define                AVC_INTER_RDO_OFFSET    10
-#define                AVC_INTER_MSG_OFFSET    8       
-#define                AVC_INTER_MV_OFFSET             48
-#define                AVC_RDO_MASK            0xFFFF
-
 static void 
 gen75_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
                                         struct encode_state *encode_state,
@@ -1306,12 +1294,6 @@ gen75_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
                                      &vme_context->vme_output,
                                      BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
                                      SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
-    assert(mfc_context->aux_batchbuffer_surface.bo);
-    mfc_context->buffer_suface_setup(ctx,
-                                     &mfc_context->gpe_context,
-                                     &mfc_context->aux_batchbuffer_surface,
-                                     BINDING_TABLE_OFFSET(BIND_IDX_MFC_SLICE_HEADER),
-                                     SURFACE_STATE_OFFSET(BIND_IDX_MFC_SLICE_HEADER));
 }
 
 static void
@@ -1322,19 +1304,10 @@ gen75_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
-    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
-    mfc_context->mfc_batchbuffer_surface.num_blocks = width_in_mbs * height_in_mbs + encode_state->num_slice_params_ext * 8 + 1;
-    mfc_context->mfc_batchbuffer_surface.size_block = 16 * CMD_LEN_IN_OWORD; /* 3 OWORDs */
-    mfc_context->mfc_batchbuffer_surface.pitch = 16;
-    mfc_context->mfc_batchbuffer_surface.bo = dri_bo_alloc(i965->intel.bufmgr, 
-                                                           "MFC batchbuffer",
-                                                           mfc_context->mfc_batchbuffer_surface.num_blocks * mfc_context->mfc_batchbuffer_surface.size_block,
-                                                           0x1000);
+    assert(mfc_context->aux_batchbuffer_surface.bo);
     mfc_context->buffer_suface_setup(ctx,
                                      &mfc_context->gpe_context,
-                                     &mfc_context->mfc_batchbuffer_surface,
+                                     &mfc_context->aux_batchbuffer_surface,
                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
 }
@@ -1401,48 +1374,47 @@ gen75_mfc_batchbuffer_constant_setup(VADriverContextP ctx,
     (void)mfc_context;
 }
 
+#define AVC_PAK_LEN_IN_BYTE    48
+#define AVC_PAK_LEN_IN_OWORD   3
+
 static void
 gen75_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
-                                          int index,
+                                          uint32_t intra_flag,
                                           int head_offset,
-                                          int batchbuffer_offset,
-                                          int head_size,
-                                          int tail_size,
                                           int number_mb_cmds,
-                                          int first_object,
-                                          int last_object,
-                                          int last_slice,
+                                          int slice_end_x,
+                                          int slice_end_y,
                                           int mb_x,
                                           int mb_y,
                                           int width_in_mbs,
-                                          int qp)
+                                          int qp,
+                                         uint32_t fwd_ref,
+                                         uint32_t bwd_ref)
 {
-    BEGIN_BATCH(batch, 12);
+    uint32_t temp_value;
+    BEGIN_BATCH(batch, 14);
     
-    OUT_BATCH(batch, CMD_MEDIA_OBJECT | (12 - 2));
-    OUT_BATCH(batch, index);
+    OUT_BATCH(batch, CMD_MEDIA_OBJECT | (14 - 2));
+    OUT_BATCH(batch, 0);
     OUT_BATCH(batch, 0);
     OUT_BATCH(batch, 0);
     OUT_BATCH(batch, 0);
     OUT_BATCH(batch, 0);
    
     /*inline data */
-    OUT_BATCH(batch, head_offset);
-    OUT_BATCH(batch, batchbuffer_offset);
-    OUT_BATCH(batch, 
-              head_size << 16 |
-              tail_size);
-    OUT_BATCH(batch,
-              number_mb_cmds << 16 |
-              first_object << 2 |
-              last_object << 1 |
-              last_slice);
-    OUT_BATCH(batch,
-              mb_y << 8 |
-              mb_x);
+    OUT_BATCH(batch, head_offset / 16);
+    OUT_BATCH(batch, (intra_flag) | (qp << 16));
+    temp_value = (mb_x | (mb_y << 8) | (width_in_mbs << 16));
+    OUT_BATCH(batch, temp_value);
+
+    OUT_BATCH(batch, number_mb_cmds);
+
     OUT_BATCH(batch,
-              qp << 16 |
-              width_in_mbs);
+              ((slice_end_y << 8) | (slice_end_x)));
+    OUT_BATCH(batch, fwd_ref);
+    OUT_BATCH(batch, bwd_ref);
+
+    OUT_BATCH(batch, MI_NOOP);
 
     ADVANCE_BATCH(batch);
 }
@@ -1452,96 +1424,83 @@ gen75_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
                                         struct intel_encoder_context *encoder_context,
                                         VAEncSliceParameterBufferH264 *slice_param,
                                         int head_offset,
-                                        unsigned short head_size,
-                                        unsigned short tail_size,
-                                        int batchbuffer_offset,
                                         int qp,
                                         int last_slice)
 {
     struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
     int total_mbs = slice_param->num_macroblocks;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
     int number_mb_cmds = 128;
-    int starting_mb = 0;
-    int last_object = 0;
-    int first_object = 1;
+    int starting_offset = 0;
     int i;
     int mb_x, mb_y;
-    int index = (slice_param->slice_type == SLICE_TYPE_I) ? MFC_BATCHBUFFER_AVC_INTRA : MFC_BATCHBUFFER_AVC_INTER;
-
-    for (i = 0; i < total_mbs / number_mb_cmds; i++) {
-        last_object = (total_mbs - starting_mb) == number_mb_cmds;
-        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
-        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
-        assert(mb_x <= 255 && mb_y <= 255);
-
-        starting_mb += number_mb_cmds;
-
-        gen75_mfc_batchbuffer_emit_object_command(batch,
-                                                  index,
-                                                  head_offset,
-                                                  batchbuffer_offset,
-                                                  head_size,
-                                                  tail_size,
-                                                  number_mb_cmds,
-                                                  first_object,
-                                                  last_object,
-                                                  last_slice,
-                                                  mb_x,
-                                                  mb_y,
-                                                  width_in_mbs,
-                                                  qp);
-
-        if (first_object) {
-            head_offset += head_size;
-            batchbuffer_offset += head_size;
-        }
+    int last_mb, slice_end_x, slice_end_y;
+    int remaining_mb = total_mbs;
+    uint32_t fwd_ref , bwd_ref, mb_flag;
 
-        if (last_object) {
-            head_offset += tail_size;
-            batchbuffer_offset += tail_size;
-        }
+    last_mb = slice_param->macroblock_address + total_mbs - 1;
+    slice_end_x = last_mb % width_in_mbs;
+    slice_end_y = last_mb / width_in_mbs;
 
-        batchbuffer_offset += number_mb_cmds * CMD_LEN_IN_OWORD;
+    if (slice_type == SLICE_TYPE_I) {
+       fwd_ref = 0;
+       bwd_ref = 0;
+       mb_flag = 1;
+    } else {
+       fwd_ref = vme_context->ref_index_in_mb[0];
+       bwd_ref = vme_context->ref_index_in_mb[1];
+       mb_flag = 0;
+    }
 
-        first_object = 0;
+    if (width_in_mbs >= 100) {
+       number_mb_cmds = width_in_mbs / 5;
+    } else if (width_in_mbs >= 80) {
+       number_mb_cmds = width_in_mbs / 4;
+    } else if (width_in_mbs >= 60) {
+       number_mb_cmds = width_in_mbs / 3;
+    } else if (width_in_mbs >= 40) {
+       number_mb_cmds = width_in_mbs / 2;
+    } else {
+       number_mb_cmds = width_in_mbs;
     }
 
-    if (!last_object) {
-        last_object = 1;
-        number_mb_cmds = total_mbs % number_mb_cmds;
-        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
-        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
-        assert(mb_x <= 255 && mb_y <= 255);
-        starting_mb += number_mb_cmds;
+    do {
+       if (number_mb_cmds >= remaining_mb) {
+               number_mb_cmds = remaining_mb;
+       }
+       mb_x = (slice_param->macroblock_address + starting_offset) % width_in_mbs;
+       mb_y = (slice_param->macroblock_address + starting_offset) / width_in_mbs;
 
         gen75_mfc_batchbuffer_emit_object_command(batch,
-                                                  index,
+                                                 mb_flag,
                                                   head_offset,
-                                                  batchbuffer_offset,
-                                                  head_size,
-                                                  tail_size,
                                                   number_mb_cmds,
-                                                  first_object,
-                                                  last_object,
-                                                  last_slice,
+                                                 slice_end_x,
+                                                 slice_end_y,
                                                   mb_x,
                                                   mb_y,
                                                   width_in_mbs,
-                                                  qp);
-    }
+                                                  qp,
+                                                 fwd_ref,
+                                                 bwd_ref);
+
+       head_offset += (number_mb_cmds * AVC_PAK_LEN_IN_BYTE);
+       remaining_mb -= number_mb_cmds;
+       starting_offset += number_mb_cmds;
+    } while (remaining_mb > 0);
 }
                           
 /*
  * return size in Owords (16bytes)
  */         
-static int
+static void
 gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context,
-                                int slice_index,
-                                int batchbuffer_offset)
+                                int slice_index)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
@@ -1557,8 +1516,6 @@ gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
     int slice_header_length_in_bits = 0;
     unsigned int tail_data[] = { 0x0, 0x0 };
     long head_offset;
-    int old_used = intel_batchbuffer_used_size(slice_batch), used;
-    unsigned short head_size, tail_size;
     int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
 
     if (rate_control_mode == VA_RC_CBR) {
@@ -1572,7 +1529,6 @@ gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
     assert(qp >= 0 && qp < 52);
 
-    head_offset = old_used / 16;
     gen75_mfc_avc_slice_state(ctx,
                               pPicParameter,
                               pSliceParameter,
@@ -1601,11 +1557,20 @@ gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
     free(slice_header);
 
     intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
-    used = intel_batchbuffer_used_size(slice_batch);
-    head_size = (used - old_used) / 16;
-    old_used = used;
+    head_offset = intel_batchbuffer_used_size(slice_batch);
+
+    slice_batch->ptr += pSliceParameter->num_macroblocks * AVC_PAK_LEN_IN_BYTE;
+
+    gen75_mfc_avc_batchbuffer_slice_command(ctx,
+                                            encoder_context,
+                                            pSliceParameter,
+                                            head_offset,
+                                            qp,
+                                            last_slice);
+
 
-    /* tail */
+    /* Aligned for tail */
+    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
     if (last_slice) {    
         mfc_context->insert_object(ctx,
                                    encoder_context,
@@ -1630,22 +1595,7 @@ gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                                    slice_batch);
     }
 
-    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
-    used = intel_batchbuffer_used_size(slice_batch);
-    tail_size = (used - old_used) / 16;
-
-   
-    gen75_mfc_avc_batchbuffer_slice_command(ctx,
-                                            encoder_context,
-                                            pSliceParameter,
-                                            head_offset,
-                                            head_size,
-                                            tail_size,
-                                            batchbuffer_offset,
-                                            qp,
-                                            last_slice);
-
-    return head_size + tail_size + pSliceParameter->num_macroblocks * CMD_LEN_IN_OWORD;
+    return;
 }
 
 static void
@@ -1660,10 +1610,16 @@ gen75_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
     gen6_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
 
     for ( i = 0; i < encode_state->num_slice_params_ext; i++) {
-        size = gen75_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i, offset);
-        offset += size;
+        gen75_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i);
+    }
+    {
+       struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
+       intel_batchbuffer_align(slice_batch, 8);
+       BEGIN_BCS_BATCH(slice_batch, 2);
+       OUT_BCS_BATCH(slice_batch, 0);
+       OUT_BCS_BATCH(slice_batch, MI_BATCH_BUFFER_END);
+       ADVANCE_BCS_BATCH(slice_batch);
     }
-
     intel_batchbuffer_end_atomic(batch);
     intel_batchbuffer_flush(batch);
 }
@@ -1687,9 +1643,9 @@ gen75_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
 
     gen75_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);
-    dri_bo_reference(mfc_context->mfc_batchbuffer_surface.bo);
+    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
 
-    return mfc_context->mfc_batchbuffer_surface.bo;
+    return mfc_context->aux_batchbuffer_surface.bo;
 }
 
 #endif
@@ -1708,7 +1664,7 @@ gen75_mfc_avc_pipeline_programing(VADriverContextP ctx,
         return; 
     }
 
-#ifdef MFC_SOFTWARE_HASWELL
+#if MFC_SOFTWARE_HASWELL
     slice_batch_bo = gen75_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
 #else
     slice_batch_bo = gen75_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);
@@ -2612,7 +2568,7 @@ Bool gen75_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *
     i965_gpe_load_kernels(ctx,
                           &mfc_context->gpe_context,
                           gen75_mfc_kernels,
-                          NUM_MFC_KERNEL);
+                          1);
 
     mfc_context->pipe_mode_select = gen75_mfc_pipe_mode_select;
     mfc_context->set_surface_state = gen75_mfc_surface_state;
index b8f3121..dd19d62 100644 (file)
@@ -6,6 +6,9 @@ MFC_CORE_AVC    = \
        mfc_batchbuffer_avc_intra.asm           \
        mfc_batchbuffer_avc_inter.asm
 
+MFC_CORE_HSW   = \
+       mfc_batchbuffer_hsw.asm
+
 INTEL_G6B      = mfc_batchbuffer_avc_intra.g6b mfc_batchbuffer_avc_inter.g6b
 INTEL_G6A      = mfc_batchbuffer_avc_intra.g6a mfc_batchbuffer_avc_inter.g6a
 INTEL_GEN6_INC = mfc_batchbuffer.inc
@@ -16,15 +19,21 @@ INTEL_G7A   = mfc_batchbuffer_avc_intra.g7a mfc_batchbuffer_avc_inter.g7a
 INTEL_GEN7_INC = mfc_batchbuffer.inc
 INTEL_GEN7_ASM = $(INTEL_G7A:%.g7a=%.gen7.asm)
 
+INTEL_G75B     = mfc_batchbuffer_hsw.g75b
+INTEL_G75A     = mfc_batchbuffer_hsw.g75a
+INTEL_GEN75_INC        = mfc_batchbuffer_hsw.inc
+INTEL_GEN75_ASM        = $(INTEL_G75A:%.g75a=%.gen75.asm)
+
 TARGETS  =
 if HAVE_GEN4ASM
 TARGETS += $(INTEL_G6B)
 TARGETS += $(INTEL_G7B)
+TARGETS += $(INTEL_G75B)
 endif
 
 all-local: $(TARGETS)
 
-SUFFIXES = .g6a .g6b .g7a .g7b .gen6.asm .gen7.asm
+SUFFIXES = .g6a .g6b .g7a .g7b .gen6.asm .gen7.asm .g75a .g75b .gen75.asm
 
 if HAVE_GEN4ASM
 $(INTEL_GEN6_ASM): $(MFC_CORE) $(MFC_CORE_AVC) $(INTEL_GEN6_INC)
@@ -42,19 +51,31 @@ $(INTEL_GEN7_ASM): $(MFC_CORE) $(MFC_CORE_AVC) $(INTEL_GEN7_INC)
        rm _mfc0.$@
 .gen7.asm.g7b:
        $(AM_V_GEN)$(GEN4ASM) -g 7 -o $@ $<
+
+$(INTEL_GEN75_ASM): $(MFC_CORE_HSW) $(INTEL_GEN75_INC)
+.g75a.gen75.asm:
+       $(AM_V_GEN)cpp -P $< > _mfc0.$@                 && \
+       m4 _mfc0.$@ > $@                                && \
+       rm _mfc0.$@
+.gen75.asm.g75b:
+       $(AM_V_GEN)$(GEN4ASM) -g 7.5 -o $@ $<
 endif
 
-CLEANFILES = $(INTEL_GEN6_ASM) $(INTEL_GEN7_ASM)
+CLEANFILES = $(INTEL_GEN6_ASM) $(INTEL_GEN7_ASM) $(INTEL_GEN75_ASM)
 
 EXTRA_DIST = \
        $(INTEL_G6A)            \
        $(INTEL_G6B)            \
        $(INTEL_G7A)            \
        $(INTEL_G7B)            \
+       $(INTEL_G75A)           \
+       $(INTEL_G75B)           \
        $(INTEL_GEN6_INC)       \
        $(INTEL_GEN7_INC)       \
+       $(INTEL_GEN75_INC)      \
        $(MFC_CORE)             \
        $(MFC_CORE_AVC)         \
+       $(MFC_CORE_HSW)         \
        $(NULL)
 
 # Extra clean files so that maintainer-clean removes *everything*
diff --git a/src/shaders/utils/mfc_batchbuffer_hsw.asm b/src/shaders/utils/mfc_batchbuffer_hsw.asm
new file mode 100644 (file)
index 0000000..c34e934
--- /dev/null
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2010-2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao@intel.com>
+ */
+        
+START:
+       mov     (16)    pak_object_reg0.0<1>:ud 0x0:ud          {align1}; 
+       mov     (8)     obw_m0.0<1>:ud          0x0:ud          {align1}; 
+       mov     (8)     mb_cur_msg.0<1>:ud      0x0:ud          {align1}; 
+       mov     (16)    mb_temp.0<1>:ud         0x0:ud          {align1}; 
+       mov     (1)     cur_mb_x<1>:uw          mb_x<0,1,0>:ub  {align1};
+       mov     (1)     cur_mb_y<1>:uw          mb_y<0,1,0>:ub  {align1};
+       mov     (1)     end_mb_x<1>:uw  slice_end_x<0,1,0>:ub   {align1};
+       mov     (1)     end_mb_y<1>:uw  slice_end_y<0,1,0>:ub   {align1};
+       mov     (1)     end_loop_count<1>:uw    total_mbs<0,1,0>:uw     {align1};
+       mov     (1)     vme_len<1>:ud           2:ud            {align1};
+       and.z.f0.0      (1) null:uw     mb_flag<0,1,0>:ub       INTRA_SLICE:uw {align1};
+       (f0.0)  mov     (1) vme_len<1>:ud       24:ud           {align1};
+
+       mov  (1) obw_m0.8<1>:UD         buffer_offset<0,1,0>:ud {align1};
+       mov  (1) obw_m0.20<1>:UB        thread_id_ub {align1};    /* dispatch id */
+
+       mul  (1) mb_cur_msg.8<1>:UD       width_in_mbs<0,1,0>:UW   cur_mb_y<0,1,0>:UW {align1};
+       add  (1) mb_cur_msg.8<1>:UD       mb_cur_msg.8<0,1,0>:UD   cur_mb_x<0,1,0>:uw {align1};
+       mul  (1) mb_cur_msg.8<1>:UD       mb_cur_msg.8<0,1,0>:UD vme_len<0,1,0>:UD {align1};
+       mov  (1) mb_cur_msg.20<1>:UB      thread_id_ub {align1};                  /* dispatch id */
+       mov  (1) pak_object0_ud<1>:ud      MFC_AVC_PAK_OBJECT_DW0:ud {align1};
+       mov  (1) pak_object5_ud<1>:ud      MFC_AVC_PAK_OBJECT_DW5:ud {align1};
+       mov  (1) pak_object10_ud<1>:ud     MFC_AVC_PAK_OBJECT_DW10:ud {align1};
+       mov  (1) pak_object6_ud<1>:ub      qp_flag<0,1,0>:ub {align1};
+
+pak_object_loop:
+       mov     (8)     mb_msg0.0<1>:ud  mb_cur_msg.0<8,8,1>:ud {align1};
+       mov     (1)     pak_object4_ud<1>:ud MFC_AVC_PAK_OBJECT_DW4:ud {align1};
+       mov     (1)     tmp_reg0.0<1>:ub  cur_mb_x<0,1,0>:ub    {align1};
+       mov     (1)     tmp_reg0.1<1>:ub  cur_mb_y<0,1,0>:ub    {align1};
+       mov     (1)     pak_object4_ud<1>:uw tmp_reg0.0<0,1,0>:uw {align1};
+       /* pak_object6_ud */
+       mov     (1)     pak_object_reg0.26<1>:uw        0x0:uw  {align1};
+
+       cmp.e.f0.0 (1)  null:uw cur_mb_x<0,1,0>:uw end_mb_x<0,1,0>:uw   {align1};
+       (-f0.0) jmpi    (1) start_mb_flag;      
+       cmp.e.f0.0 (1)  null:uw cur_mb_y<0,1,0>:uw end_mb_y<0,1,0>:uw   {align1};
+       (f0.0)  mov     (1)  pak_object_reg0.26<1>:uw MFC_AVC_PAK_LAST_MB:uw {align1};  
+start_mb_flag:
+       and.z.f0.0      (1) null:uw     mb_flag<0,1,0>:ub       INTRA_SLICE:uw {align1};
+       (f0.0)  jmpi    (1) inter_frame_start;
+       
+/* bind index 0, read 2 oword (32bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+        null
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_2,
+                MV_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 1
+        {align1};      
+       jmpi (1) intra_pak_command;
+
+nop;
+nop;
+inter_frame_start:
+/* bind index 0, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+        null
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_4,
+                MV_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 2
+        {align1};
+               
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1)         null:w  mb_intra_wb.16<0,1,0>:uw        mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0)   jmpi  (1)     intra_pak_command;
+
+/* MV len and MV mode */       
+       and     (1)   pak_object3_ud<1>:ud mb_inter_wb.0<0,1,0>:ud MFC_AVC_INTER_MASK_DW3:ud {align1};
+       add     (1)   pak_object3_ud<1>:ud pak_object3_ud<0,1,0>:ud MFC_AVC_PAK_CBP:ud {align1}; 
+       and     (1)   tmp_reg0.0<1>:uw   mb_inter_wb.0<0,1,0>:uw        INTER_MASK:uw   {align1};
+       mov     (1)   pak_object1_ud<1>:ud      32:ud   {align1};
+       cmp.e.f0.0 (1) null:uw  tmp_reg0.0<0,1,0>:uw    INTER_8X8MODE:uw        {align1};
+       (-f0.0) add (1) pak_object3_ud<1>:ud  pak_object3_ud<0,1,0>:ud  INTER_MV8:ud {align1};
+       (-f0.0) jmpi (1)        inter_mv_check;
+       and.nz.f0.0 (1) null:ud  mb_inter_wb.4<0,1,0>:uw        SUBSHAPE_MASK:uw {align1};
+       (f0.0)  mov  (1)        pak_object1_ud<1>:ud    128:ud  {align1};
+       (f0.0)  add (1) pak_object3_ud<1>:ud  pak_object3_ud<0,1,0>:ud  INTER_MV32:ud {align1};
+       (f0.0)  jmpi    (1) mv_check_end;
+
+       add (1) pak_object3_ud<1>:ud  pak_object3_ud<0,1,0>:ud  INTER_MV8:ud {align1};
+               
+inter_mv_check:
+       and     (1)   tmp_reg0.0<1>:uw   mb_inter_wb.0<0,1,0>:uw        INTER_MASK:uw   {align1};
+       cmp.e.f0.0 (1) null:uw  tmp_reg0.0<0,1,0>:uw    INTER_16X16MODE:uw      {align1};
+       (f0.0)  jmpi    (1) mv_check_end;
+       
+add   (1) mb_msg0.8<1>:UD      mb_msg0.8<0,1,0>:ud     3:ud {align1};
+/* Read MV for MB A */
+/* bind index 0, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_mv0.0<1>:ud
+        null
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_8,
+                MV_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 4
+        {align1};
+/* TODO: RefID is required after multi-references are added */
+
+       mov     (2)     mb_mv0.8<1>:ud  mb_mv1.0<2,2,1>:ud      {align1};
+       mov     (2)     mb_mv0.16<1>:ud mb_mv2.0<2,2,1>:ud      {align1};
+       mov     (2)     mb_mv0.24<1>:ud mb_mv3.0<2,2,1>:ud      {align1};
+
+        mov             (8)     msg_reg0.0<1>:ud                mb_msg0.0<8,8,1>:ud {align1} ;
+        mov             (8)     msg_reg1.0<1>:ud                mb_mv0.0<8,8,1>:ud {align1} ;
+/* Write MV for MB A */
+/* bind index 0, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_2,
+                MV_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+mv_check_end:
+
+/* ref list */
+       mov     (1)   pak_object8_ud<1>:ud fwd_ref<0,1,0>:ud    {align1};
+       mov     (1)   pak_object9_ud<1>:ud bwd_ref<0,1,0>:ud    {align1};
+/* inter_mode. pak_object7_ud */
+       mov     (1)   pak_object7_ud<1>:ud      0x0:ud  {align1};
+       mov     (1)   pak_object_reg0.28<1>:ub mb_inter_wb.5<0,1,0>:ub  {align1};
+       mov     (1)   pak_object_reg0.29<1>:ub mb_inter_wb.6<0,1,0>:ub  {align1};
+
+/* mv start address */
+       add     (1)   tmp_reg0.4<1>:ud  mb_cur_msg.8<0,1,0>:ud  3:ud {align1};
+       mul     (1)   pak_object2_ud<1>:ud tmp_reg0.4<0,1,0>:ud 16:ud {align1}; 
+
+        jmpi   (1)     write_pak_command;
+
+intra_pak_command:
+       /* object 1/2 is set to zero */
+       mov     (2)   pak_object1_ud<1>:ud      0x0:ud  {align1};
+       /* object 7/8 intra mode */
+       mov     (1)   pak_object7_ud<1>:ud      mb_intra_wb.4<0,1,0>:ud {align1};
+       mov     (1)   pak_object8_ud<1>:ud      mb_intra_wb.8<0,1,0>:ud {align1};
+       /* object 9 Intra structure */
+       mov     (1)   pak_object9_ud<1>:ud      0x0:ud                  {align1};
+       mov     (1)   pak_object9_ud<1>:ub      mb_intra_wb.12<0,1,0>:ub {align1};
+
+       and     (1)   pak_object3_ud<1>:ud mb_intra_wb.0<0,1,0>:ud MFC_AVC_INTRA_MASK_DW3:ud {align1};
+       add     (1)   pak_object3_ud<1>:ud pak_object3_ud<0,1,0>:ud MFC_AVC_INTRA_FLAG + MFC_AVC_PAK_CBP:ud {align1};
+
+       mov     (1)   tmp_reg0.0<1>:ud  0:ud    {align1};
+       mov     (1)   tmp_reg0.1<1>:ub  mb_intra_wb.2<0,1,0>:ub {align1};
+       and     (1)   tmp_reg0.0<1>:uw  tmp_reg0.0<0,1,0>:uw    AVC_INTRA_MASK:uw {align1};
+       add     (1)   pak_object3_ud<1>:ud pak_object3_ud<0,1,0>:ud tmp_reg0.0<0,1,0>:ud {align1};
+
+/* Write the pak command into the batchbuffer */
+write_pak_command:
+        mov             (8)     msg_reg0.0<1>:ud                obw_m0.0<8,8,1>:ud {align1} ;
+        mov             (8)     msg_reg1.0<1>:ud                pak_object_reg0.0<8,8,1>:ud {align1} ;
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_2,
+                MFC_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+       add     (1)     msg_reg0.8<1>:ud        msg_reg0.8<0,1,0>:ud    2:ud    {align1};
+       mov     (8)     msg_reg1.0<1>:ud        pak_object_reg1.0<8,8,1>:ud {align1};
+
+/* bind index 3, write 1 oword (16bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_0,
+                MFC_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+
+/* Check the next mb */
+add    (1)     cur_loop_count<1>:uw    cur_loop_count<0,1,0>:uw        1:uw    {align1};
+cmp.e.f0.0     (1)     null:uw cur_loop_count<0,1,0>:uw end_loop_count<0,1,0>:uw {align1};
+(f0.0) jmpi    (1)     pak_loop_end;
+/* the buffer offset for next block */
+add     (1)    obw_m0.8<1>:ud          obw_m0.8<0,1,0>:ud      3:uw    {align1};
+add    (1)     mb_cur_msg.8<1>:ud      mb_cur_msg.8<0,1,0>:ud  vme_len<0,1,0>:ud {align1};             
+add    (1)     cur_mb_x<1>:uw          cur_mb_x<0,1,0>:uw      1:uw    {align1};
+/* Check whether it is already equal to width in mbs */
+cmp.e.f0.0     (1)     null:uw         cur_mb_x<0,1,0>:uw      width_in_mbs<0,1,0>:uw  {align1};
+(f0.0) add (1) cur_mb_y<1>:uw          cur_mb_y<0,1,0>:uw      1:uw    {align1};
+(f0.0) mov     (1) cur_mb_x<1>:uw      0:uw            {align1};               
+
+/* continue the pak command for next mb */
+jmpi   (1)     pak_object_loop;
+nop;
+nop;
+pak_loop_end:
+/* Issue message fence so that the previous write message is committed */
+send (16)
+        msg_ind
+        mb_wb.0<1>:ud
+        null
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_FENCE,
+                OBR_MF_COMMIT,
+                MFC_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 1
+        {align1};
+
+__EXIT: 
+/*
+ * kill thread
+ */        
+mov  (8) ts_msg_reg0<1>:UD         r0<8,8,1>:UD {align1};
+send (1) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
+
+nop;
+        
diff --git a/src/shaders/utils/mfc_batchbuffer_hsw.g75a b/src/shaders/utils/mfc_batchbuffer_hsw.g75a
new file mode 100644 (file)
index 0000000..4a96754
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * Copyright © 2010-2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao@intel.com>
+ */
+
+#include "mfc_batchbuffer_hsw.inc"
+#include "mfc_batchbuffer_hsw.asm"
+
diff --git a/src/shaders/utils/mfc_batchbuffer_hsw.g75b b/src/shaders/utils/mfc_batchbuffer_hsw.g75b
new file mode 100644 (file)
index 0000000..2f42643
--- /dev/null
@@ -0,0 +1,105 @@
+   { 0x00800001, 0x23400061, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x21e00061, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x2b000061, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x2ac00061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x2ac00229, 0x000000a8, 0x00000000 },
+   { 0x00000001, 0x2ac20229, 0x000000a9, 0x00000000 },
+   { 0x00000001, 0x2ae00229, 0x000000b0, 0x00000000 },
+   { 0x00000001, 0x2ae20229, 0x000000b1, 0x00000000 },
+   { 0x00000001, 0x2ae40129, 0x000000ac, 0x00000000 },
+   { 0x00000001, 0x2ae80061, 0x00000000, 0x00000002 },
+   { 0x01000005, 0x20002e28, 0x000000a4, 0x00010001 },
+   { 0x00010001, 0x2ae80061, 0x00000000, 0x00000018 },
+   { 0x00000001, 0x21e80021, 0x000000a0, 0x00000000 },
+   { 0x00000001, 0x21f40231, 0x00000014, 0x00000000 },
+   { 0x00000041, 0x2b082521, 0x000000aa, 0x00000ac2 },
+   { 0x00000040, 0x2b082421, 0x00000b08, 0x00000ac0 },
+   { 0x00000041, 0x2b080421, 0x00000b08, 0x00000ae8 },
+   { 0x00000001, 0x2b140231, 0x00000014, 0x00000000 },
+   { 0x00000001, 0x23400061, 0x00000000, 0x7149000a },
+   { 0x00000001, 0x23540061, 0x00000000, 0x000f000f },
+   { 0x00000001, 0x23680061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x23580231, 0x000000a6, 0x00000000 },
+   { 0x00600001, 0x2b400021, 0x008d0b00, 0x00000000 },
+   { 0x00000001, 0x23500061, 0x00000000, 0xffff0000 },
+   { 0x00000001, 0x21000231, 0x00000ac0, 0x00000000 },
+   { 0x00000001, 0x21010231, 0x00000ac2, 0x00000000 },
+   { 0x00000001, 0x23500129, 0x00000100, 0x00000000 },
+   { 0x00000001, 0x235a0169, 0x00000000, 0x00000000 },
+   { 0x01000010, 0x20002528, 0x00000ac0, 0x00000ae0 },
+   { 0x00110020, 0x34001c00, 0x00001400, 0x00000020 },
+   { 0x01000010, 0x20002528, 0x00000ac2, 0x00000ae2 },
+   { 0x00010001, 0x235a0169, 0x00000000, 0x04000400 },
+   { 0x01000005, 0x20002e28, 0x000000a4, 0x00010001 },
+   { 0x00010020, 0x34001c00, 0x00001400, 0x00000040 },
+   { 0x0a800031, 0x2b601ca1, 0x00000b40, 0x02180200 },
+   { 0x00000020, 0x34001c00, 0x00001400, 0x00000240 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0a800031, 0x2b601ca1, 0x00000b40, 0x02280300 },
+   { 0x05000010, 0x2000252c, 0x00000b70, 0x00000b88 },
+   { 0x00010020, 0x34001c00, 0x00001400, 0x000001f0 },
+   { 0x00000005, 0x234c0c21, 0x00000b80, 0x1f00ffff },
+   { 0x00000040, 0x234c0c21, 0x0000034c, 0x000e0000 },
+   { 0x00000005, 0x21002d29, 0x00000b80, 0x00030003 },
+   { 0x00000001, 0x23440061, 0x00000000, 0x00000020 },
+   { 0x01000010, 0x20002d28, 0x00000100, 0x00030003 },
+   { 0x00110040, 0x234c0c21, 0x0000034c, 0x00400000 },
+   { 0x00110020, 0x34001c00, 0x00001400, 0x00000050 },
+   { 0x02000005, 0x20002d20, 0x00000b84, 0xff00ff00 },
+   { 0x00010001, 0x23440061, 0x00000000, 0x00000080 },
+   { 0x00010040, 0x234c0c21, 0x0000034c, 0x00600000 },
+   { 0x00010020, 0x34001c00, 0x00001400, 0x000000c0 },
+   { 0x00000040, 0x234c0c21, 0x0000034c, 0x00400000 },
+   { 0x00000005, 0x21002d29, 0x00000b80, 0x00030003 },
+   { 0x01000010, 0x20002d28, 0x00000100, 0x00000000 },
+   { 0x00010020, 0x34001c00, 0x00001400, 0x00000080 },
+   { 0x00000040, 0x2b480c21, 0x00000b48, 0x00000003 },
+   { 0x0a800031, 0x2ba01ca1, 0x00000b40, 0x02480400 },
+   { 0x00200001, 0x2ba80021, 0x00450bc0, 0x00000000 },
+   { 0x00200001, 0x2bb00021, 0x00450be0, 0x00000000 },
+   { 0x00200001, 0x2bb80021, 0x00450c00, 0x00000000 },
+   { 0x00600001, 0x28000021, 0x008d0b40, 0x00000000 },
+   { 0x00600001, 0x28200021, 0x008d0ba0, 0x00000000 },
+   { 0x0a800031, 0x20001cac, 0x00000800, 0x040a0200 },
+   { 0x00000001, 0x23600021, 0x000000b4, 0x00000000 },
+   { 0x00000001, 0x23640021, 0x000000b8, 0x00000000 },
+   { 0x00000001, 0x235c0061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x235c0231, 0x00000b85, 0x00000000 },
+   { 0x00000001, 0x235d0231, 0x00000b86, 0x00000000 },
+   { 0x00000040, 0x21040c21, 0x00000b08, 0x00000003 },
+   { 0x00000041, 0x23480c21, 0x00000104, 0x00000010 },
+   { 0x00000020, 0x34001c00, 0x00001400, 0x000000b0 },
+   { 0x00200001, 0x23440061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x235c0021, 0x00000b64, 0x00000000 },
+   { 0x00000001, 0x23600021, 0x00000b68, 0x00000000 },
+   { 0x00000001, 0x23640061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x23640231, 0x00000b6c, 0x00000000 },
+   { 0x00000005, 0x234c0c21, 0x00000b60, 0x0000c0ff },
+   { 0x00000040, 0x234c0c21, 0x0000034c, 0x000e2000 },
+   { 0x00000001, 0x21000061, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x21010231, 0x00000b62, 0x00000000 },
+   { 0x00000005, 0x21002d29, 0x00000100, 0x1f001f00 },
+   { 0x00000040, 0x234c0421, 0x0000034c, 0x00000100 },
+   { 0x00600001, 0x28000021, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x28200021, 0x008d0340, 0x00000000 },
+   { 0x0a800031, 0x20001cac, 0x00000800, 0x040a0202 },
+   { 0x00000040, 0x28080c21, 0x00000808, 0x00000002 },
+   { 0x00600001, 0x28200021, 0x008d0360, 0x00000000 },
+   { 0x0a800031, 0x20001cac, 0x00000800, 0x040a0002 },
+   { 0x00000040, 0x2ac42d29, 0x00000ac4, 0x00010001 },
+   { 0x01000010, 0x20002528, 0x00000ac4, 0x00000ae4 },
+   { 0x00010020, 0x34001c00, 0x00001400, 0x00000090 },
+   { 0x00000040, 0x21e82c21, 0x000001e8, 0x00030003 },
+   { 0x00000040, 0x2b080421, 0x00000b08, 0x00000ae8 },
+   { 0x00000040, 0x2ac02d29, 0x00000ac0, 0x00010001 },
+   { 0x01000010, 0x20002528, 0x00000ac0, 0x000000aa },
+   { 0x00010040, 0x2ac22d29, 0x00000ac2, 0x00010001 },
+   { 0x00010001, 0x2ac00169, 0x00000000, 0x00000000 },
+   { 0x00000020, 0x34001c00, 0x00001400, 0xfffffb30 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0a800031, 0x2b601ca1, 0x00000800, 0x0219e002 },
+   { 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+   { 0x07000031, 0x24001ca8, 0x00000e00, 0x82000010 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/shaders/utils/mfc_batchbuffer_hsw.inc b/src/shaders/utils/mfc_batchbuffer_hsw.inc
new file mode 100644 (file)
index 0000000..588006e
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2010-2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao@intel.com>
+ */
+
+/* GRF registers
+ * r0 header
+ * r1~r4 constant buffer (reserved)
+ * r5 inline data
+ * r6~r7 reserved      
+ * r8~r15 temporary registers
+ * r16 write back of Oword Block Write (NOTE: obw_wb is defined as null with length 0 below, so no write back is requested)
+ */
+
+/*
+ * GRF 0 -- header       
+ */        
+define(`thread_id_ub',          `r0.20<0,1,0>:UB')  /* thread id in payload */
+
+define(`inline_reg0',           `r5')
+define(`buffer_offset',                `inline_reg0.0') /* :ud, in units of Owords */
+/* :ub,
+ * bit0 indicates the frame type. 1 is the I-frame. 0 is P-B frame
+ */
+define(`mb_flag',              `inline_reg0.4')
+define(`qp_flag',              `inline_reg0.6') /* :ub */
+
+define(`mb_x',                  `inline_reg0.8') /* :ub, */
+define(`mb_y',                  `inline_reg0.9') /* :ub, */
+define(`mb_xy',                 `inline_reg0.8') /* :uw, */      
+/* :uw, the picture width in macroblocks */
+define(`width_in_mbs',           `inline_reg0.10') 
+/* :uw, the number of macroblock commands being processed by the kernel */
+define(`total_mbs',             `inline_reg0.12')
+/* :ub, the mb x/y of the last mb in the slice */
+define(`slice_end_x',           `inline_reg0.16')
+define(`slice_end_y',           `inline_reg0.17')
+
+/* :ud the forward reference picture list */
+define(`fwd_ref',              `inline_reg0.20')
+/* :ud the backward reference picture list */
+define(`bwd_ref',              `inline_reg0.24')
+
+/*
+ * GRF 8~15 -- temporary registers
+ */
+define(`tmp_reg0',              `r8')
+define(`tmp_reg1',              `r9')
+define(`tmp_reg2',              `r10')
+define(`tmp_reg3',              `r11')
+define(`tmp_reg4',              `r12')
+define(`tmp_reg5',              `r13')
+define(`tmp_reg6',              `r14')
+define(`tmp_reg7',              `r15')
+
+define(`obw_m0',                `tmp_reg7')
+
+define(`obw_wb',                `null<1>:W')
+define(`obw_wb_length',         `0')
+
+/*
+ * GRF 26~27 
+ */
+define(`pak_object_reg0',     `r26')
+define(`pak_object0_ud',      `r26.0')
+define(`pak_object1_ud',      `r26.4')
+define(`pak_object2_ud',      `r26.8')
+define(`pak_object3_ud',      `r26.12')
+define(`pak_object4_ud',      `r26.16')
+define(`pak_object5_ud',      `r26.20')
+define(`pak_object6_ud',      `r26.24')
+define(`pak_object7_ud',      `r26.28')
+
+define(`pak_object_reg1',     `r27')
+define(`pak_object8_ud',      `r27.0')
+define(`pak_object9_ud',      `r27.4')
+define(`pak_object10_ud',     `r27.8')
+define(`pak_object11_ud',     `r27.12')
+
+/*
+ * Message Payload registers
+ */
+define(`msg_ind',               `64')
+define(`msg_reg0',              `g64')
+define(`msg_reg1',              `g65')
+define(`msg_reg2',              `g66')
+define(`msg_reg3',              `g67')
+define(`msg_reg4',              `g68')
+define(`msg_reg5',              `g69')
+define(`msg_reg6',              `g70')
+define(`msg_reg7',              `g71')
+define(`msg_reg8',              `g72')
+
+define(`MV_BIND_IDX',           `0')
+define(`MFC_BIND_IDX',          `2')
+
+define(`ts_msg_ind',               `112')
+define(`ts_msg_reg0',               `r112')
+
+
+define(`MFC_AVC_PAK_OBJECT_DW0',  `0x7149000a')
+define(`MFC_AVC_PAK_OBJECT_DW4',  `0xFFFF0000')        /* CBP for Y */
+define(`MFC_AVC_PAK_OBJECT_DW5',  `0x000F000F')
+define(`MFC_AVC_PAK_OBJECT_DW10', `0x0000000')
+
+define(`OBR_MESSAGE_TYPE',              `0')
+define(`OBR_CACHE_TYPE',                `10')
+
+define(`OBR_MESSAGE_FENCE',              `7')
+define(`OBR_MF_NOCOMMIT',               `0')
+define(`OBR_MF_COMMIT',                         `0x20')
+
+define(`OBR_CONTROL_0',                 `0')    /* 1 OWord, low 128 bits */
+define(`OBR_CONTROL_1',                 `1')    /* 1 OWord, high 128 bits */
+define(`OBR_CONTROL_2',                 `2')    /* 2 OWords */
+define(`OBR_CONTROL_4',                 `3')    /* 4 OWords */
+define(`OBR_CONTROL_8',                 `4')    /* 8 OWords */
+
+define(`OBR_HEADER_PRESENT',            `1')
+define(`OBR_WRITE_COMMIT_CATEGORY',     `0')    /* category on Ivybridge */
+
+define(`OBW_WRITE_COMMIT_CATEGORY',     `0')    /* category on Ivybridge */
+
+define(`OBW_CACHE_TYPE',                `10')
+
+
+define(`OBW_MESSAGE_TYPE',              `8')
+
+define(`OBW_CONTROL_0',                 `0')    /* 1 OWord, low 128 bits */
+define(`OBW_CONTROL_1',                 `1')    /* 1 OWord, high 128 bits */
+define(`OBW_CONTROL_2',                 `2')    /* 2 OWords */
+define(`OBW_CONTROL_4',                 `3')    /* 4 OWords */
+define(`OBW_CONTROL_8',                 `4')    /* 8 OWords */
+define(`OBW_HEADER_PRESENT',            `1')
+
+define(`INTER_MASK',                   `0x03')
+define(`INTER_16X16MODE',              `0x0')
+define(`INTER_16X8MODE',               `0x01')
+define(`INTER_8X16MODE',               `0x02')
+define(`INTER_8X8MODE',                        `0x03')
+define(`SUBSHAPE_MASK',                `0xFF00')
+
+define(`mb_ind',                `90')
+define(`mb_msg0',              `r90')
+define(`mb_wb',                        `r91')
+define(`mb_intra_wb',          `r91')
+define(`mb_inter_wb',          `r92')
+define(`mb_mv0',               `r93')
+define(`mb_mv1',               `r94')
+define(`mb_mv2',               `r95')
+define(`mb_mv3',               `r96')
+
+define(`mb_temp',              `r86')
+define(`cur_mb_x',              `mb_temp.0') /* :uw, */
+define(`cur_mb_y',              `mb_temp.2') /* :uw, */
+define(`cur_loop_count',        `mb_temp.4') /* :uw, */
+define(`mb_end',               `r87')
+define(`end_mb_x',              `mb_end.0') /* :uw, */
+define(`end_mb_y',              `mb_end.2') /* :uw, */
+define(`end_loop_count',        `mb_end.4') /* :uw, */
+/* :ud, the length of the VME prediction result for every mb, in units of owords */
+define(`vme_len',               `mb_end.8') 
+define(`mb_cur_msg',           `r88')
+
+define(`INTRA_SLICE',          `0x0001')
+define(`MFC_AVC_PAK_LAST_MB',  `0x0400')
+
+define(`MFC_AVC_INTER_MASK_DW3',       `0x1F00FFFF')
+define(`MFC_AVC_INTRA_MASK_DW3',       `0x0000C0FF')
+define(`INTER_MV8',            `0x00400000')
+define(`INTER_MV32',           `0x00600000')
+define(`MFC_AVC_PAK_CBP',      `0x000E0000')
+define(`MFC_AVC_INTRA_FLAG',   `0x00002000')
+define(`AVC_INTRA_MASK',       `0x1F00')