gem_stress: render copy on gen3
author Daniel Vetter <daniel.vetter@ffwll.ch>
Sun, 27 Mar 2011 19:33:29 +0000 (21:33 +0200)
committer Daniel Vetter <daniel.vetter@ffwll.ch>
Tue, 29 Mar 2011 20:52:50 +0000 (22:52 +0200)
Headers copied over from xf86-video-intel, code built after the Xrender
support.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
lib/i915_3d.h [new file with mode: 0644]
lib/i915_reg.h [new file with mode: 0644]
tests/gem_stress.c

diff --git a/lib/i915_3d.h b/lib/i915_3d.h
new file mode 100644 (file)
index 0000000..04531f3
--- /dev/null
@@ -0,0 +1,619 @@
+/* -*- c-basic-offset: 4 -*- */
+/*
+ * Copyright © 2006,2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *    Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+/* Each instruction is 3 dwords long, though most don't require all
+ * this space.  Maximum of 123 instructions.  Smaller maxes per insn
+ * type.
+ */
+#define _3DSTATE_PIXEL_SHADER_PROGRAM    (CMD_3D|(0x1d<<24)|(0x5<<16))
+
+#define REG_TYPE_R                 0 /* temporary regs, no need to
+                                     * dcl, must be written before
+                                     * read -- Preserved between
+                                     * phases.
+                                     */
+#define REG_TYPE_T                 1 /* Interpolated values, must be
+                                     * dcl'ed before use.
+                                     *
+                                     * 0..7: texture coord,
+                                     * 8: diffuse spec,
+                                     * 9: specular color,
+                                     * 10: fog parameter in w.
+                                     */
+#define REG_TYPE_CONST             2 /* Restriction: only one const
+                                     * can be referenced per
+                                     * instruction, though it may be
+                                     * selected for multiple inputs.
+                                     * Constants not initialized
+                                     * default to zero.
+                                     */
+#define REG_TYPE_S                 3 /* sampler */
+#define REG_TYPE_OC                4 /* output color (rgba) */
+#define REG_TYPE_OD                5 /* output depth (w), xyz are
+                                     * temporaries.  If not written,
+                                     * interpolated depth is used?
+                                     */
+#define REG_TYPE_U                 6 /* unpreserved temporaries */
+/* Layout of the software register token used by the FS_* values and the
+ * REG_TYPE()/REG_NR() accessors: the register number lives in the low
+ * 4 bits (REG_NR_MASK) and the 3-bit register type sits directly above
+ * it, at REG_TYPE_SHIFT.
+ */
+#define REG_TYPE_MASK              0x7
+#define REG_TYPE_SHIFT            4
+#define REG_NR_MASK                0xf
+
+/* Register numbers within REG_TYPE_T (interpolated inputs): texture
+ * coordinate sets 0..7, followed by diffuse, specular and fog.
+ */
+#define T_TEX0     0
+#define T_TEX1     1
+#define T_TEX2     2
+#define T_TEX3     3
+#define T_TEX4     4
+#define T_TEX5     5
+#define T_TEX6     6
+#define T_TEX7     7
+#define T_DIFFUSE  8
+#define T_SPECULAR 9
+#define T_FOG_W    10          /* interpolated fog is in W coord */
+
+/* Arithmetic instructions */
+
+/* .replicate_swizzle == selection and replication of a particular
+ * scalar channel, ie., .xxxx, .yyyy, .zzzz or .wwww
+ */
+#define A0_NOP    (0x0<<24)            /* no operation */
+#define A0_ADD    (0x1<<24)            /* dst = src0 + src1 */
+#define A0_MOV    (0x2<<24)            /* dst = src0 */
+#define A0_MUL    (0x3<<24)            /* dst = src0 * src1 */
+#define A0_MAD    (0x4<<24)            /* dst = src0 * src1 + src2 */
+#define A0_DP2ADD (0x5<<24)            /* dst.xyzw = src0.xy dot src1.xy + src2.replicate_swizzle */
+#define A0_DP3    (0x6<<24)            /* dst.xyzw = src0.xyz dot src1.xyz */
+#define A0_DP4    (0x7<<24)            /* dst.xyzw = src0.xyzw dot src1.xyzw */
+#define A0_FRC    (0x8<<24)            /* dst = src0 - floor(src0) */
+#define A0_RCP    (0x9<<24)            /* dst.xyzw = 1/(src0.replicate_swizzle) */
+#define A0_RSQ    (0xa<<24)            /* dst.xyzw = 1/(sqrt(abs(src0.replicate_swizzle))) */
+#define A0_EXP    (0xb<<24)            /* dst.xyzw = exp2(src0.replicate_swizzle) */
+#define A0_LOG    (0xc<<24)            /* dst.xyzw = log2(abs(src0.replicate_swizzle)) */
+#define A0_CMP    (0xd<<24)            /* dst = (src0 >= 0.0) ? src1 : src2 */
+#define A0_MIN    (0xe<<24)            /* dst = (src0 < src1) ? src0 : src1 */
+#define A0_MAX    (0xf<<24)            /* dst = (src0 >= src1) ? src0 : src1 */
+#define A0_FLR    (0x10<<24)           /* dst = floor(src0) */
+#define A0_MOD    (0x11<<24)           /* dst = src0 fmod 1.0 */
+#define A0_TRC    (0x12<<24)           /* dst = int(src0) */
+#define A0_SGE    (0x13<<24)           /* dst = src0 >= src1 ? 1.0 : 0.0 */
+#define A0_SLT    (0x14<<24)           /* dst = src0 < src1 ? 1.0 : 0.0 */
+#define A0_DEST_SATURATE                 (1<<22)
+#define A0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+#define A0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define A0_DEST_CHANNEL_X                (1<<10)
+#define A0_DEST_CHANNEL_Y                (2<<10)
+#define A0_DEST_CHANNEL_Z                (4<<10)
+#define A0_DEST_CHANNEL_W                (8<<10)
+#define A0_DEST_CHANNEL_ALL              (0xf<<10)
+#define A0_DEST_CHANNEL_SHIFT            10
+#define A0_SRC0_TYPE_SHIFT               7
+#define A0_SRC0_NR_SHIFT                 2
+
+#define A0_DEST_CHANNEL_XY              (A0_DEST_CHANNEL_X|A0_DEST_CHANNEL_Y)
+#define A0_DEST_CHANNEL_XYZ             (A0_DEST_CHANNEL_XY|A0_DEST_CHANNEL_Z)
+
+#define SRC_X        0
+#define SRC_Y        1
+#define SRC_Z        2
+#define SRC_W        3
+#define SRC_ZERO     4
+#define SRC_ONE      5
+
+/* Dword 1 of an arithmetic instruction: source 0 swizzle/negate fields,
+ * source 1 register selection and the X/Y half of the source 1 swizzle.
+ *
+ * Bit 31 is spelled 1u<<31: shifting a signed 1 into the sign bit of a
+ * 32-bit int is undefined behaviour in C.
+ */
+#define A1_SRC0_CHANNEL_X_NEGATE         (1u<<31)
+#define A1_SRC0_CHANNEL_X_SHIFT          28
+#define A1_SRC0_CHANNEL_Y_NEGATE         (1<<27)
+#define A1_SRC0_CHANNEL_Y_SHIFT          24
+#define A1_SRC0_CHANNEL_Z_NEGATE         (1<<23)
+#define A1_SRC0_CHANNEL_Z_SHIFT          20
+#define A1_SRC0_CHANNEL_W_NEGATE         (1<<19)
+#define A1_SRC0_CHANNEL_W_SHIFT          16
+#define A1_SRC1_TYPE_SHIFT               13
+#define A1_SRC1_NR_SHIFT                 8
+#define A1_SRC1_CHANNEL_X_NEGATE         (1<<7)
+#define A1_SRC1_CHANNEL_X_SHIFT          4
+#define A1_SRC1_CHANNEL_Y_NEGATE         (1<<3)
+#define A1_SRC1_CHANNEL_Y_SHIFT          0
+
+/* Dword 2 of an arithmetic instruction: the Z/W half of the source 1
+ * swizzle followed by the source 2 register selection and swizzle.
+ * Bit 31 uses an unsigned constant for the same reason as above.
+ */
+#define A2_SRC1_CHANNEL_Z_NEGATE         (1u<<31)
+#define A2_SRC1_CHANNEL_Z_SHIFT          28
+#define A2_SRC1_CHANNEL_W_NEGATE         (1<<27)
+#define A2_SRC1_CHANNEL_W_SHIFT          24
+#define A2_SRC2_TYPE_SHIFT               21
+#define A2_SRC2_NR_SHIFT                 16
+#define A2_SRC2_CHANNEL_X_NEGATE         (1<<15)
+#define A2_SRC2_CHANNEL_X_SHIFT          12
+#define A2_SRC2_CHANNEL_Y_NEGATE         (1<<11)
+#define A2_SRC2_CHANNEL_Y_SHIFT          8
+#define A2_SRC2_CHANNEL_Z_NEGATE         (1<<7)
+#define A2_SRC2_CHANNEL_Z_SHIFT          4
+#define A2_SRC2_CHANNEL_W_NEGATE         (1<<3)
+#define A2_SRC2_CHANNEL_W_SHIFT          0
+
+/* Texture instructions */
+#define T0_TEXLD     (0x15<<24)        /* Sample texture using predeclared
+                                * sampler and address, and output
+                                * filtered texel data to destination
+                                * register */
+#define T0_TEXLDP    (0x16<<24)        /* Same as texld but performs a
+                                * perspective divide of the texture
+                                * coordinate .xyz values by .w before
+                                * sampling. */
+#define T0_TEXLDB    (0x17<<24)        /* Same as texld but biases the
+                                * computed LOD by w.  Only S4.6 two's
+                                * comp is used.  This implies that a
+                                * float to fixed conversion is
+                                * done. */
+#define T0_TEXKILL   (0x18<<24)        /* Does not perform a sampling
+                                * operation.  Simply kills the pixel
+                                * if any channel of the address
+                                * register is < 0.0. */
+#define T0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+/* Note: U (unpreserved) regs do not retain their values between
+ * phases (cannot be used for feedback)
+ *
+ * Note: oC and OD registers can only be used as the destination of a
+ * texture instruction once per phase (this is an implementation
+ * restriction).
+ */
+#define T0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define T0_SAMPLER_NR_SHIFT              0 /* This field ignored for TEXKILL */
+#define T0_SAMPLER_NR_MASK               (0xf<<0)
+
+#define T1_ADDRESS_REG_TYPE_SHIFT        24 /* Reg to use as texture coord */
+/* Allow R, T, OC, OD -- R, OC, OD are 'dependent' reads, new program phase */
+#define T1_ADDRESS_REG_NR_SHIFT          17
+#define T2_MBZ                           0
+
+/* Declaration instructions */
+#define D0_DCL       (0x19<<24)        /* Declare a t (interpolated attrib)
+                                * register or an s (sampler)
+                                * register. */
+#define D0_SAMPLE_TYPE_SHIFT              22
+#define D0_SAMPLE_TYPE_2D                 (0x0<<22)
+#define D0_SAMPLE_TYPE_CUBE               (0x1<<22)
+#define D0_SAMPLE_TYPE_VOLUME             (0x2<<22)
+#define D0_SAMPLE_TYPE_MASK               (0x3<<22)
+
+#define D0_TYPE_SHIFT                19
+/* Allow: T, S */
+#define D0_NR_SHIFT                  14
+/* Allow T: 0..10, S: 0..15 */
+#define D0_CHANNEL_X                (1<<10)
+#define D0_CHANNEL_Y                (2<<10)
+#define D0_CHANNEL_Z                (4<<10)
+#define D0_CHANNEL_W                (8<<10)
+#define D0_CHANNEL_ALL              (0xf<<10)
+#define D0_CHANNEL_NONE             (0<<10)
+
+#define D0_CHANNEL_XY               (D0_CHANNEL_X|D0_CHANNEL_Y)
+#define D0_CHANNEL_XYZ              (D0_CHANNEL_XY|D0_CHANNEL_Z)
+
+/* I915 Errata: Do not allow (xz), (xw), (xzw) combinations for diffuse
+ * or specular declarations.
+ *
+ * For T dcls, only allow: (x), (xy), (xyz), (w), (xyzw)
+ *
+ * Must be zero for S (sampler) dcls
+ */
+#define D1_MBZ                          0
+#define D2_MBZ                          0
+
+
+/* MASK_* are the unshifted bitmasks of the destination mask in arithmetic
+ * operations.
+ *
+ * MASK_SATURATE is a software-only flag carried alongside the channel
+ * bits: _i915_fs_arith_masked() strips it from the channel field and sets
+ * A0_DEST_SATURATE in the instruction instead.
+ */
+#define MASK_X                 0x1
+#define MASK_Y                 0x2
+#define MASK_Z                 0x4
+#define MASK_W                 0x8
+#define MASK_XYZ               (MASK_X | MASK_Y | MASK_Z)
+#define MASK_XYZW              (MASK_XYZ | MASK_W)
+#define MASK_SATURATE          0x10
+
+/* Temporary, undeclared regs. Preserved between phases */
+#define FS_R0                  ((REG_TYPE_R << REG_TYPE_SHIFT) | 0)
+#define FS_R1                  ((REG_TYPE_R << REG_TYPE_SHIFT) | 1)
+#define FS_R2                  ((REG_TYPE_R << REG_TYPE_SHIFT) | 2)
+#define FS_R3                  ((REG_TYPE_R << REG_TYPE_SHIFT) | 3)
+
+/* Texture coordinate regs.  Must be declared. */
+#define FS_T0                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 0)
+#define FS_T1                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 1)
+#define FS_T2                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 2)
+#define FS_T3                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 3)
+#define FS_T4                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 4)
+#define FS_T5                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 5)
+#define FS_T6                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 6)
+#define FS_T7                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 7)
+#define FS_T8                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 8)
+#define FS_T9                  ((REG_TYPE_T << REG_TYPE_SHIFT) | 9)
+#define FS_T10                 ((REG_TYPE_T << REG_TYPE_SHIFT) | 10)
+
+/* Constant values */
+#define FS_C0                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 0)
+#define FS_C1                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 1)
+#define FS_C2                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 2)
+#define FS_C3                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 3)
+#define FS_C4                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 4)
+#define FS_C5                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 5)
+#define FS_C6                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 6)
+#define FS_C7                  ((REG_TYPE_CONST << REG_TYPE_SHIFT) | 7)
+
+/* Sampler regs */
+#define FS_S0                  ((REG_TYPE_S << REG_TYPE_SHIFT) | 0)
+#define FS_S1                  ((REG_TYPE_S << REG_TYPE_SHIFT) | 1)
+#define FS_S2                  ((REG_TYPE_S << REG_TYPE_SHIFT) | 2)
+#define FS_S3                  ((REG_TYPE_S << REG_TYPE_SHIFT) | 3)
+
+/* Output color */
+#define FS_OC                  ((REG_TYPE_OC << REG_TYPE_SHIFT) | 0)
+
+/* Output depth */
+#define FS_OD                  ((REG_TYPE_OD << REG_TYPE_SHIFT) | 0)
+
+/* Unpreserved temporary regs.
+ *
+ * NOTE(review): the destination-register comments for the A0 and T0 fields
+ * in this file list U registers as 0..2, so FS_U3 may not name a valid
+ * hardware register -- confirm against the hardware documentation.
+ */
+#define FS_U0                  ((REG_TYPE_U << REG_TYPE_SHIFT) | 0)
+#define FS_U1                  ((REG_TYPE_U << REG_TYPE_SHIFT) | 1)
+#define FS_U2                  ((REG_TYPE_U << REG_TYPE_SHIFT) | 2)
+#define FS_U3                  ((REG_TYPE_U << REG_TYPE_SHIFT) | 3)
+
+/* An operand token extends the register token with four 4-bit channel
+ * selectors stacked directly above the register type field: X starts at
+ * bit 7, Y at bit 11, Z at bit 15 and W at bit 19.  i915_fs_operand()
+ * stores an enum i915_fs_channel value in each selector.
+ */
+#define X_CHANNEL_SHIFT (REG_TYPE_SHIFT + 3)
+#define Y_CHANNEL_SHIFT (X_CHANNEL_SHIFT + 4)
+#define Z_CHANNEL_SHIFT (Y_CHANNEL_SHIFT + 4)
+#define W_CHANNEL_SHIFT (Z_CHANNEL_SHIFT + 4)
+
+#define REG_CHANNEL_MASK 0xf
+
+/* Field accessors for register/operand tokens: register number, register
+ * type, and the per-channel selectors described above.
+ */
+#define REG_NR(reg)            ((reg) & REG_NR_MASK)
+#define REG_TYPE(reg)          (((reg) >> REG_TYPE_SHIFT) & REG_TYPE_MASK)
+#define REG_X(reg)             (((reg) >> X_CHANNEL_SHIFT) & REG_CHANNEL_MASK)
+#define REG_Y(reg)             (((reg) >> Y_CHANNEL_SHIFT) & REG_CHANNEL_MASK)
+#define REG_Z(reg)             (((reg) >> Z_CHANNEL_SHIFT) & REG_CHANNEL_MASK)
+#define REG_W(reg)             (((reg) >> W_CHANNEL_SHIFT) & REG_CHANNEL_MASK)
+
+/* Software channel selector stored in an operand token.
+ *
+ * Values 0..5 have the same numbering as SRC_X..SRC_ONE: a source channel
+ * (x, y, z, w) or the constant zero/one.  Bit 0x8 marks the selection as
+ * negated; i915_get_hardware_channel_val() turns that flag into the
+ * matching *_NEGATE instruction bit.
+ */
+enum i915_fs_channel {
+       X_CHANNEL_VAL = 0,
+       Y_CHANNEL_VAL,
+       Z_CHANNEL_VAL,
+       W_CHANNEL_VAL,
+       ZERO_CHANNEL_VAL,
+       ONE_CHANNEL_VAL,
+
+       NEG_X_CHANNEL_VAL = X_CHANNEL_VAL | 0x8,
+       NEG_Y_CHANNEL_VAL = Y_CHANNEL_VAL | 0x8,
+       NEG_Z_CHANNEL_VAL = Z_CHANNEL_VAL | 0x8,
+       NEG_W_CHANNEL_VAL = W_CHANNEL_VAL | 0x8,
+       NEG_ONE_CHANNEL_VAL = ONE_CHANNEL_VAL | 0x8
+};
+
+/**
+ * Build an operand token from a register token and one channel selector
+ * per component.  x/y/z/w are pasted onto _CHANNEL_VAL, so pass the bare
+ * enum i915_fs_channel prefixes (X, NEG_Y, ZERO, ONE, ...).
+ *
+ * The whole expansion is parenthesized so that it can be combined with
+ * other operators at the call site without precedence surprises.
+ */
+#define i915_fs_operand(reg, x, y, z, w) \
+       ((reg) | \
+        (x##_CHANNEL_VAL << X_CHANNEL_SHIFT) | \
+        (y##_CHANNEL_VAL << Y_CHANNEL_SHIFT) | \
+        (z##_CHANNEL_VAL << Z_CHANNEL_SHIFT) | \
+        (w##_CHANNEL_VAL << W_CHANNEL_SHIFT))
+
+/**
+ * Construct an operand description for using a register with no swizzling
+ */
+#define i915_fs_operand_reg(reg)                                       \
+       i915_fs_operand(reg, X, Y, Z, W)
+
+/**
+ * Construct an operand description for using a register with no swizzling
+ * and every channel negated
+ */
+#define i915_fs_operand_reg_negate(reg)                                        \
+       i915_fs_operand(reg, NEG_X, NEG_Y, NEG_Z, NEG_W)
+
+/**
+ * Returns an operand containing (0.0, 0.0, 0.0, 0.0).
+ */
+#define i915_fs_operand_zero() i915_fs_operand(FS_R0, ZERO, ZERO, ZERO, ZERO)
+
+/**
+ * Returns an unused operand
+ */
+#define i915_fs_operand_none() i915_fs_operand_zero()
+
+/**
+ * Returns an operand containing (1.0, 1.0, 1.0, 1.0).
+ */
+#define i915_fs_operand_one() i915_fs_operand(FS_R0, ONE, ONE, ONE, ONE)
+
+/**
+ * Translate a software channel selector (enum i915_fs_channel) into
+ * hardware instruction bits: the low three bits are placed at 'shift' and
+ * the 0x8 negate flag is replaced by the field's 'negate' bit.
+ *
+ * Arguments are parenthesized so that compound expressions such as
+ * "a | b" are evaluated as a unit; 'val' is still evaluated twice.
+ */
+#define i915_get_hardware_channel_val(val, shift, negate) \
+       ((((val) & 0x7) << (shift)) | (((val) & 0x8) ? (negate) : 0))
+
+/**
+ * Outputs a fragment shader command to declare a sampler or texture register.
+ *
+ * Emits three dwords through OUT_BATCH(); the last two are zero.  All four
+ * channels are declared for non-sampler registers, while the channel field
+ * is left zero for sampler registers.  'reg' is evaluated more than once.
+ */
+#define i915_fs_dcl(reg)                                               \
+       do {                                                                    \
+               OUT_BATCH(D0_DCL | \
+                         (REG_TYPE(reg) << D0_TYPE_SHIFT) | \
+                         (REG_NR(reg) << D0_NR_SHIFT) | \
+                         ((REG_TYPE(reg) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0)); \
+               OUT_BATCH(0); \
+               OUT_BATCH(0); \
+       } while (0)
+
+/**
+ * Emits a three-dword TEXLD instruction: sample through sampler_reg using
+ * address_reg as the texture coordinate and write the result to dest_reg.
+ * Only the register number of sampler_reg is encoded; the last dword is
+ * always zero.
+ */
+#define i915_fs_texld(dest_reg, sampler_reg, address_reg)              \
+       do {                                                                    \
+               OUT_BATCH(T0_TEXLD | \
+                         (REG_TYPE(dest_reg) << T0_DEST_TYPE_SHIFT) | \
+                         (REG_NR(dest_reg) << T0_DEST_NR_SHIFT) | \
+                         (REG_NR(sampler_reg) << T0_SAMPLER_NR_SHIFT)); \
+               OUT_BATCH((REG_TYPE(address_reg) << T1_ADDRESS_REG_TYPE_SHIFT) | \
+                         (REG_NR(address_reg) << T1_ADDRESS_REG_NR_SHIFT)); \
+               OUT_BATCH(0); \
+       } while (0)
+
+/**
+ * Same encoding as i915_fs_texld() but with the TEXLDP opcode, which
+ * performs a perspective divide of the texture coordinate .xyz values by
+ * .w before sampling.
+ */
+#define i915_fs_texldp(dest_reg, sampler_reg, address_reg)             \
+       do {                                                                    \
+               OUT_BATCH(T0_TEXLDP | \
+                         (REG_TYPE(dest_reg) << T0_DEST_TYPE_SHIFT) | \
+                         (REG_NR(dest_reg) << T0_DEST_NR_SHIFT) | \
+                         (REG_NR(sampler_reg) << T0_SAMPLER_NR_SHIFT)); \
+               OUT_BATCH((REG_TYPE(address_reg) << T1_ADDRESS_REG_TYPE_SHIFT) | \
+                         (REG_NR(address_reg) << T1_ADDRESS_REG_NR_SHIFT)); \
+               OUT_BATCH(0); \
+       } while (0)
+
+/* Front ends that take the bare opcode name (MOV, ADD, ...) and paste the
+ * A0_ prefix onto it.  The _masked form takes an explicit MASK_*
+ * destination mask; the plain form writes all four channels.
+ */
+#define i915_fs_arith_masked(op, dest_reg, dest_mask, operand0, operand1, operand2)    \
+       _i915_fs_arith_masked(A0_##op, dest_reg, dest_mask, operand0, operand1, operand2)
+
+#define i915_fs_arith(op, dest_reg, operand0, operand1, operand2)      \
+       _i915_fs_arith(A0_##op, dest_reg, operand0, operand1, operand2)
+
+/**
+ * Emit a three-dword arithmetic instruction through OUT_BATCH().
+ *
+ * Dword 0 carries the opcode (cmd), the destination register, the write
+ * mask and the source 0 register; dword 1 carries the source 0 swizzle,
+ * the source 1 register and the X/Y part of its swizzle; dword 2 carries
+ * the Z/W part of the source 1 swizzle and all of source 2.
+ *
+ * dest_mask is a combination of MASK_* bits.  MASK_SATURATE is removed
+ * from the channel field and turned into A0_DEST_SATURATE.  Every macro
+ * argument is expanded several times, so arguments must be free of side
+ * effects.
+ */
+#define _i915_fs_arith_masked(cmd, dest_reg, dest_mask, operand0, operand1, operand2) \
+       do { \
+               /* Set up destination register and write mask */ \
+               OUT_BATCH(cmd | \
+                         (REG_TYPE(dest_reg) << A0_DEST_TYPE_SHIFT) | \
+                         (REG_NR(dest_reg) << A0_DEST_NR_SHIFT) | \
+                         (((dest_mask) & ~MASK_SATURATE) << A0_DEST_CHANNEL_SHIFT) | \
+                         (((dest_mask) & MASK_SATURATE) ? A0_DEST_SATURATE : 0) | \
+                         /* Set up operand 0 */ \
+                         (REG_TYPE(operand0) << A0_SRC0_TYPE_SHIFT) | \
+                         (REG_NR(operand0) << A0_SRC0_NR_SHIFT)); \
+               OUT_BATCH(i915_get_hardware_channel_val(REG_X(operand0), \
+                                                       A1_SRC0_CHANNEL_X_SHIFT, \
+                                                       A1_SRC0_CHANNEL_X_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_Y(operand0), \
+                                                       A1_SRC0_CHANNEL_Y_SHIFT, \
+                                                       A1_SRC0_CHANNEL_Y_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_Z(operand0), \
+                                                       A1_SRC0_CHANNEL_Z_SHIFT, \
+                                                       A1_SRC0_CHANNEL_Z_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_W(operand0), \
+                                                       A1_SRC0_CHANNEL_W_SHIFT, \
+                                                       A1_SRC0_CHANNEL_W_NEGATE) | \
+                         /* Set up operand 1 */ \
+                         (REG_TYPE(operand1) << A1_SRC1_TYPE_SHIFT) | \
+                         (REG_NR(operand1) << A1_SRC1_NR_SHIFT) | \
+                         i915_get_hardware_channel_val(REG_X(operand1), \
+                                                       A1_SRC1_CHANNEL_X_SHIFT, \
+                                                       A1_SRC1_CHANNEL_X_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_Y(operand1), \
+                                                       A1_SRC1_CHANNEL_Y_SHIFT, \
+                                                       A1_SRC1_CHANNEL_Y_NEGATE)); \
+               OUT_BATCH(i915_get_hardware_channel_val(REG_Z(operand1), \
+                                                       A2_SRC1_CHANNEL_Z_SHIFT, \
+                                                       A2_SRC1_CHANNEL_Z_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_W(operand1), \
+                                                       A2_SRC1_CHANNEL_W_SHIFT, \
+                                                       A2_SRC1_CHANNEL_W_NEGATE) | \
+                         /* Set up operand 2 */ \
+                         (REG_TYPE(operand2) << A2_SRC2_TYPE_SHIFT) | \
+                         (REG_NR(operand2) << A2_SRC2_NR_SHIFT) | \
+                         i915_get_hardware_channel_val(REG_X(operand2), \
+                                                       A2_SRC2_CHANNEL_X_SHIFT, \
+                                                       A2_SRC2_CHANNEL_X_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_Y(operand2), \
+                                                       A2_SRC2_CHANNEL_Y_SHIFT, \
+                                                       A2_SRC2_CHANNEL_Y_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_Z(operand2), \
+                                                       A2_SRC2_CHANNEL_Z_SHIFT, \
+                                                       A2_SRC2_CHANNEL_Z_NEGATE) | \
+                         i915_get_hardware_channel_val(REG_W(operand2), \
+                                                       A2_SRC2_CHANNEL_W_SHIFT, \
+                                                       A2_SRC2_CHANNEL_W_NEGATE)); \
+       } while (0)
+
+/**
+ * Emit a three-dword arithmetic instruction that writes all four
+ * destination channels without saturation.
+ *
+ * This is _i915_fs_arith_masked() with a MASK_XYZW destination mask: that
+ * mask has no MASK_SATURATE bit and shifts to exactly A0_DEST_CHANNEL_ALL,
+ * so the emitted dwords are the same as a hand-written unmasked encoding
+ * and the instruction layout is kept in a single place.
+ */
+#define _i915_fs_arith(cmd, dest_reg, operand0, operand1, operand2) \
+       _i915_fs_arith_masked(cmd, dest_reg, MASK_XYZW, \
+                             operand0, operand1, operand2)
+
+/** Copy operand0 into all four channels of dest_reg */
+#define i915_fs_mov(dest_reg, operand0)                                        \
+       i915_fs_arith(MOV, dest_reg, \
+                     operand0,                 \
+                     i915_fs_operand_none(),                   \
+                     i915_fs_operand_none())
+
+/**
+ * Copy operand0 into the channels of dest_reg selected by dest_mask
+ * (MASK_* bits, optionally combined with MASK_SATURATE)
+ */
+#define i915_fs_mov_masked(dest_reg, dest_mask, operand0)              \
+       i915_fs_arith_masked (MOV, dest_reg, dest_mask, \
+                             operand0, \
+                             i915_fs_operand_none(), \
+                             i915_fs_operand_none())
+
+
+/** Put the fractional part of operand0 (src0 - floor(src0)) in dest_reg */
+#define i915_fs_frc(dest_reg, operand0)                                        \
+       i915_fs_arith (FRC, dest_reg, \
+                      operand0,                        \
+                      i915_fs_operand_none(),                  \
+                      i915_fs_operand_none())
+
+/** Add operand0 and operand1 and put the result in dest_reg */
+#define i915_fs_add(dest_reg, operand0, operand1)                      \
+       i915_fs_arith (ADD, dest_reg, \
+                      operand0, operand1,      \
+                      i915_fs_operand_none())
+
+/** Multiply operand0 and operand1 and put the result in dest_reg */
+#define i915_fs_mul(dest_reg, operand0, operand1)                      \
+       i915_fs_arith (MUL, dest_reg, \
+                      operand0, operand1,      \
+                      i915_fs_operand_none())
+
+/**
+ * Computes 1/sqrt(abs(operand0.replicate_swizzle)) (see A0_RSQ) and puts
+ * the result in dest_reg.
+ *
+ * A dest_mask of 0 selects the unmasked form, which writes all four
+ * channels; otherwise only the channels named in dest_mask are written.
+ */
+#define i915_fs_rsq(dest_reg, dest_mask, operand0)             \
+       do {                                                                    \
+               if (dest_mask) {                                                        \
+                       i915_fs_arith_masked (RSQ, dest_reg, dest_mask, \
+                                             operand0,                 \
+                                             i915_fs_operand_none (),                  \
+                                             i915_fs_operand_none ());                 \
+               } else { \
+                       i915_fs_arith (RSQ, dest_reg, \
+                                      operand0, \
+                                      i915_fs_operand_none (), \
+                                      i915_fs_operand_none ()); \
+               } \
+       } while (0)
+
+/** Puts the minimum of operand0 and operand1 in dest_reg */
+#define i915_fs_min(dest_reg, operand0, operand1)                      \
+       i915_fs_arith (MIN, dest_reg, \
+                      operand0, operand1, \
+                      i915_fs_operand_none())
+
+/** Puts the maximum of operand0 and operand1 in dest_reg */
+#define i915_fs_max(dest_reg, operand0, operand1)                      \
+       i915_fs_arith (MAX, dest_reg, \
+                      operand0, operand1, \
+                      i915_fs_operand_none())
+
+/** dest_reg = (operand0 >= 0.0) ? operand1 : operand2 */
+#define i915_fs_cmp(dest_reg, operand0, operand1, operand2)            \
+       i915_fs_arith (CMP, dest_reg, operand0, operand1, operand2)
+
+/**
+ * Perform operand0 * operand1 + operand2 and put the result in dest_reg.
+ *
+ * A dest_mask of 0 selects the unmasked form, which writes all four
+ * channels; otherwise only the channels named in dest_mask are written.
+ */
+#define i915_fs_mad(dest_reg, dest_mask, op0, op1, op2)        \
+       do {                                                                    \
+               if (dest_mask) {                                                        \
+                       i915_fs_arith_masked (MAD, dest_reg, dest_mask, op0, op1, op2); \
+               } else { \
+                       i915_fs_arith (MAD, dest_reg, op0, op1, op2); \
+               } \
+       } while (0)
+
+/**
+ * Computes op0.xy dot op1.xy + op2.replicate_swizzle (see A0_DP2ADD) and
+ * puts the result in dest_reg.
+ *
+ * A dest_mask of 0 selects the unmasked form, which writes all four
+ * channels; otherwise only the channels named in dest_mask are written.
+ */
+#define i915_fs_dp2add(dest_reg, dest_mask, op0, op1, op2)     \
+       do {                                                                    \
+               if (dest_mask) {                                                        \
+                       i915_fs_arith_masked (DP2ADD, dest_reg, dest_mask, op0, op1, op2); \
+               } else { \
+                       i915_fs_arith (DP2ADD, dest_reg, op0, op1, op2); \
+               } \
+       } while (0)
+
+/**
+ * Perform a 3-component dot-product of operand0 and operand1 and put the
+ * resulting scalar in the channels of dest_reg specified by the dest_mask.
+ */
+#define i915_fs_dp3(dest_reg, dest_mask, op0, op1)     \
+       do {                                                                    \
+               if (dest_mask) {                                                        \
+                       i915_fs_arith_masked (DP3, dest_reg, dest_mask, \
+                                             op0, op1,\
+                                             i915_fs_operand_none());                  \
+               } else { \
+                       i915_fs_arith (DP3, dest_reg, op0, op1,\
+                                      i915_fs_operand_none());                 \
+               } \
+       } while (0)
+
+/**
+ * Sets up local state for accumulating a fragment shader buffer.
+ *
+ * Takes no parameters.  Declares the _shader_offset variable that
+ * FS_BEGIN() and FS_END() use to remember where the shader program header
+ * goes in the batch.
+ */
+#define FS_LOCALS()                                                    \
+       uint32_t _shader_offset
+
+/**
+ * Starts a fragment shader program: records the current batch position in
+ * _shader_offset and advances intel->batch_used by one, leaving a dword
+ * for the header that FS_END() fills in.  Needs FS_LOCALS() and a variable
+ * named 'intel' in scope.
+ */
+#define FS_BEGIN()                                                     \
+       do {                                                                    \
+               _shader_offset = intel->batch_used++;                           \
+       } while (0)
+
+/**
+ * Finishes a fragment shader program started with FS_BEGIN(): writes the
+ * _3DSTATE_PIXEL_SHADER_PROGRAM header into the reserved dword, with the
+ * length field set to the program's total dword count (header included)
+ * minus two.
+ *
+ * There is deliberately no semicolon after while (0): the caller supplies
+ * it, so "FS_END();" is a single statement and stays valid inside an
+ * unbraced if/else.
+ */
+#define FS_END()                                                       \
+       do {                                                                    \
+               intel->batch_ptr[_shader_offset] =                                      \
+               _3DSTATE_PIXEL_SHADER_PROGRAM |                                 \
+               (intel->batch_used - _shader_offset - 2);                       \
+       } while (0)
diff --git a/lib/i915_reg.h b/lib/i915_reg.h
new file mode 100644 (file)
index 0000000..746a413
--- /dev/null
@@ -0,0 +1,844 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef _I915_REG_H_
+#define _I915_REG_H_
+
+/* Clears the bits of mask in var, then ORs value into var.  var is expanded
+ * twice, so pass an lvalue without side effects. */
+#define I915_SET_FIELD( var, mask, value ) ((var) &= ~(mask), (var) |= (value))
+
+#define CMD_3D (0x3<<29)
+
+#define PRIM3D                 (CMD_3D | (0x1f<<24))
+#define PRIM3D_INDIRECT_SEQUENTIAL      ((1<<23) | (0<<17))
+#define PRIM3D_TRILIST         (PRIM3D | (0x0<<18))
+#define PRIM3D_TRISTRIP        (PRIM3D | (0x1<<18))
+#define PRIM3D_TRISTRIP_RVRSE  (PRIM3D | (0x2<<18))
+#define PRIM3D_TRIFAN          (PRIM3D | (0x3<<18))
+#define PRIM3D_POLY            (PRIM3D | (0x4<<18))
+#define PRIM3D_LINELIST        (PRIM3D | (0x5<<18))
+#define PRIM3D_LINESTRIP       (PRIM3D | (0x6<<18))
+#define PRIM3D_RECTLIST        (PRIM3D | (0x7<<18))
+#define PRIM3D_POINTLIST       (PRIM3D | (0x8<<18))
+#define PRIM3D_DIB             (PRIM3D | (0x9<<18))
+#define PRIM3D_CLEAR_RECT      (PRIM3D | (0xa<<18))
+#define PRIM3D_ZONE_INIT       (PRIM3D | (0xd<<18))
+#define PRIM3D_MASK            (0x1f<<18)
+
+/* p137 */
+#define _3DSTATE_AA_CMD                        (CMD_3D | (0x06<<24))
+#define AA_LINE_ECAAR_WIDTH_ENABLE     (1<<16)
+#define AA_LINE_ECAAR_WIDTH_0_5        0
+#define AA_LINE_ECAAR_WIDTH_1_0                (1<<14)
+#define AA_LINE_ECAAR_WIDTH_2_0        (2<<14)
+#define AA_LINE_ECAAR_WIDTH_4_0        (3<<14)
+#define AA_LINE_REGION_WIDTH_ENABLE    (1<<8)
+#define AA_LINE_REGION_WIDTH_0_5       0
+#define AA_LINE_REGION_WIDTH_1_0       (1<<6)
+#define AA_LINE_REGION_WIDTH_2_0       (2<<6)
+#define AA_LINE_REGION_WIDTH_4_0       (3<<6)
+
+/* 3DSTATE_BACKFACE_STENCIL_OPS, p138*/
+#define _3DSTATE_BACKFACE_STENCIL_OPS    (CMD_3D | (0x8<<24))
+#define BFO_ENABLE_STENCIL_REF          (1<<23)
+#define BFO_STENCIL_REF_SHIFT           15
+#define BFO_STENCIL_REF_MASK            (0xff<<15)
+#define BFO_ENABLE_STENCIL_FUNCS        (1<<14)
+#define BFO_STENCIL_TEST_SHIFT          11
+#define BFO_STENCIL_TEST_MASK           (0x7<<11)
+#define BFO_STENCIL_FAIL_SHIFT          8
+#define BFO_STENCIL_FAIL_MASK           (0x7<<8)
+#define BFO_STENCIL_PASS_Z_FAIL_SHIFT   5
+#define BFO_STENCIL_PASS_Z_FAIL_MASK    (0x7<<5)
+#define BFO_STENCIL_PASS_Z_PASS_SHIFT   2
+#define BFO_STENCIL_PASS_Z_PASS_MASK    (0x7<<2)
+#define BFO_ENABLE_STENCIL_TWO_SIDE     (1<<1)
+#define BFO_STENCIL_TWO_SIDE            (1<<0)
+
+/* 3DSTATE_BACKFACE_STENCIL_MASKS, p140 */
+#define _3DSTATE_BACKFACE_STENCIL_MASKS    (CMD_3D | (0x9<<24))
+#define BFM_ENABLE_STENCIL_TEST_MASK      (1<<17)
+#define BFM_ENABLE_STENCIL_WRITE_MASK     (1<<16)
+#define BFM_STENCIL_TEST_MASK_SHIFT       8
+#define BFM_STENCIL_TEST_MASK_MASK        (0xff<<8)
+#define BFM_STENCIL_WRITE_MASK_SHIFT      0
+#define BFM_STENCIL_WRITE_MASK_MASK       (0xff<<0)
+
+/* 3DSTATE_BIN_CONTROL p141 */
+
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD  (CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK   (0x3<<24)
+#define BUF_3D_ID_DEPTH        (0x7<<24)
+#define BUF_3D_USE_FENCE       (1<<23)
+#define BUF_3D_TILED_SURFACE   (1<<22)
+#define BUF_3D_TILE_WALK_X     0
+#define BUF_3D_TILE_WALK_Y     (1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)         ((x) & ~0x3)
+
+/* 3DSTATE_CHROMA_KEY */
+
+/* 3DSTATE_CLEAR_PARAMETERS, p150 */
+#define _3DSTATE_CLEAR_PARAMETERS   (CMD_3D | (0x1d<<24) | (0x9c<<16) | 5)
+/* Dword 1 */
+#define CLEARPARAM_CLEAR_RECT      (1 << 16)
+#define CLEARPARAM_ZONE_INIT       (0 << 16)
+#define CLEARPARAM_WRITE_COLOR     (1 << 2)
+#define CLEARPARAM_WRITE_DEPTH     (1 << 1)
+#define CLEARPARAM_WRITE_STENCIL    (1 << 0)
+
+/* 3DSTATE_CONSTANT_BLEND_COLOR, p153 */
+#define _3DSTATE_CONST_BLEND_COLOR_CMD (CMD_3D | (0x1d<<24) | (0x88<<16))
+
+/* 3DSTATE_COORD_SET_BINDINGS, p154 */
+#define _3DSTATE_COORD_SET_BINDINGS      (CMD_3D | (0x16<<24))
+/* Shifts eunit left by 3 bits per iunit; iunit is parenthesized so that
+ * expression arguments such as (i + 1) scale correctly. */
+#define CSB_TCB(iunit, eunit)           ((eunit)<<((iunit)*3))
+
+/* p156 */
+#define _3DSTATE_DFLT_DIFFUSE_CMD      (CMD_3D | (0x1d<<24) | (0x99<<16))
+
+/* p157 */
+#define _3DSTATE_DFLT_SPEC_CMD         (CMD_3D | (0x1d<<24) | (0x9a<<16))
+
+/* p158 */
+#define _3DSTATE_DFLT_Z_CMD            (CMD_3D | (0x1d<<24) | (0x98<<16))
+
+/* 3DSTATE_DEPTH_OFFSET_SCALE, p159 */
+#define _3DSTATE_DEPTH_OFFSET_SCALE       (CMD_3D | (0x1d<<24) | (0x97<<16))
+/* scale in dword 1 */
+
+/* The depth subrectangle is not supported, but must be disabled. */
+/* 3DSTATE_DEPTH_SUBRECT_DISABLE, p160 */
+#define _3DSTATE_DEPTH_SUBRECT_DISABLE (CMD_3D | (0x1c<<24) | (0x11<<19) | (1 << 1) | (0 << 0))
+
+/* p161 */
+#define _3DSTATE_DST_BUF_VARS_CMD      (CMD_3D | (0x1d<<24) | (0x85<<16))
+/* Dword 1 */
+#define TEX_DEFAULT_COLOR_OGL           (0<<30)
+#define TEX_DEFAULT_COLOR_D3D           (1<<30)
+#define ZR_EARLY_DEPTH                  (1<<29)
+#define LOD_PRECLAMP_OGL                (1<<28)
+#define LOD_PRECLAMP_D3D                (0<<28)
+#define DITHER_FULL_ALWAYS              (0<<26)
+#define DITHER_FULL_ON_FB_BLEND         (1<<26)
+#define DITHER_CLAMPED_ALWAYS           (2<<26)
+#define LINEAR_GAMMA_BLEND_32BPP        (1<<25)
+#define DEBUG_DISABLE_ENH_DITHER        (1<<24)
+#define DSTORG_HORT_BIAS(x)            ((x)<<20)
+#define DSTORG_VERT_BIAS(x)            ((x)<<16)
+#define COLOR_4_2_2_CHNL_WRT_ALL       0
+#define COLOR_4_2_2_CHNL_WRT_Y         (1<<12)
+#define COLOR_4_2_2_CHNL_WRT_CR                (2<<12)
+#define COLOR_4_2_2_CHNL_WRT_CB                (3<<12)
+#define COLOR_4_2_2_CHNL_WRT_CRCB      (4<<12)
+#define COLR_BUF_8BIT                  0
+#define COLR_BUF_RGB555                (1<<8)
+#define COLR_BUF_RGB565                (2<<8)
+#define COLR_BUF_ARGB8888              (3<<8)
+#define COLR_BUF_ARGB4444              (8<<8)
+#define COLR_BUF_ARGB1555              (9<<8)
+#define COLR_BUF_ARGB2AAA              (0xa<<8)
+#define DEPTH_FRMT_16_FIXED            0
+#define DEPTH_FRMT_16_FLOAT            (1<<2)
+#define DEPTH_FRMT_24_FIXED_8_OTHER    (2<<2)
+#define VERT_LINE_STRIDE_1             (1<<1)
+#define VERT_LINE_STRIDE_0             (0<<1)
+#define VERT_LINE_STRIDE_OFS_1         1
+#define VERT_LINE_STRIDE_OFS_0         0
+
+/* p166 */
+#define _3DSTATE_DRAW_RECT_CMD         (CMD_3D|(0x1d<<24)|(0x80<<16)|3)
+/* Dword 1 */
+#define DRAW_RECT_DIS_DEPTH_OFS        (1<<30)
+#define DRAW_DITHER_OFS_X(x)           ((x)<<26)
+#define DRAW_DITHER_OFS_Y(x)           ((x)<<24)
+/* Dword 2 */
+#define DRAW_YMIN(x)                   ((x)<<16)
+#define DRAW_XMIN(x)                   (x)
+/* Dword 3 */
+#define DRAW_YMAX(x)                   ((x)<<16)
+#define DRAW_XMAX(x)                   (x)
+/* Dword 4 */
+#define DRAW_YORG(x)                   ((x)<<16)
+#define DRAW_XORG(x)                   (x)
+
+/* 3DSTATE_FILTER_COEFFICIENTS_4X4, p170 */
+
+/* 3DSTATE_FILTER_COEFFICIENTS_6X5, p172 */
+
+/* _3DSTATE_FOG_COLOR, p173 */
+#define _3DSTATE_FOG_COLOR_CMD         (CMD_3D|(0x15<<24))
+#define FOG_COLOR_RED(x)               ((x)<<16)
+#define FOG_COLOR_GREEN(x)             ((x)<<8)
+#define FOG_COLOR_BLUE(x)              (x)
+
+/* _3DSTATE_FOG_MODE, p174 */
+#define _3DSTATE_FOG_MODE_CMD          (CMD_3D|(0x1d<<24)|(0x89<<16)|2)
+/* Dword 1 */
+#define FMC1_FOGFUNC_MODIFY_ENABLE     (1u<<31)       /* unsigned: 1<<31 overflows a 32-bit int */
+#define FMC1_FOGFUNC_VERTEX            (0<<28)
+#define FMC1_FOGFUNC_PIXEL_EXP         (1<<28)
+#define FMC1_FOGFUNC_PIXEL_EXP2                (2<<28)
+#define FMC1_FOGFUNC_PIXEL_LINEAR      (3<<28)
+#define FMC1_FOGFUNC_MASK              (3<<28)
+#define FMC1_FOGINDEX_MODIFY_ENABLE     (1<<27)
+#define FMC1_FOGINDEX_Z                        (0<<25)
+#define FMC1_FOGINDEX_W                (1<<25)
+#define FMC1_C1_C2_MODIFY_ENABLE       (1<<24)
+#define FMC1_DENSITY_MODIFY_ENABLE     (1<<23)
+#define FMC1_C1_ONE                    (1<<13)
+#define FMC1_C1_MASK                   (0xffff<<4)
+/* Dword 2 */
+#define FMC2_C2_ONE                    (1<<16)
+/* Dword 3 */
+#define FMC3_D_ONE                     (1<<16)
+
+/* _3DSTATE_INDEPENDENT_ALPHA_BLEND, p177 */
+#define _3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD   (CMD_3D|(0x0b<<24))
+#define IAB_MODIFY_ENABLE              (1<<23)
+#define IAB_ENABLE                     (1<<22)
+#define IAB_MODIFY_FUNC                (1<<21)
+#define IAB_FUNC_SHIFT                 16
+#define IAB_MODIFY_SRC_FACTOR          (1<<11)
+#define IAB_SRC_FACTOR_SHIFT           6
+#define IAB_SRC_FACTOR_MASK            (BLENDFACT_MASK<<6)
+#define IAB_MODIFY_DST_FACTOR          (1<<5)
+#define IAB_DST_FACTOR_SHIFT           0
+#define IAB_DST_FACTOR_MASK            (BLENDFACT_MASK<<0)
+
+#define BLENDFACT_ZERO                 0x01
+#define BLENDFACT_ONE                  0x02
+#define BLENDFACT_SRC_COLR             0x03
+#define BLENDFACT_INV_SRC_COLR                 0x04
+#define BLENDFACT_SRC_ALPHA            0x05
+#define BLENDFACT_INV_SRC_ALPHA        0x06
+#define BLENDFACT_DST_ALPHA            0x07
+#define BLENDFACT_INV_DST_ALPHA        0x08
+#define BLENDFACT_DST_COLR             0x09
+#define BLENDFACT_INV_DST_COLR         0x0a
+#define BLENDFACT_SRC_ALPHA_SATURATE   0x0b
+#define BLENDFACT_CONST_COLOR          0x0c
+#define BLENDFACT_INV_CONST_COLOR      0x0d
+#define BLENDFACT_CONST_ALPHA          0x0e
+#define BLENDFACT_INV_CONST_ALPHA      0x0f
+#define BLENDFACT_MASK                 0x0f
+
+#define BLENDFUNC_ADD                  0x0
+#define BLENDFUNC_SUBTRACT             0x1
+#define BLENDFUNC_REVERSE_SUBTRACT     0x2
+#define BLENDFUNC_MIN                  0x3
+#define BLENDFUNC_MAX                  0x4
+#define BLENDFUNC_MASK                 0x7
+
+/* 3DSTATE_LOAD_INDIRECT, p180 */
+
+#define _3DSTATE_LOAD_INDIRECT         (CMD_3D|(0x1d<<24)|(0x7<<16))
+#define LI0_STATE_STATIC_INDIRECT       (0x01<<8)
+#define LI0_STATE_DYNAMIC_INDIRECT      (0x02<<8)
+#define LI0_STATE_SAMPLER               (0x04<<8)
+#define LI0_STATE_MAP                   (0x08<<8)
+#define LI0_STATE_PROGRAM               (0x10<<8)
+#define LI0_STATE_CONSTANTS             (0x20<<8)
+
+#define SIS0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define SIS0_FORCE_LOAD                 (1<<1)
+#define SIS0_BUFFER_VALID               (1<<0)
+#define SIS1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define DIS0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define DIS0_BUFFER_RESET               (1<<1)
+#define DIS0_BUFFER_VALID               (1<<0)
+
+#define SSB0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define SSB0_FORCE_LOAD                 (1<<1)
+#define SSB0_BUFFER_VALID               (1<<0)
+#define SSB1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define MSB0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define MSB0_FORCE_LOAD                 (1<<1)
+#define MSB0_BUFFER_VALID               (1<<0)
+#define MSB1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define PSP0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define PSP0_FORCE_LOAD                 (1<<1)
+#define PSP0_BUFFER_VALID               (1<<0)
+#define PSP1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define PSC0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define PSC0_FORCE_LOAD                 (1<<1)
+#define PSC0_BUFFER_VALID               (1<<0)
+#define PSC1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+/* _3DSTATE_RASTERIZATION_RULES */
+#define _3DSTATE_RASTER_RULES_CMD      (CMD_3D|(0x07<<24))
+#define ENABLE_POINT_RASTER_RULE       (1<<15)
+#define OGL_POINT_RASTER_RULE          (1<<13)
+#define ENABLE_TEXKILL_3D_4D            (1<<10)
+#define TEXKILL_3D                      (0<<9)
+#define TEXKILL_4D                      (1<<9)
+#define ENABLE_LINE_STRIP_PROVOKE_VRTX (1<<8)
+#define ENABLE_TRI_FAN_PROVOKE_VRTX    (1<<5)
+#define LINE_STRIP_PROVOKE_VRTX(x)     ((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX(x)        ((x)<<3)
+
+/* _3DSTATE_SCISSOR_ENABLE, p256 */
+#define _3DSTATE_SCISSOR_ENABLE_CMD    (CMD_3D|(0x1c<<24)|(0x10<<19))
+#define ENABLE_SCISSOR_RECT            ((1<<1) | 1)
+#define DISABLE_SCISSOR_RECT           (1<<1)
+
+/* _3DSTATE_SCISSOR_RECTANGLE_0, p257 */
+#define _3DSTATE_SCISSOR_RECT_0_CMD    (CMD_3D|(0x1d<<24)|(0x81<<16)|1)
+/* Dword 1 */
+#define SCISSOR_RECT_0_YMIN(x)         ((x)<<16)
+#define SCISSOR_RECT_0_XMIN(x)         (x)
+/* Dword 2 */
+#define SCISSOR_RECT_0_YMAX(x)         ((x)<<16)
+#define SCISSOR_RECT_0_XMAX(x)         (x)
+
+/* p189 */
+#define _3DSTATE_LOAD_STATE_IMMEDIATE_1   ((0x3<<29)|(0x1d<<24)|(0x04<<16))
+/* Evaluates to bit (4+n); n is parenthesized so that expression arguments
+ * are added as a whole. */
+#define I1_LOAD_S(n)                      (1<<(4+(n)))
+
+#define S0_VB_OFFSET_MASK              0xffffffc
+#define S0_AUTO_CACHE_INV_DISABLE      (1<<0)
+
+#define S1_VERTEX_WIDTH_SHIFT          24
+#define S1_VERTEX_WIDTH_MASK           (0x3f<<24)
+#define S1_VERTEX_PITCH_SHIFT          16
+#define S1_VERTEX_PITCH_MASK           (0x3f<<16)
+
+#define TEXCOORDFMT_2D                 0x0
+#define TEXCOORDFMT_3D                 0x1
+#define TEXCOORDFMT_4D                 0x2
+#define TEXCOORDFMT_1D                 0x3
+#define TEXCOORDFMT_2D_16              0x4
+#define TEXCOORDFMT_4D_16              0x5
+#define TEXCOORDFMT_NOT_PRESENT        0xf
+#define S2_TEXCOORD_FMT0_MASK            0xf
+#define S2_TEXCOORD_FMT1_SHIFT           4
+/* Shifts type left by 4 bits per unit; unit is parenthesized the same way
+ * as in the S3_* per-unit macros. */
+#define S2_TEXCOORD_FMT(unit, type)    ((type)<<((unit)*4))
+#define S2_TEXCOORD_NONE               (~0)
+
+#define TEXCOORD_WRAP_SHORTEST_TCX     8
+#define TEXCOORD_WRAP_SHORTEST_TCY     4
+#define TEXCOORD_WRAP_SHORTEST_TCZ     2
+#define TEXCOORD_PERSPECTIVE_DISABLE   1
+
+#define S3_WRAP_SHORTEST_TCX(unit)     (TEXCOORD_WRAP_SHORTEST_TCX << ((unit) * 4))
+#define S3_WRAP_SHORTEST_TCY(unit)     (TEXCOORD_WRAP_SHORTEST_TCY << ((unit) * 4))
+#define S3_WRAP_SHORTEST_TCZ(unit)     (TEXCOORD_WRAP_SHORTEST_TCZ << ((unit) * 4))
+#define S3_PERSPECTIVE_DISABLE(unit)   (TEXCOORD_PERSPECTIVE_DISABLE << ((unit) * 4))
+
+/* S3 not interesting */
+
+#define S4_POINT_WIDTH_SHIFT           23
+#define S4_POINT_WIDTH_MASK            (0x1ffu<<23)   /* unsigned: reaches bit 31 */
+#define S4_LINE_WIDTH_SHIFT            19
+#define S4_LINE_WIDTH_ONE              (0x2<<19)
+#define S4_LINE_WIDTH_MASK             (0xf<<19)
+#define S4_FLATSHADE_ALPHA             (1<<18)
+#define S4_FLATSHADE_FOG               (1<<17)
+#define S4_FLATSHADE_SPECULAR          (1<<16)
+#define S4_FLATSHADE_COLOR             (1<<15)
+#define S4_CULLMODE_BOTH              (0<<13)
+#define S4_CULLMODE_NONE              (1<<13)
+#define S4_CULLMODE_CW                (2<<13)
+#define S4_CULLMODE_CCW                       (3<<13)
+#define S4_CULLMODE_MASK              (3<<13)
+#define S4_VFMT_POINT_WIDTH            (1<<12)
+#define S4_VFMT_SPEC_FOG               (1<<11)
+#define S4_VFMT_COLOR                  (1<<10)
+#define S4_VFMT_DEPTH_OFFSET           (1<<9)
+#define S4_VFMT_XYZ                   (1<<6)
+#define S4_VFMT_XYZW                  (2<<6)
+#define S4_VFMT_XY                            (3<<6)
+#define S4_VFMT_XYW                   (4<<6)
+#define S4_VFMT_XYZW_MASK              (7<<6)
+#define S4_FORCE_DEFAULT_DIFFUSE       (1<<5)
+#define S4_FORCE_DEFAULT_SPECULAR      (1<<4)
+#define S4_LOCAL_DEPTH_OFFSET_ENABLE   (1<<3)
+#define S4_VFMT_FOG_PARAM              (1<<2)
+#define S4_SPRITE_POINT_ENABLE         (1<<1)
+#define S4_LINE_ANTIALIAS_ENABLE       (1<<0)
+
+#define S4_VFMT_MASK (S4_VFMT_POINT_WIDTH   |  \
+                     S4_VFMT_SPEC_FOG      |   \
+                     S4_VFMT_COLOR         |   \
+                     S4_VFMT_DEPTH_OFFSET  |   \
+                     S4_VFMT_XYZW_MASK     |   \
+                     S4_VFMT_FOG_PARAM)
+
+/* Per-channel write-disable flags in bits 31..28.  The values that reach
+ * bit 31 are unsigned because shifting into the sign bit of an int is
+ * undefined. */
+#define S5_WRITEDISABLE_ALPHA          (1u<<31)
+#define S5_WRITEDISABLE_RED            (1<<30)
+#define S5_WRITEDISABLE_GREEN          (1<<29)
+#define S5_WRITEDISABLE_BLUE           (1<<28)
+#define S5_WRITEDISABLE_MASK           (0xfu<<28)
+#define S5_FORCE_DEFAULT_POINT_SIZE    (1<<27)
+#define S5_LAST_PIXEL_ENABLE           (1<<26)
+#define S5_GLOBAL_DEPTH_OFFSET_ENABLE  (1<<25)
+#define S5_FOG_ENABLE                  (1<<24)
+#define S5_STENCIL_REF_SHIFT           16
+#define S5_STENCIL_REF_MASK            (0xff<<16)
+#define S5_STENCIL_TEST_FUNC_SHIFT     13
+#define S5_STENCIL_TEST_FUNC_MASK      (0x7<<13)
+#define S5_STENCIL_FAIL_SHIFT          10
+#define S5_STENCIL_FAIL_MASK           (0x7<<10)
+#define S5_STENCIL_PASS_Z_FAIL_SHIFT   7
+#define S5_STENCIL_PASS_Z_FAIL_MASK    (0x7<<7)
+#define S5_STENCIL_PASS_Z_PASS_SHIFT   4
+#define S5_STENCIL_PASS_Z_PASS_MASK    (0x7<<4)
+#define S5_STENCIL_WRITE_ENABLE        (1<<3)
+#define S5_STENCIL_TEST_ENABLE         (1<<2)
+#define S5_COLOR_DITHER_ENABLE         (1<<1)
+#define S5_LOGICOP_ENABLE              (1<<0)
+
+#define S6_ALPHA_TEST_ENABLE           (1u<<31)       /* unsigned: 1<<31 overflows a 32-bit int */
+#define S6_ALPHA_TEST_FUNC_SHIFT       28
+#define S6_ALPHA_TEST_FUNC_MASK        (0x7<<28)
+#define S6_ALPHA_REF_SHIFT             20
+#define S6_ALPHA_REF_MASK              (0xff<<20)
+#define S6_DEPTH_TEST_ENABLE           (1<<19)
+#define S6_DEPTH_TEST_FUNC_SHIFT       16
+#define S6_DEPTH_TEST_FUNC_MASK        (0x7<<16)
+#define S6_CBUF_BLEND_ENABLE           (1<<15)
+#define S6_CBUF_BLEND_FUNC_SHIFT       12
+#define S6_CBUF_BLEND_FUNC_MASK        (0x7<<12)
+#define S6_CBUF_SRC_BLEND_FACT_SHIFT   8
+#define S6_CBUF_SRC_BLEND_FACT_MASK    (0xf<<8)
+#define S6_CBUF_DST_BLEND_FACT_SHIFT   4
+#define S6_CBUF_DST_BLEND_FACT_MASK    (0xf<<4)
+#define S6_DEPTH_WRITE_ENABLE          (1<<3)
+#define S6_COLOR_WRITE_ENABLE          (1<<2)
+#define S6_TRISTRIP_PV_SHIFT           0
+#define S6_TRISTRIP_PV_MASK            (0x3<<0)
+
+#define S7_DEPTH_OFFSET_CONST_MASK     (~0)   /* all bits set; parenthesized like S2_TEXCOORD_NONE */
+
+/* 3DSTATE_MAP_DEINTERLACER_PARAMETERS */
+/* 3DSTATE_MAP_PALETTE_LOAD_32, p206 */
+
+/* _3DSTATE_MODES_4, p218 */
+#define _3DSTATE_MODES_4_CMD           (CMD_3D|(0x0d<<24))
+#define ENABLE_LOGIC_OP_FUNC           (1<<23)
+#define LOGIC_OP_FUNC(x)               ((x)<<18)
+#define LOGICOP_MASK                   (0xf<<18)
+#define LOGICOP_COPY                   0xc
+#define MODE4_ENABLE_STENCIL_TEST_MASK ((1<<17)|(0xff00))
+#define ENABLE_STENCIL_TEST_MASK       (1<<17)
+#define STENCIL_TEST_MASK(x)           ((x)<<8)
+#define MODE4_ENABLE_STENCIL_WRITE_MASK        ((1<<16)|(0x00ff))
+#define ENABLE_STENCIL_WRITE_MASK      (1<<16)
+#define STENCIL_WRITE_MASK(x)          ((x)&0xff)
+
+/* _3DSTATE_MODES_5, p220 */
+#define _3DSTATE_MODES_5_CMD           (CMD_3D|(0x0c<<24))
+#define PIPELINE_FLUSH_RENDER_CACHE    (1<<18)
+#define PIPELINE_FLUSH_TEXTURE_CACHE   (1<<16)
+
+/* p221 */
+#define _3DSTATE_PIXEL_SHADER_CONSTANTS  (CMD_3D|(0x1d<<24)|(0x6<<16))
+#define PS1_REG(n)                      (1<<(n))
+#define PS2_CONST_X(n)                  (n)
+#define PS3_CONST_Y(n)                  (n)
+#define PS4_CONST_Z(n)                  (n)
+#define PS5_CONST_W(n)                  (n)
+
+/* p222 */
+
+#define I915_MAX_TEX_INDIRECT 4
+#define I915_MAX_TEX_INSN     32
+#define I915_MAX_ALU_INSN     64
+#define I915_MAX_DECL_INSN    27
+#define I915_MAX_TEMPORARY    16
+
+/* Each instruction is 3 dwords long, though most don't require all
+ * this space.  Maximum of 123 instructions.  Smaller maxes per insn
+ * type.
+ */
+#define _3DSTATE_PIXEL_SHADER_PROGRAM    (CMD_3D|(0x1d<<24)|(0x5<<16))
+
+#define REG_TYPE_R                 0   /* temporary regs, no need to
+                                        * dcl, must be written before
+                                        * read -- Preserved between
+                                        * phases.
+                                        */
+#define REG_TYPE_T                 1   /* Interpolated values, must be
+                                        * dcl'ed before use.
+                                        *
+                                        * 0..7: texture coord,
+                                        * 8: diffuse spec,
+                                        * 9: specular color,
+                                        * 10: fog parameter in w.
+                                        */
+#define REG_TYPE_CONST             2   /* Restriction: only one const
+                                        * can be referenced per
+                                        * instruction, though it may be
+                                        * selected for multiple inputs.
+                                        * Constants not initialized
+                                        * default to zero.
+                                        */
+#define REG_TYPE_S                 3   /* sampler */
+#define REG_TYPE_OC                4   /* output color (rgba) */
+#define REG_TYPE_OD                5   /* output depth (w), xyz are
+                                        * temporaries.  If not written,
+                                        * interpolated depth is used?
+                                        */
+#define REG_TYPE_U                 6   /* unpreserved temporaries */
+#define REG_TYPE_MASK              0x7
+#define REG_NR_MASK                0xf
+
+/* REG_TYPE_T:
+ */
+#define T_TEX0     0
+#define T_TEX1     1
+#define T_TEX2     2
+#define T_TEX3     3
+#define T_TEX4     4
+#define T_TEX5     5
+#define T_TEX6     6
+#define T_TEX7     7
+#define T_DIFFUSE  8
+#define T_SPECULAR 9
+#define T_FOG_W    10          /* interpolated fog is in W coord */
+
+/* Arithmetic instructions */
+
+/* .replicate_swizzle == selection and replication of a particular
+ * scalar channel, ie., .xxxx, .yyyy, .zzzz or .wwww
+ */
+#define A0_NOP    (0x0<<24)    /* no operation */
+#define A0_ADD    (0x1<<24)    /* dst = src0 + src1 */
+#define A0_MOV    (0x2<<24)    /* dst = src0 */
+#define A0_MUL    (0x3<<24)    /* dst = src0 * src1 */
+#define A0_MAD    (0x4<<24)    /* dst = src0 * src1 + src2 */
+#define A0_DP2ADD (0x5<<24)    /* dst.xyzw = src0.xy dot src1.xy + src2.replicate_swizzle */
+#define A0_DP3    (0x6<<24)    /* dst.xyzw = src0.xyz dot src1.xyz */
+#define A0_DP4    (0x7<<24)    /* dst.xyzw = src0.xyzw dot src1.xyzw */
+#define A0_FRC    (0x8<<24)    /* dst = src0 - floor(src0) */
+#define A0_RCP    (0x9<<24)    /* dst.xyzw = 1/(src0.replicate_swizzle) */
+#define A0_RSQ    (0xa<<24)    /* dst.xyzw = 1/(sqrt(abs(src0.replicate_swizzle))) */
+#define A0_EXP    (0xb<<24)    /* dst.xyzw = exp2(src0.replicate_swizzle) */
+#define A0_LOG    (0xc<<24)    /* dst.xyzw = log2(abs(src0.replicate_swizzle)) */
+#define A0_CMP    (0xd<<24)    /* dst = (src0 >= 0.0) ? src1 : src2 */
+#define A0_MIN    (0xe<<24)    /* dst = (src0 < src1) ? src0 : src1 */
+#define A0_MAX    (0xf<<24)    /* dst = (src0 >= src1) ? src0 : src1 */
+#define A0_FLR    (0x10<<24)   /* dst = floor(src0) */
+#define A0_MOD    (0x11<<24)   /* dst = src0 fmod 1.0 */
+#define A0_TRC    (0x12<<24)   /* dst = int(src0) */
+#define A0_SGE    (0x13<<24)   /* dst = src0 >= src1 ? 1.0 : 0.0 */
+#define A0_SLT    (0x14<<24)   /* dst = src0 < src1 ? 1.0 : 0.0 */
+#define A0_DEST_SATURATE                 (1<<22)
+#define A0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+#define A0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define A0_DEST_CHANNEL_X                (1<<10)
+#define A0_DEST_CHANNEL_Y                (2<<10)
+#define A0_DEST_CHANNEL_Z                (4<<10)
+#define A0_DEST_CHANNEL_W                (8<<10)
+#define A0_DEST_CHANNEL_ALL              (0xf<<10)
+#define A0_DEST_CHANNEL_SHIFT            10
+#define A0_SRC0_TYPE_SHIFT               7
+#define A0_SRC0_NR_SHIFT                 2
+
+#define A0_DEST_CHANNEL_XY              (A0_DEST_CHANNEL_X|A0_DEST_CHANNEL_Y)
+#define A0_DEST_CHANNEL_XYZ             (A0_DEST_CHANNEL_XY|A0_DEST_CHANNEL_Z)
+
+#define SRC_X        0
+#define SRC_Y        1
+#define SRC_Z        2
+#define SRC_W        3
+#define SRC_ZERO     4
+#define SRC_ONE      5
+
+#define A1_SRC0_CHANNEL_X_NEGATE         (1u<<31)     /* unsigned: 1<<31 overflows a 32-bit int */
+#define A1_SRC0_CHANNEL_X_SHIFT          28
+#define A1_SRC0_CHANNEL_Y_NEGATE         (1<<27)
+#define A1_SRC0_CHANNEL_Y_SHIFT          24
+#define A1_SRC0_CHANNEL_Z_NEGATE         (1<<23)
+#define A1_SRC0_CHANNEL_Z_SHIFT          20
+#define A1_SRC0_CHANNEL_W_NEGATE         (1<<19)
+#define A1_SRC0_CHANNEL_W_SHIFT          16
+#define A1_SRC1_TYPE_SHIFT               13
+#define A1_SRC1_NR_SHIFT                 8
+#define A1_SRC1_CHANNEL_X_NEGATE         (1<<7)
+#define A1_SRC1_CHANNEL_X_SHIFT          4
+#define A1_SRC1_CHANNEL_Y_NEGATE         (1<<3)
+#define A1_SRC1_CHANNEL_Y_SHIFT          0
+
+#define A2_SRC1_CHANNEL_Z_NEGATE         (1u<<31)     /* unsigned: 1<<31 overflows a 32-bit int */
+#define A2_SRC1_CHANNEL_Z_SHIFT          28
+#define A2_SRC1_CHANNEL_W_NEGATE         (1<<27)
+#define A2_SRC1_CHANNEL_W_SHIFT          24
+#define A2_SRC2_TYPE_SHIFT               21
+#define A2_SRC2_NR_SHIFT                 16
+#define A2_SRC2_CHANNEL_X_NEGATE         (1<<15)
+#define A2_SRC2_CHANNEL_X_SHIFT          12
+#define A2_SRC2_CHANNEL_Y_NEGATE         (1<<11)
+#define A2_SRC2_CHANNEL_Y_SHIFT          8
+#define A2_SRC2_CHANNEL_Z_NEGATE         (1<<7)
+#define A2_SRC2_CHANNEL_Z_SHIFT          4
+#define A2_SRC2_CHANNEL_W_NEGATE         (1<<3)
+#define A2_SRC2_CHANNEL_W_SHIFT          0
+
+/* Texture instructions */
+#define T0_TEXLD     (0x15<<24)        /* Sample texture using predeclared
+                                * sampler and address, and output
+                                * filtered texel data to destination
+                                * register */
+#define T0_TEXLDP    (0x16<<24)        /* Same as texld but performs a
+                                * perspective divide of the texture
+                                * coordinate .xyz values by .w before
+                                * sampling. */
+#define T0_TEXLDB    (0x17<<24)        /* Same as texld but biases the
+                                * computed LOD by w.  Only S4.6 two's
+                                * comp is used.  This implies that a
+                                * float to fixed conversion is
+                                * done. */
+#define T0_TEXKILL   (0x18<<24)        /* Does not perform a sampling
+                                * operation.  Simply kills the pixel
+                                * if any channel of the address
+                                * register is < 0.0. */
+#define T0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+/* Note: U (unpreserved) regs do not retain their values between
+ * phases (cannot be used for feedback)
+ *
+ * Note: oC and OD registers can only be used as the destination of a
+ * texture instruction once per phase (this is an implementation
+ * restriction).
+ */
+#define T0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define T0_SAMPLER_NR_SHIFT              0     /* This field ignored for TEXKILL */
+#define T0_SAMPLER_NR_MASK               (0xf<<0)
+
+#define T1_ADDRESS_REG_TYPE_SHIFT        24    /* Reg to use as texture coord */
+/* Allow R, T, OC, OD -- R, OC, OD are 'dependent' reads, new program phase */
+#define T1_ADDRESS_REG_NR_SHIFT          17
+#define T2_MBZ                           0
+
+/* Declaration instructions */
+#define D0_DCL       (0x19<<24)        /* Declare a t (interpolated attrib)
+                                * register or an s (sampler)
+                                * register. */
+#define D0_SAMPLE_TYPE_SHIFT              22
+#define D0_SAMPLE_TYPE_2D                 (0x0<<22)
+#define D0_SAMPLE_TYPE_CUBE               (0x1<<22)
+#define D0_SAMPLE_TYPE_VOLUME             (0x2<<22)
+#define D0_SAMPLE_TYPE_MASK               (0x3<<22)
+
+#define D0_TYPE_SHIFT                19
+/* Allow: T, S */
+#define D0_NR_SHIFT                  14
+/* Allow T: 0..10, S: 0..15 */
+#define D0_CHANNEL_X                (1<<10)
+#define D0_CHANNEL_Y                (2<<10)
+#define D0_CHANNEL_Z                (4<<10)
+#define D0_CHANNEL_W                (8<<10)
+#define D0_CHANNEL_ALL              (0xf<<10)
+#define D0_CHANNEL_NONE             (0<<10)
+
+#define D0_CHANNEL_XY               (D0_CHANNEL_X|D0_CHANNEL_Y)
+#define D0_CHANNEL_XYZ              (D0_CHANNEL_XY|D0_CHANNEL_Z)
+
+/* I915 Errata: Do not allow (xz), (xw), (xzw) combinations for diffuse
+ * or specular declarations.
+ *
+ * For T dcls, only allow: (x), (xy), (xyz), (w), (xyzw)
+ *
+ * Must be zero for S (sampler) dcls
+ */
+#define D1_MBZ                          0
+#define D2_MBZ                          0
+
+/* p207.
+ * The DWORD count is 3 times the number of bits set in MS1_MAPMASK_MASK
+ */
+#define _3DSTATE_MAP_STATE               (CMD_3D|(0x1d<<24)|(0x0<<16))
+
+#define MS1_MAPMASK_SHIFT               0
+#define MS1_MAPMASK_MASK                (0x8fff<<0)
+
+#define MS2_UNTRUSTED_SURFACE           (1u<<31)      /* unsigned: 1<<31 overflows a 32-bit int */
+#define MS2_ADDRESS_MASK                0xfffffffc
+/* NOTE(review): the two flags below are both defined as bit 1; confirm
+ * against the hardware documentation whether one should use another bit. */
+#define MS2_VERTICAL_LINE_STRIDE        (1<<1)
+#define MS2_VERTICAL_OFFSET             (1<<1)
+
+#define MS3_HEIGHT_SHIFT              21
+#define MS3_WIDTH_SHIFT               10
+#define MS3_PALETTE_SELECT            (1<<9)
+#define MS3_MAPSURF_FORMAT_SHIFT      7
+#define MS3_MAPSURF_FORMAT_MASK       (0x7<<7)
+#define    MAPSURF_8BIT                           (1<<7)
+#define    MAPSURF_16BIT                  (2<<7)
+#define    MAPSURF_32BIT                  (3<<7)
+#define    MAPSURF_422                    (5<<7)
+#define    MAPSURF_COMPRESSED             (6<<7)
+#define    MAPSURF_4BIT_INDEXED                   (7<<7)
+#define MS3_MT_FORMAT_MASK         (0x7 << 3)
+#define MS3_MT_FORMAT_SHIFT        3
+#define    MT_4BIT_IDX_ARGB8888                   (7<<3)       /* SURFACE_4BIT_INDEXED */
+#define    MT_8BIT_I8                     (0<<3)       /* SURFACE_8BIT */
+#define    MT_8BIT_L8                     (1<<3)
+#define    MT_8BIT_A8                     (4<<3)
+#define    MT_8BIT_MONO8                  (5<<3)
+#define    MT_16BIT_RGB565                (0<<3)       /* SURFACE_16BIT */
+#define    MT_16BIT_ARGB1555              (1<<3)
+#define    MT_16BIT_ARGB4444              (2<<3)
+#define    MT_16BIT_AY88                  (3<<3)
+#define    MT_16BIT_88DVDU                (5<<3)
+#define    MT_16BIT_BUMP_655LDVDU         (6<<3)
+#define    MT_16BIT_I16                           (7<<3)
+#define    MT_16BIT_L16                           (8<<3)
+#define    MT_16BIT_A16                           (9<<3)
+#define    MT_32BIT_ARGB8888              (0<<3)       /* SURFACE_32BIT */
+#define    MT_32BIT_ABGR8888              (1<<3)
+#define    MT_32BIT_XRGB8888              (2<<3)
+#define    MT_32BIT_XBGR8888              (3<<3)
+#define    MT_32BIT_QWVU8888              (4<<3)
+#define    MT_32BIT_AXVU8888              (5<<3)
+#define    MT_32BIT_LXVU8888              (6<<3)
+#define    MT_32BIT_XLVU8888              (7<<3)
+#define    MT_32BIT_ARGB2101010                   (8<<3)
+#define    MT_32BIT_ABGR2101010                   (9<<3)
+#define    MT_32BIT_AWVU2101010                   (0xA<<3)
+#define    MT_32BIT_GR1616                (0xB<<3)
+#define    MT_32BIT_VU1616                (0xC<<3)
+#define    MT_32BIT_xI824                 (0xD<<3)
+#define    MT_32BIT_xA824                 (0xE<<3)
+#define    MT_32BIT_xL824                 (0xF<<3)
+#define    MT_422_YCRCB_SWAPY             (0<<3)       /* SURFACE_422 */
+#define    MT_422_YCRCB_NORMAL            (1<<3)
+#define    MT_422_YCRCB_SWAPUV            (2<<3)
+#define    MT_422_YCRCB_SWAPUVY                   (3<<3)
+#define    MT_COMPRESS_DXT1               (0<<3)       /* SURFACE_COMPRESSED */
+#define    MT_COMPRESS_DXT2_3             (1<<3)
+#define    MT_COMPRESS_DXT4_5             (2<<3)
+#define    MT_COMPRESS_FXT1               (3<<3)
+#define    MT_COMPRESS_DXT1_RGB                   (4<<3)
+#define MS3_USE_FENCE_REGS              (1<<2)
+#define MS3_TILED_SURFACE             (1<<1)
+#define MS3_TILE_WALK                 (1<<0)
+
+/* The pitch is the pitch measured in DWORDS, minus 1 */
+#define MS4_PITCH_SHIFT                 21
+#define MS4_CUBE_FACE_ENA_NEGX          (1<<20)
+#define MS4_CUBE_FACE_ENA_POSX          (1<<19)
+#define MS4_CUBE_FACE_ENA_NEGY          (1<<18)
+#define MS4_CUBE_FACE_ENA_POSY          (1<<17)
+#define MS4_CUBE_FACE_ENA_NEGZ          (1<<16)
+#define MS4_CUBE_FACE_ENA_POSZ          (1<<15)
+#define MS4_CUBE_FACE_ENA_MASK          (0x3f<<15)
+#define MS4_MAX_LOD_SHIFT              9
+#define MS4_MAX_LOD_MASK               (0x3f<<9)
+#define MS4_MIP_LAYOUT_LEGACY           (0<<8)
+#define MS4_MIP_LAYOUT_BELOW_LPT        (0<<8)
+#define MS4_MIP_LAYOUT_RIGHT_LPT        (1<<8)
+#define MS4_VOLUME_DEPTH_SHIFT          0
+#define MS4_VOLUME_DEPTH_MASK           (0xff<<0)
+
+/* p244.
+ * The DWORD count is 3 times the number of bits set in SS1_MAPMASK_MASK.
+ */
+#define _3DSTATE_SAMPLER_STATE         (CMD_3D|(0x1d<<24)|(0x1<<16))
+
+/* NOTE(review): 0x8fff is not a contiguous run of bits (bits 12-14 are
+ * clear while bit 15 is set) -- confirm against the hardware documentation
+ * whether a different mask was intended. */
+#define SS1_MAPMASK_SHIFT               0
+#define SS1_MAPMASK_MASK                (0x8fff<<0)
+
+#define SS2_REVERSE_GAMMA_ENABLE        (1<<31)
+#define SS2_PACKED_TO_PLANAR_ENABLE     (1<<30)
+#define SS2_COLORSPACE_CONVERSION       (1<<29)
+#define SS2_CHROMAKEY_SHIFT             27
+#define SS2_BASE_MIP_LEVEL_SHIFT        22
+#define SS2_BASE_MIP_LEVEL_MASK         (0x1f<<22)
+#define SS2_MIP_FILTER_SHIFT            20
+#define SS2_MIP_FILTER_MASK             (0x3<<20)
+#define   MIPFILTER_NONE               0
+#define   MIPFILTER_NEAREST    1
+#define   MIPFILTER_LINEAR     3
+#define SS2_MAG_FILTER_SHIFT          17
+#define SS2_MAG_FILTER_MASK           (0x7<<17)
+#define   FILTER_NEAREST       0
+#define   FILTER_LINEAR                1
+#define   FILTER_ANISOTROPIC   2
+#define   FILTER_4X4_1         3
+#define   FILTER_4X4_2         4
+#define   FILTER_4X4_FLAT      5
+#define   FILTER_6X5_MONO      6       /* XXX - check */
+#define SS2_MIN_FILTER_SHIFT          14
+#define SS2_MIN_FILTER_MASK           (0x7<<14)
+#define SS2_LOD_BIAS_SHIFT            5
+#define SS2_LOD_BIAS_ONE              (0x10<<5)
+#define SS2_LOD_BIAS_MASK             (0x1ff<<5)
+/* Shadow requires:
+ *  MT_X8{I,L,A}24 or MT_{I,L,A}16 texture format
+ *  FILTER_4X4_x  MIN and MAG filters
+ */
+#define SS2_SHADOW_ENABLE             (1<<4)
+#define SS2_MAX_ANISO_MASK            (1<<3)
+#define SS2_MAX_ANISO_2               (0<<3)
+#define SS2_MAX_ANISO_4               (1<<3)
+#define SS2_SHADOW_FUNC_SHIFT         0
+#define SS2_SHADOW_FUNC_MASK          (0x7<<0)
+/* SS2_SHADOW_FUNC values: see COMPAREFUNC_* */
+
+#define SS3_MIN_LOD_SHIFT            24
+#define SS3_MIN_LOD_ONE              (0x10<<24)
+#define SS3_MIN_LOD_MASK             (0xff<<24)
+#define SS3_KILL_PIXEL_ENABLE        (1<<17)
+#define SS3_TCX_ADDR_MODE_SHIFT      12
+#define SS3_TCX_ADDR_MODE_MASK       (0x7<<12)
+#define   TEXCOORDMODE_WRAP            0
+#define   TEXCOORDMODE_MIRROR          1
+#define   TEXCOORDMODE_CLAMP_EDGE      2
+#define   TEXCOORDMODE_CUBE            3
+#define   TEXCOORDMODE_CLAMP_BORDER    4
+#define   TEXCOORDMODE_MIRROR_ONCE      5
+#define SS3_TCY_ADDR_MODE_SHIFT      9
+#define SS3_TCY_ADDR_MODE_MASK       (0x7<<9)
+#define SS3_TCZ_ADDR_MODE_SHIFT      6
+#define SS3_TCZ_ADDR_MODE_MASK       (0x7<<6)
+#define SS3_NORMALIZED_COORDS        (1<<5)
+#define SS3_TEXTUREMAP_INDEX_SHIFT   1
+#define SS3_TEXTUREMAP_INDEX_MASK    (0xf<<1)
+#define SS3_DEINTERLACER_ENABLE      (1<<0)
+
+#define SS4_BORDER_COLOR_MASK        (~0)
+
+/* 3DSTATE_SPAN_STIPPLE, p258
+ */
+#define _3DSTATE_STIPPLE           ((0x3<<29)|(0x1d<<24)|(0x83<<16))
+#define ST1_ENABLE               (1<<16)
+#define ST1_MASK                 (0xffff)
+
+#define FLUSH_MAP_CACHE    (1<<0)
+#define FLUSH_RENDER_CACHE (1<<1)
+
+#endif
index 3fe1248..aaa57c5 100644 (file)
@@ -66,6 +66,8 @@
 #include "intel_bufmgr.h"
 #include "intel_batchbuffer.h"
 #include "intel_gpu_tools.h"
+#include "i915_reg.h"
+#include "i915_3d.h"
 
 #define CMD_POLY_STIPPLE_OFFSET       0x7906
 
@@ -202,7 +204,7 @@ static void cpucpy2d(uint32_t *src, unsigned src_stride, unsigned src_x, unsigne
                        unsigned src_ofs = src_x + j + src_stride * (src_y + i);
                        unsigned expect = logical_tile_no*TILE_SIZE*TILE_SIZE
                            + i*TILE_SIZE + j;
-                       uint32_t tmp = src[src_ofs]; 
+                       uint32_t tmp = src[src_ofs];
                        if (tmp != expect) {
                            printf("mismatch at tile %i pos %i, read %i, expected %i, diff %i\n",
                                    logical_tile_no, i*TILE_SIZE + j, tmp, expect, (int) tmp - expect);
@@ -293,10 +295,10 @@ static void blitter_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned s
                  dst_pitch);
        OUT_BATCH(dst_y << 16 | dst_x);
        OUT_BATCH((dst_y+TILE_SIZE) << 16 | (dst_x+TILE_SIZE));
-       OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+       OUT_RELOC_FENCED(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
        OUT_BATCH(src_y << 16 | src_x);
        OUT_BATCH(src_pitch);
-       OUT_RELOC(src->bo, I915_GEM_DOMAIN_RENDER, 0, 0);
+       OUT_RELOC_FENCED(src->bo, I915_GEM_DOMAIN_RENDER, 0, 0);
        ADVANCE_BATCH();
 
        if (!(keep_gpu_busy_counter & 1) && !fence_storm)
@@ -315,6 +317,241 @@ static void blitter_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned s
        }
 }
 
+/* Width of a scratch buffer, i.e. its byte stride expressed in uint32_t
+ * units. */
+static unsigned buf_width(struct scratch_buf *buf)
+{
+       unsigned dwords_per_row = buf->stride / sizeof(uint32_t);
+
+       return dwords_per_row;
+}
+
+/* Number of rows in a scratch buffer: options.scratch_buf_size divided by
+ * the buffer's stride. */
+static unsigned buf_height(struct scratch_buf *buf)
+{
+       unsigned rows = options.scratch_buf_size / buf->stride;
+
+       return rows;
+}
+
+/* Append one float to the batch as its raw 32-bit representation. */
+static void emit_vertex(float f)
+{
+       union {
+               float as_float;
+               uint32_t as_dword;
+       } bits = { .as_float = f };
+
+       OUT_BATCH(bits.as_dword);
+}
+
+/* Append f / total to the batch as a raw 32-bit float. */
+static void emit_vertex_normalized(float f, float total)
+{
+       const float ratio = f / total;
+
+       emit_vertex(ratio);
+}
+
+/*
+ * Copy one TILE_SIZE x TILE_SIZE tile from src at (src_x, src_y) to dst at
+ * (dst_x, dst_y) with the gen3 3D pipeline.
+ *
+ * The source buffer is bound as a 32bpp ARGB8888 texture sampled with
+ * nearest filtering and normalized coordinates, the destination buffer is
+ * bound as the ARGB8888 color buffer, and a single RECTLIST primitive
+ * covering the destination tile is emitted. The batch is flushed before
+ * returning.
+ *
+ * On successive calls keep_gpu_busy() is issued alternately before and
+ * after the copy commands (never when fence_storm is set).
+ *
+ * logical_tile_no is not used here; it is part of the signature shared with
+ * blitter_copyfunc(), see render_copyfunc().
+ */
+static void gen3_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+                                struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
+                                unsigned logical_tile_no)
+{
+       static unsigned keep_gpu_busy_counter = 0;
+
+       /* check both edges of the fence usage */
+       if (keep_gpu_busy_counter & 1 && !fence_storm)
+               keep_gpu_busy();
+
+       /* invariant state */
+       {
+               OUT_BATCH(_3DSTATE_AA_CMD |
+                         AA_LINE_ECAAR_WIDTH_ENABLE |
+                         AA_LINE_ECAAR_WIDTH_1_0 |
+                         AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+               OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
+                         IAB_MODIFY_ENABLE |
+                         IAB_MODIFY_FUNC | (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
+                         IAB_MODIFY_SRC_FACTOR | (BLENDFACT_ONE <<
+                                                  IAB_SRC_FACTOR_SHIFT) |
+                         IAB_MODIFY_DST_FACTOR | (BLENDFACT_ZERO <<
+                                                  IAB_DST_FACTOR_SHIFT));
+               OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
+                         CSB_TCB(0, 0) |
+                         CSB_TCB(1, 1) |
+                         CSB_TCB(2, 2) |
+                         CSB_TCB(3, 3) |
+                         CSB_TCB(4, 4) |
+                         CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
+               OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
+                         ENABLE_POINT_RASTER_RULE |
+                         OGL_POINT_RASTER_RULE |
+                         ENABLE_LINE_STRIP_PROVOKE_VRTX |
+                         ENABLE_TRI_FAN_PROVOKE_VRTX |
+                         LINE_STRIP_PROVOKE_VRTX(1) |
+                         TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
+               OUT_BATCH(_3DSTATE_MODES_4_CMD |
+                         ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(LOGICOP_COPY) |
+                         ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
+                         ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
+               OUT_BATCH(0x00000000);  /* Disable texture coordinate wrap-shortest */
+               OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
+                         S4_LINE_WIDTH_ONE |
+                         S4_CULLMODE_NONE |
+                         S4_VFMT_XY);
+               OUT_BATCH(0x00000000);  /* Stencil. */
+               OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
+               OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
+               OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);  /* disable indirect state */
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_STIPPLE);
+               OUT_BATCH(0x00000000);
+               OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
+       }
+
+       /* sampler state: bind src as the single texture map */
+       {
+#define TEX_COUNT 1
+               uint32_t tiling_bits = 0;
+               if (src->tiling != I915_TILING_NONE)
+                       tiling_bits = MS3_TILED_SURFACE;
+               if (src->tiling == I915_TILING_Y)
+                       tiling_bits |= MS3_TILE_WALK;
+
+               OUT_BATCH(_3DSTATE_MAP_STATE | (3 * TEX_COUNT));
+               OUT_BATCH((1 << TEX_COUNT) - 1);
+               OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
+               OUT_BATCH(MAPSURF_32BIT | MT_32BIT_ARGB8888 |
+                         tiling_bits |
+                         (buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
+                         (buf_width(src) - 1) << MS3_WIDTH_SHIFT);
+               /* pitch is programmed in dwords, minus 1 */
+               OUT_BATCH((src->stride/4-1) << MS4_PITCH_SHIFT);
+
+               OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
+               OUT_BATCH((1 << TEX_COUNT) - 1);
+               OUT_BATCH(MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
+                         FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
+                         FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
+               OUT_BATCH(SS3_NORMALIZED_COORDS |
+                         TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
+                         TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
+                         0 << SS3_TEXTUREMAP_INDEX_SHIFT);
+               OUT_BATCH(0x00000000);
+       }
+
+       /* render target state: bind dst as the color buffer */
+       {
+               uint32_t tiling_bits = 0;
+               if (dst->tiling != I915_TILING_NONE)
+                       tiling_bits = BUF_3D_TILED_SURFACE;
+               if (dst->tiling == I915_TILING_Y)
+                       tiling_bits |= BUF_3D_TILE_WALK_Y;
+
+               OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+               OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling_bits |
+                         BUF_3D_PITCH(dst->stride));
+               OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+
+               OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+               OUT_BATCH(COLR_BUF_ARGB8888 |
+                         DSTORG_HORT_BIAS(0x8) |
+                         DSTORG_VERT_BIAS(0x8));
+
+               /* draw rect is unconditional */
+               OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
+               OUT_BATCH(0x00000000);
+               OUT_BATCH(0x00000000);  /* ymin, xmin */
+               OUT_BATCH(DRAW_YMAX(buf_height(dst) - 1) |
+                         DRAW_XMAX(buf_width(dst) - 1));
+               /* yorig, xorig (relate to color buffer?) */
+               OUT_BATCH(0x00000000);
+       }
+
+       /* texfmt */
+       {
+               uint32_t ss2 = ~0;
+               ss2 &= ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT);
+               ss2 |= S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D);
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
+               OUT_BATCH(ss2);
+               OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
+                         BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
+                         BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
+                         BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+                         I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
+               OUT_BATCH(0); /* no vbo */
+               OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) |
+                         (4 << S1_VERTEX_PITCH_SHIFT));
+       }
+
+       /* fragment shader */
+       {
+               OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
+               /* decl FS_T0 */
+               OUT_BATCH(D0_DCL |
+                         REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
+                         REG_NR(FS_T0) << D0_NR_SHIFT |
+                         ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               /* decl FS_S0 */
+               OUT_BATCH(D0_DCL |
+                         (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
+                         (REG_NR(FS_S0) << D0_NR_SHIFT) |
+                         ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               /* texld(FS_OC, FS_S0, FS_T0) */
+               OUT_BATCH(T0_TEXLD |
+                         (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
+                         (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
+                         (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
+               OUT_BATCH((REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
+                         (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
+               OUT_BATCH(0);
+       }
+
+       /* Three vertices of four dwords each: destination x, y followed by
+        * the source coordinate normalized by the source buffer size. */
+       OUT_BATCH(PRIM3D_RECTLIST | (3*4 - 1));
+       emit_vertex(dst_x + TILE_SIZE);
+       emit_vertex(dst_y + TILE_SIZE);
+       emit_vertex_normalized(src_x + TILE_SIZE, buf_width(src));
+       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
+
+       emit_vertex(dst_x);
+       emit_vertex(dst_y + TILE_SIZE);
+       emit_vertex_normalized(src_x, buf_width(src));
+       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
+
+       emit_vertex(dst_x);
+       emit_vertex(dst_y);
+       emit_vertex_normalized(src_x, buf_width(src));
+       emit_vertex_normalized(src_y, buf_height(src));
+
+       if (!(keep_gpu_busy_counter & 1) && !fence_storm)
+               keep_gpu_busy();
+
+       keep_gpu_busy_counter++;
+
+       intel_batchbuffer_flush(batch);
+}
+
+/* Copy a tile with the render engine where a render path is dispatched
+ * here: gen3 devices use gen3_render_copyfunc(), every other device falls
+ * back to blitter_copyfunc(). All arguments are forwarded unchanged. */
+static void render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+                           struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
+                           unsigned logical_tile_no)
+{
+       if (!IS_GEN3(devid)) {
+               blitter_copyfunc(src, src_x, src_y,
+                                dst, dst_x, dst_y,
+                                logical_tile_no);
+               return;
+       }
+
+       gen3_render_copyfunc(src, src_x, src_y,
+                            dst, dst_x, dst_y,
+                            logical_tile_no);
+}
+
 static void next_copyfunc(int tile)
 {
        if (fence_storm) {
@@ -337,6 +574,10 @@ static void next_copyfunc(int tile)
                if (tile == options.trace_tile)
                        printf(" using prw\n");
                copyfunc = prw_copyfunc;
+       } else if (copyfunc_seq % 3 == 0) {
+               if (tile == options.trace_tile)
+                       printf(" using render\n");
+               copyfunc = render_copyfunc;
        } else {
                if (tile == options.trace_tile)
                        printf(" using blitter\n");
@@ -433,6 +674,7 @@ static void exchange_buf(void *array, unsigned i, unsigned j)
        memcpy(&buf_arr[j], &tmp, sizeof(struct scratch_buf));
 }
 
+
 /* libdrm is to clever and prevents us from changin tiling of buffers already
  * used in relocations. */
 static void set_tiling(drm_intel_bo *bo, unsigned *tiling, unsigned stride)
@@ -472,7 +714,6 @@ static void init_set(unsigned set)
                        gpu_busy_load = 6;
        }
 
-
        for (i = 0; i < num_buffers; i++) {
                r = random();
                if ((r & 3) != 0)
@@ -664,6 +905,7 @@ static void init(void)
 
        bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
        drm_intel_bufmgr_gem_enable_reuse(bufmgr);
+       drm_intel_bufmgr_gem_enable_fenced_relocs(bufmgr);
        devid = intel_get_drm_devid(drm_fd);
        num_fences = get_num_fences();
        batch = intel_batchbuffer_alloc(bufmgr, devid);