gem_stress: split out gen3 render copy
authorDaniel Vetter <daniel.vetter@ffwll.ch>
Thu, 31 Mar 2011 21:29:25 +0000 (23:29 +0200)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Fri, 1 Apr 2011 11:07:43 +0000 (13:07 +0200)
Header conflicts with other headers for different generations.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
tests/Makefile.am
tests/gem_stress.c
tests/gem_stress.h [new file with mode: 0644]
tests/gem_stress_i915.c [new file with mode: 0644]

index ce2f891..b09264e 100644 (file)
@@ -1,7 +1,12 @@
-bin_PROGRAMS = \
+noinst_PROGRAMS = \
        gem_stress \
        $(NULL)
 
+gem_stress_SOURCES = \
+       gem_stress.c \
+       gem_stress.h \
+       gem_stress_i915.c
+
 TESTS = getversion \
        getclient \
        getstats \
index 61e4b3c..e359d8d 100644 (file)
  * In short: designed for maximum evilness.
  */
 
-#include <stdlib.h>
-#include <sys/ioctl.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <getopt.h>
-#include "drm.h"
-#include "i915_drm.h"
-#include "drmtest.h"
-#include "intel_bufmgr.h"
-#include "intel_batchbuffer.h"
-#include "intel_gpu_tools.h"
-#include "i915_reg.h"
-#include "i915_3d.h"
+#include "gem_stress.h"
 
 #define CMD_POLY_STIPPLE_OFFSET       0x7906
 
@@ -92,36 +74,15 @@ static uint64_t gem_aperture_size(int fd)
        return aperture.aper_size;
 }
 
-struct scratch_buf {
-    drm_intel_bo *bo;
-    uint32_t stride;
-    uint32_t tiling;
-    uint32_t *data;
-    unsigned num_tiles;
-};
-
-static drm_intel_bufmgr *bufmgr;
+drm_intel_bufmgr *bufmgr;
 struct intel_batchbuffer *batch;
-static int drm_fd;
-static int devid;
-static int num_fences;
+int drm_fd;
+int devid;
+int num_fences;
 
 drm_intel_bo *busy_bo;
 
-static struct {
-    unsigned scratch_buf_size;
-    unsigned num_buffers;
-    int trace_tile;
-    int no_hw;
-    int gpu_busy_load;
-    int use_render;
-} options;
-
-#define MAX_BUFS               4096
-#define SCRATCH_BUF_SIZE       1024*1024
-#define BUSY_BUF_SIZE          (256*4096)
-#define TILE_SIZE              16
-#define TILE_BYTES             (TILE_SIZE*TILE_SIZE*sizeof(uint32_t))
+struct option_struct options;
 
 static struct scratch_buf buffers[2][MAX_BUFS];
 /* tile i is at logical position tile_permutation[i] */
@@ -133,7 +94,7 @@ static unsigned num_total_tiles = 0;
 
 #define TILES_PER_BUF          (num_total_tiles / num_buffers)
 
-static int fence_storm = 0;
+int fence_storm = 0;
 static int gpu_busy_load = 10;
 
 static void tile2xy(struct scratch_buf *buf, unsigned tile, unsigned *x, unsigned *y)
@@ -145,7 +106,7 @@ static void tile2xy(struct scratch_buf *buf, unsigned tile, unsigned *x, unsigne
 
 /* All this gem trashing wastes too much cpu time, so give the gpu something to
  * do to increase changes for races. */
-static void keep_gpu_busy(void)
+void keep_gpu_busy(void)
 {
        uint32_t src_pitch, dst_pitch, cmd_bits;
        int tmp;
@@ -318,227 +279,6 @@ static void blitter_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned s
        }
 }
 
-static unsigned buf_width(struct scratch_buf *buf)
-{
-       return buf->stride/sizeof(uint32_t);
-}
-
-static unsigned buf_height(struct scratch_buf *buf)
-{
-       return options.scratch_buf_size/buf->stride;
-}
-
-static void emit_vertex(float f)
-{
-       union { float f; uint32_t ui; } u;
-       u.f = f;
-       OUT_BATCH(u.ui);
-}
-
-static void emit_vertex_normalized(float f, float total)
-{
-       union { float f; uint32_t ui; } u;
-       u.f = f / total;
-       OUT_BATCH(u.ui);
-}
-
-static void gen3_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
-                                struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
-                                unsigned logical_tile_no)
-{
-       uint32_t src_pitch, dst_pitch, cmd_bits;
-       src_pitch = src->stride;
-       dst_pitch = dst->stride;
-       cmd_bits =  0;
-       static unsigned keep_gpu_busy_counter = 0;
-
-       /* check both edges of the fence usage */
-       if (keep_gpu_busy_counter & 1 && !fence_storm)
-               keep_gpu_busy();
-
-       /* invariant state */
-       {
-               OUT_BATCH(_3DSTATE_AA_CMD |
-                         AA_LINE_ECAAR_WIDTH_ENABLE |
-                         AA_LINE_ECAAR_WIDTH_1_0 |
-                         AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
-               OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
-                         IAB_MODIFY_ENABLE |
-                         IAB_MODIFY_FUNC | (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
-                         IAB_MODIFY_SRC_FACTOR | (BLENDFACT_ONE <<
-                                                  IAB_SRC_FACTOR_SHIFT) |
-                         IAB_MODIFY_DST_FACTOR | (BLENDFACT_ZERO <<
-                                                  IAB_DST_FACTOR_SHIFT));
-               OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
-               OUT_BATCH(0);
-               OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
-               OUT_BATCH(0);
-               OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
-               OUT_BATCH(0);
-               OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
-                         CSB_TCB(0, 0) |
-                         CSB_TCB(1, 1) |
-                         CSB_TCB(2, 2) |
-                         CSB_TCB(3, 3) |
-                         CSB_TCB(4, 4) |
-                         CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
-               OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-                         ENABLE_POINT_RASTER_RULE |
-                         OGL_POINT_RASTER_RULE |
-                         ENABLE_LINE_STRIP_PROVOKE_VRTX |
-                         ENABLE_TRI_FAN_PROVOKE_VRTX |
-                         LINE_STRIP_PROVOKE_VRTX(1) |
-                         TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
-               OUT_BATCH(_3DSTATE_MODES_4_CMD |
-                         ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(LOGICOP_COPY) |
-                         ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
-                         ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
-               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
-               OUT_BATCH(0x00000000);  /* Disable texture coordinate wrap-shortest */
-               OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
-                         S4_LINE_WIDTH_ONE |
-                         S4_CULLMODE_NONE |
-                         S4_VFMT_XY);
-               OUT_BATCH(0x00000000);  /* Stencil. */
-               OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
-               OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
-               OUT_BATCH(0);
-               OUT_BATCH(0);
-               OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
-               OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);  /* disable indirect state */
-               OUT_BATCH(0);
-               OUT_BATCH(_3DSTATE_STIPPLE);
-               OUT_BATCH(0x00000000);
-               OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
-       }
-
-       /* samler state */
-       {
-#define TEX_COUNT 1
-               uint32_t tiling_bits = 0;
-               if (src->tiling != I915_TILING_NONE)
-                       tiling_bits = MS3_TILED_SURFACE;
-               if (src->tiling == I915_TILING_Y)
-                       tiling_bits |= MS3_TILE_WALK;
-
-               OUT_BATCH(_3DSTATE_MAP_STATE | (3 * TEX_COUNT));
-               OUT_BATCH((1 << TEX_COUNT) - 1);
-               OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
-               OUT_BATCH(MAPSURF_32BIT | MT_32BIT_ARGB8888 |
-                         tiling_bits |
-                         (buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
-                         (buf_width(src) - 1) << MS3_WIDTH_SHIFT);
-               OUT_BATCH((src->stride/4-1) << MS4_PITCH_SHIFT);
-
-               OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
-               OUT_BATCH((1 << TEX_COUNT) - 1);
-               OUT_BATCH(MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
-                         FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
-                         FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
-               OUT_BATCH(SS3_NORMALIZED_COORDS |
-                         TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
-                         TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
-                         0 << SS3_TEXTUREMAP_INDEX_SHIFT);
-               OUT_BATCH(0x00000000);
-       }
-
-       /* render target state */
-       {
-               uint32_t tiling_bits = 0;
-               if (dst->tiling != I915_TILING_NONE)
-                       tiling_bits = BUF_3D_TILED_SURFACE;
-               if (dst->tiling == I915_TILING_Y)
-                       tiling_bits |= BUF_3D_TILE_WALK_Y;
-
-               OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
-               OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling_bits |
-                         BUF_3D_PITCH(dst->stride));
-               OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
-
-               OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-               OUT_BATCH(COLR_BUF_ARGB8888 |
-                         DSTORG_HORT_BIAS(0x8) |
-                         DSTORG_VERT_BIAS(0x8));
-
-               /* draw rect is unconditional */
-               OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
-               OUT_BATCH(0x00000000);
-               OUT_BATCH(0x00000000);  /* ymin, xmin */
-               OUT_BATCH(DRAW_YMAX(buf_height(dst) - 1) |
-                         DRAW_XMAX(buf_width(dst) - 1));
-               /* yorig, xorig (relate to color buffer?) */
-               OUT_BATCH(0x00000000);
-       }
-
-       /* texfmt */
-       {
-               uint32_t ss2 = ~0;
-               ss2 &= ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT);
-               ss2 |= S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D);
-               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
-               OUT_BATCH(ss2);
-               OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
-                         BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
-                         BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
-                         BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);
-               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
-                         I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
-               OUT_BATCH(0); /* no vbo */
-               OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) |
-                         (4 << S1_VERTEX_PITCH_SHIFT));
-       }
-
-       /* frage shader */
-       {
-               OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
-               /* decl FS_T0 */
-               OUT_BATCH(D0_DCL |
-                         REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
-                         REG_NR(FS_T0) << D0_NR_SHIFT |
-                         ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
-               OUT_BATCH(0);
-               OUT_BATCH(0);
-               /* decl FS_S0 */
-               OUT_BATCH(D0_DCL |
-                         (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
-                         (REG_NR(FS_S0) << D0_NR_SHIFT) |
-                         ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
-               OUT_BATCH(0);
-               OUT_BATCH(0);
-               /* texld(FS_OC, FS_S0, FS_T0 */
-               OUT_BATCH(T0_TEXLD |
-                         (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
-                         (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
-                         (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
-               OUT_BATCH((REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
-                         (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
-               OUT_BATCH(0);
-       }
-
-       OUT_BATCH(PRIM3D_RECTLIST | (3*4 - 1));
-       emit_vertex(dst_x + TILE_SIZE);
-       emit_vertex(dst_y + TILE_SIZE);
-       emit_vertex_normalized(src_x + TILE_SIZE, buf_width(src));
-       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
-
-       emit_vertex(dst_x);
-       emit_vertex(dst_y + TILE_SIZE);
-       emit_vertex_normalized(src_x, buf_width(src));
-       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
-
-       emit_vertex(dst_x);
-       emit_vertex(dst_y);
-       emit_vertex_normalized(src_x, buf_width(src));
-       emit_vertex_normalized(src_y, buf_height(src));
-
-       if (!(keep_gpu_busy_counter & 1) && !fence_storm)
-               keep_gpu_busy();
-
-       keep_gpu_busy_counter++;
-
-       intel_batchbuffer_flush(batch);
-}
-
 static void render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
                            struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
                            unsigned logical_tile_no)
diff --git a/tests/gem_stress.h b/tests/gem_stress.h
new file mode 100644 (file)
index 0000000..53ef8eb
--- /dev/null
@@ -0,0 +1,81 @@
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <getopt.h>
+#include "drm.h"
+#include "i915_drm.h"
+#include "drmtest.h"
+#include "intel_bufmgr.h"
+#include "intel_batchbuffer.h"
+#include "intel_gpu_tools.h"
+
+struct scratch_buf {
+    drm_intel_bo *bo;
+    uint32_t stride;
+    uint32_t tiling;
+    uint32_t *data;
+    unsigned num_tiles;
+};
+
+struct option_struct {
+    unsigned scratch_buf_size;
+    unsigned num_buffers;
+    int trace_tile;
+    int no_hw;
+    int gpu_busy_load;
+    int use_render;
+};
+
+extern struct option_struct options;
+extern drm_intel_bufmgr *bufmgr;
+extern struct intel_batchbuffer *batch;
+extern int drm_fd;
+extern int devid;
+
+extern int fence_storm;
+
+#define MAX_BUFS               4096
+#define SCRATCH_BUF_SIZE       1024*1024
+#define BUSY_BUF_SIZE          (256*4096)
+#define TILE_SIZE              16
+#define TILE_BYTES             (TILE_SIZE*TILE_SIZE*sizeof(uint32_t))
+
+void keep_gpu_busy(void);
+
+static inline void emit_vertex(float f)
+{
+       union { float f; uint32_t ui; } u;
+       u.f = f;
+       OUT_BATCH(u.ui);
+}
+
+static inline void emit_vertex_normalized(float f, float total)
+{
+       union { float f; uint32_t ui; } u;
+       u.f = f / total;
+       OUT_BATCH(u.ui);
+}
+
+static inline unsigned buf_width(struct scratch_buf *buf)
+{
+       return buf->stride/sizeof(uint32_t);
+}
+
+static inline unsigned buf_height(struct scratch_buf *buf)
+{
+       return options.scratch_buf_size/buf->stride;
+}
+
+void gen3_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+                         struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
+                         unsigned logical_tile_no);
+void gen2_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+                         struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
+                         unsigned logical_tile_no);
diff --git a/tests/gem_stress_i915.c b/tests/gem_stress_i915.c
new file mode 100644 (file)
index 0000000..66787c7
--- /dev/null
@@ -0,0 +1,197 @@
+#include "i915_reg.h"
+#include "i915_3d.h"
+#include "gem_stress.h"
+
+void gen3_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+                         struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
+                         unsigned logical_tile_no)
+{
+       static unsigned keep_gpu_busy_counter = 0;
+
+       /* check both edges of the fence usage */
+       if (keep_gpu_busy_counter & 1 && !fence_storm)
+               keep_gpu_busy();
+
+       /* invariant state */
+       {
+               OUT_BATCH(_3DSTATE_AA_CMD |
+                         AA_LINE_ECAAR_WIDTH_ENABLE |
+                         AA_LINE_ECAAR_WIDTH_1_0 |
+                         AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+               OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
+                         IAB_MODIFY_ENABLE |
+                         IAB_MODIFY_FUNC | (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
+                         IAB_MODIFY_SRC_FACTOR | (BLENDFACT_ONE <<
+                                                  IAB_SRC_FACTOR_SHIFT) |
+                         IAB_MODIFY_DST_FACTOR | (BLENDFACT_ZERO <<
+                                                  IAB_DST_FACTOR_SHIFT));
+               OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
+                         CSB_TCB(0, 0) |
+                         CSB_TCB(1, 1) |
+                         CSB_TCB(2, 2) |
+                         CSB_TCB(3, 3) |
+                         CSB_TCB(4, 4) |
+                         CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
+               OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
+                         ENABLE_POINT_RASTER_RULE |
+                         OGL_POINT_RASTER_RULE |
+                         ENABLE_LINE_STRIP_PROVOKE_VRTX |
+                         ENABLE_TRI_FAN_PROVOKE_VRTX |
+                         LINE_STRIP_PROVOKE_VRTX(1) |
+                         TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
+               OUT_BATCH(_3DSTATE_MODES_4_CMD |
+                         ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(LOGICOP_COPY) |
+                         ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
+                         ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
+               OUT_BATCH(0x00000000);  /* Disable texture coordinate wrap-shortest */
+               OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
+                         S4_LINE_WIDTH_ONE |
+                         S4_CULLMODE_NONE |
+                         S4_VFMT_XY);
+               OUT_BATCH(0x00000000);  /* Stencil. */
+               OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
+               OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
+               OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);  /* disable indirect state */
+               OUT_BATCH(0);
+               OUT_BATCH(_3DSTATE_STIPPLE);
+               OUT_BATCH(0x00000000);
+               OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
+       }
+
+       /* samler state */
+       {
+#define TEX_COUNT 1
+               uint32_t tiling_bits = 0;
+               if (src->tiling != I915_TILING_NONE)
+                       tiling_bits = MS3_TILED_SURFACE;
+               if (src->tiling == I915_TILING_Y)
+                       tiling_bits |= MS3_TILE_WALK;
+
+               OUT_BATCH(_3DSTATE_MAP_STATE | (3 * TEX_COUNT));
+               OUT_BATCH((1 << TEX_COUNT) - 1);
+               OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
+               OUT_BATCH(MAPSURF_32BIT | MT_32BIT_ARGB8888 |
+                         tiling_bits |
+                         (buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
+                         (buf_width(src) - 1) << MS3_WIDTH_SHIFT);
+               OUT_BATCH((src->stride/4-1) << MS4_PITCH_SHIFT);
+
+               OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
+               OUT_BATCH((1 << TEX_COUNT) - 1);
+               OUT_BATCH(MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
+                         FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
+                         FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
+               OUT_BATCH(SS3_NORMALIZED_COORDS |
+                         TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
+                         TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
+                         0 << SS3_TEXTUREMAP_INDEX_SHIFT);
+               OUT_BATCH(0x00000000);
+       }
+
+       /* render target state */
+       {
+               uint32_t tiling_bits = 0;
+               if (dst->tiling != I915_TILING_NONE)
+                       tiling_bits = BUF_3D_TILED_SURFACE;
+               if (dst->tiling == I915_TILING_Y)
+                       tiling_bits |= BUF_3D_TILE_WALK_Y;
+
+               OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+               OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling_bits |
+                         BUF_3D_PITCH(dst->stride));
+               OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+
+               OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+               OUT_BATCH(COLR_BUF_ARGB8888 |
+                         DSTORG_HORT_BIAS(0x8) |
+                         DSTORG_VERT_BIAS(0x8));
+
+               /* draw rect is unconditional */
+               OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
+               OUT_BATCH(0x00000000);
+               OUT_BATCH(0x00000000);  /* ymin, xmin */
+               OUT_BATCH(DRAW_YMAX(buf_height(dst) - 1) |
+                         DRAW_XMAX(buf_width(dst) - 1));
+               /* yorig, xorig (relate to color buffer?) */
+               OUT_BATCH(0x00000000);
+       }
+
+       /* texfmt */
+       {
+               uint32_t ss2 = ~0;
+               ss2 &= ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT);
+               ss2 |= S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D);
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
+               OUT_BATCH(ss2);
+               OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
+                         BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
+                         BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
+                         BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);
+               OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+                         I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
+               OUT_BATCH(0); /* no vbo */
+               OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) |
+                         (4 << S1_VERTEX_PITCH_SHIFT));
+       }
+
+       /* frage shader */
+       {
+               OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
+               /* decl FS_T0 */
+               OUT_BATCH(D0_DCL |
+                         REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
+                         REG_NR(FS_T0) << D0_NR_SHIFT |
+                         ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               /* decl FS_S0 */
+               OUT_BATCH(D0_DCL |
+                         (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
+                         (REG_NR(FS_S0) << D0_NR_SHIFT) |
+                         ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+               OUT_BATCH(0);
+               OUT_BATCH(0);
+               /* texld(FS_OC, FS_S0, FS_T0 */
+               OUT_BATCH(T0_TEXLD |
+                         (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
+                         (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
+                         (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
+               OUT_BATCH((REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
+                         (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
+               OUT_BATCH(0);
+       }
+
+       OUT_BATCH(PRIM3D_RECTLIST | (3*4 - 1));
+       emit_vertex(dst_x + TILE_SIZE);
+       emit_vertex(dst_y + TILE_SIZE);
+       emit_vertex_normalized(src_x + TILE_SIZE, buf_width(src));
+       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
+
+       emit_vertex(dst_x);
+       emit_vertex(dst_y + TILE_SIZE);
+       emit_vertex_normalized(src_x, buf_width(src));
+       emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));
+
+       emit_vertex(dst_x);
+       emit_vertex(dst_y);
+       emit_vertex_normalized(src_x, buf_width(src));
+       emit_vertex_normalized(src_y, buf_height(src));
+
+       if (!(keep_gpu_busy_counter & 1) && !fence_storm)
+               keep_gpu_busy();
+
+       keep_gpu_busy_counter++;
+
+       intel_batchbuffer_flush(batch);
+}
+