llvmpipe: generate two shader variants, one omits triangle in/out testing
authorBrian Paul <brianp@vmware.com>
Fri, 15 Jan 2010 18:21:16 +0000 (11:21 -0700)
committerBrian Paul <brianp@vmware.com>
Fri, 15 Jan 2010 18:21:16 +0000 (11:21 -0700)
When we know that a 4x4 pixel block is entirely inside of a triangle,
use the JIT function which omits the in/out test code.

Results in a few percent speedup in many tests.

src/gallium/drivers/llvmpipe/lp_rast.c
src/gallium/drivers/llvmpipe/lp_rast.h
src/gallium/drivers/llvmpipe/lp_rast_priv.h
src/gallium/drivers/llvmpipe/lp_rast_tri.c
src/gallium/drivers/llvmpipe/lp_setup.c
src/gallium/drivers/llvmpipe/lp_setup.h
src/gallium/drivers/llvmpipe/lp_state.h
src/gallium/drivers/llvmpipe/lp_state_fs.c

index 75562bf..d03ba17 100644 (file)
@@ -344,9 +344,6 @@ void lp_rast_set_state( struct lp_rasterizer *rast,
 
 
 
-/* Within a tile:
- */
-
 /**
  * Run the shader on all blocks in a tile.  This is used when a tile is
  * completely contained inside a triangle.
@@ -356,8 +353,8 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast,
                          unsigned thread_index,
                          const union lp_rast_cmd_arg arg )
 {
-   /* Set c1,c2,c3 to large values so the in/out test always passes */
-   const int32_t c1 = INT_MIN, c2 = INT_MIN, c3 = INT_MIN;
+   const struct lp_rast_state *state = rast->tasks[thread_index].current_state;
+   struct lp_rast_tile *tile = &rast->tasks[thread_index].tile;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
    const unsigned tile_x = rast->tasks[thread_index].x;
    const unsigned tile_y = rast->tasks[thread_index].y;
@@ -365,16 +362,35 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast,
 
    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
 
-   /* Use the existing preference for 4x4 (four quads) shading:
-    */
-   for (y = 0; y < TILE_SIZE; y += 4)
-      for (x = 0; x < TILE_SIZE; x += 4)
-         lp_rast_shade_quads( rast,
-                              thread_index,
-                              inputs,
-                              tile_x + x,
-                              tile_y + y,
-                              c1, c2, c3);
+   /* render the whole 64x64 tile in 4x4 chunks */
+   for (y = 0; y < TILE_SIZE; y += 4){
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         uint8_t *color[PIPE_MAX_COLOR_BUFS];
+         uint32_t *depth;
+         unsigned block_offset, i;
+
+         /* offset of the 16x16 pixel block within the tile */
+         block_offset = ((y / 4) * (16 * 16) + (x / 4) * 16);
+
+         /* color buffer */
+         for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+            color[i] = tile->color[i] + 4 * block_offset;
+
+         /* depth buffer */
+         depth = tile->depth + block_offset;
+
+         /* run shader */
+         state->jit_function[0]( &state->jit_context,
+                                 tile_x + x, tile_y + y,
+                                 inputs->a0,
+                                 inputs->dadx,
+                                 inputs->dady,
+                                 color,
+                                 depth,
+                                 INT_MIN, INT_MIN, INT_MIN,
+                                 NULL, NULL, NULL );
+      }
+   }
 }
 
 
@@ -411,7 +427,7 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
    iy = y % TILE_SIZE;
 
    /* offset of the 16x16 pixel block within the tile */
-   block_offset = ((iy/4)*(16*16) + (ix/4)*16);
+   block_offset = ((iy / 4) * (16 * 16) + (ix / 4) * 16);
 
    /* color buffer */
    for (i = 0; i < rast->state.fb.nr_cbufs; i++)
@@ -433,7 +449,7 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
 #endif
 
    /* run shader */
-   state->jit_function( &state->jit_context,
+   state->jit_function[1]( &state->jit_context,
                         x, y,
                         inputs->a0,
                         inputs->dadx,
@@ -445,8 +461,6 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
 }
 
 
-/* End of tile:
- */
 
 
 /**
index d926adb..2a97fe4 100644 (file)
@@ -66,8 +66,10 @@ struct lp_rast_state {
    
    /* The shader itself.  Probably we also need to pass a pointer to
     * the tile color/z/stencil data somehow:
-    */
-   lp_jit_frag_func jit_function;
+    * jit_function[0] skips the triangle in/out test code
+    * jit_function[1] does triangle in/out testing
+     */
+   lp_jit_frag_func jit_function[2];
 
    boolean opaque;
 };
index 5afdeab..607968e 100644 (file)
@@ -30,6 +30,7 @@
 
 #include "pipe/p_thread.h"
 #include "lp_rast.h"
+#include "lp_tile_soa.h"
 
 
 #define MAX_THREADS 8  /* XXX probably temporary here */
@@ -126,4 +127,46 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
                           unsigned x, unsigned y,
                           int32_t c1, int32_t c2, int32_t c3);
 
+
+/**
+ * Shade all pixels in a 4x4 block.  The fragment code omits the
+ * triangle in/out tests.
+ *
+ * Calls jit_function[0] of the task's current state, handing it
+ * pointers into the task's tile color and depth storage for the block.
+ *
+ * \param rast  the rasterizer whose per-thread task state is used
+ * \param thread_index  index into rast->tasks[] selecting that task
+ * \param inputs  supplies a0, dadx and dady, which are passed to the shader
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE void
+lp_rast_shade_quads_all( struct lp_rasterizer *rast,
+                         unsigned thread_index,
+                         const struct lp_rast_shader_inputs *inputs,
+                         unsigned x, unsigned y )
+{
+   const struct lp_rast_state *state = rast->tasks[thread_index].current_state;
+   struct lp_rast_tile *tile = &rast->tasks[thread_index].tile;
+   const unsigned ix = x % TILE_SIZE, iy = y % TILE_SIZE;
+   uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   void *depth;
+   unsigned block_offset, i;
+
+   /* Offset of the 4x4 block containing (ix, iy) within the tile.
+    * Moving one block to the right adds 16 (presumably one entry per
+    * pixel of a 4x4 block) and moving one row of blocks down adds
+    * 16 * 16, i.e. 16 blocks per row -- assumes TILE_SIZE is 64;
+    * TODO confirm.
+    */
+   block_offset = (iy / 4) * (16 * 16) + (ix / 4) * 16;
+
+   /* color buffer: the block offset is scaled by 4 on the uint8_t
+    * pointers, presumably four bytes per pixel -- TODO confirm */
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+      color[i] = tile->color[i] + 4 * block_offset;
+
+   /* depth buffer: indexed by the unscaled block offset, in units of
+    * tile->depth's element type */
+   depth = tile->depth + block_offset;
+
+   /* run shader: INT_MIN and NULL stand in for what are presumably the
+    * c1/c2/c3 edge values and step pointers used by the in/out-testing
+    * variant; this variant is expected not to read them -- TODO confirm
+    * against the lp_jit_frag_func signature */
+   state->jit_function[0]( &state->jit_context,
+                           x, y,
+                           inputs->a0,
+                           inputs->dadx,
+                           inputs->dady,
+                           color,
+                           depth,
+                           INT_MIN, INT_MIN, INT_MIN,
+                           NULL, NULL, NULL );
+}
+
+
 #endif
index bc7397f..9c3f699 100644 (file)
@@ -89,13 +89,10 @@ block_full_4( struct lp_rasterizer_task *rast_task,
               const struct lp_rast_triangle *tri,
               int x, int y )
 {
-   /* Set c1,c2,c3 to large values so the in/out test always passes */
-   const int32_t c1 = INT_MIN, c2 = INT_MIN, c3 = INT_MIN;
-   lp_rast_shade_quads(rast_task->rast,
-                       rast_task->thread_index,
-                       &tri->inputs, 
-                       x, y,
-                       c1, c2, c3);
+   lp_rast_shade_quads_all(rast_task->rast,
+                           rast_task->thread_index,
+                           &tri->inputs, 
+                           x, y);
 }
 
 
index 284337e..355c051 100644 (file)
@@ -362,14 +362,16 @@ lp_setup_set_fs_inputs( struct setup_context *setup,
 }
 
 void
-lp_setup_set_fs_function( struct setup_context *setup,
-                          lp_jit_frag_func jit_function,
-                          boolean opaque )
+lp_setup_set_fs_functions( struct setup_context *setup,
+                           lp_jit_frag_func jit_function0,
+                           lp_jit_frag_func jit_function1,
+                           boolean opaque )
 {
-   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) jit_function);
+   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) jit_function0);
    /* FIXME: reference count */
 
-   setup->fs.current.jit_function = jit_function;
+   setup->fs.current.jit_function[0] = jit_function0;
+   setup->fs.current.jit_function[1] = jit_function1;
    setup->fs.current.opaque = opaque;
    setup->dirty |= LP_SETUP_NEW_FS;
 }
index c7ef3d3..407f752 100644 (file)
@@ -96,9 +96,10 @@ lp_setup_set_fs_inputs( struct setup_context *setup,
                         unsigned nr );
 
 void
-lp_setup_set_fs_function( struct setup_context *setup,
-                          lp_jit_frag_func jit_function,
-                          boolean opaque );
+lp_setup_set_fs_functions( struct setup_context *setup,
+                           lp_jit_frag_func jit_function0,
+                           lp_jit_frag_func jit_function1,
+                           boolean opaque );
 
 void
 lp_setup_set_fs_constants(struct setup_context *setup,
index ddb152c..224b6e5 100644 (file)
@@ -88,9 +88,9 @@ struct lp_fragment_shader_variant
 
    struct lp_fragment_shader_variant_key key;
 
-   LLVMValueRef function;
+   LLVMValueRef function[2];
 
-   lp_jit_frag_func jit_function;
+   lp_jit_frag_func jit_function[2];
 
    struct lp_fragment_shader_variant *next;
 };
index f15fca2..a8f4a4e 100644 (file)
@@ -349,9 +349,26 @@ generate_scissor_test(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Build a constant for a vector of four signed 32-bit integers.
+ *
+ * Fills in a struct lp_type describing that vector and hands it,
+ * together with the given value, to lp_build_int_const_scalar().
+ * Presumably every element of the result holds the given value --
+ * confirm against lp_build_int_const_scalar().
+ *
+ * \param value  the integer the constant is built from
+ * \return the LLVMValueRef produced by lp_build_int_const_scalar()
+ */
+static LLVMValueRef
+build_int32_vec_const(int value)
+{
+   struct lp_type i32_type;
+
+   memset(&i32_type, 0, sizeof i32_type); /* fields not set below stay zero */
+   i32_type.floating = FALSE; /* values are integers */
+   i32_type.sign = TRUE;      /* values are signed */
+   i32_type.norm = FALSE;     /* values are not normalized */
+   i32_type.width = 32;       /* 32-bit int values */
+   i32_type.length = 4;       /* 4 elements per vector */
+   return lp_build_int_const_scalar(i32_type, value);
+}
+
+
+
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
  * \param i  which quad in the tile, in range [0,3]
+ * \param do_tri_test  if 1, do triangle edge in/out testing
  */
 static void
 generate_fs(struct llvmpipe_context *lp,
@@ -366,6 +383,7 @@ generate_fs(struct llvmpipe_context *lp,
             LLVMValueRef *pmask,
             LLVMValueRef (*color)[4],
             LLVMValueRef depth_ptr,
+            unsigned do_tri_test,
             LLVMValueRef c0,
             LLVMValueRef c1,
             LLVMValueRef c2,
@@ -411,8 +429,13 @@ generate_fs(struct llvmpipe_context *lp,
    lp_build_flow_scope_declare(flow, &z);
 
    /* do triangle edge testing */
-   generate_tri_edge_mask(builder, i, pmask,
-                          c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+   if (do_tri_test) {
+      generate_tri_edge_mask(builder, i, pmask,
+                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+   }
+   else {
+      *pmask = build_int32_vec_const(~0);
+   }
 
    /* 'mask' will control execution based on quad's pixel alive/killed state */
    lp_build_mask_begin(&mask, flow, type, *pmask);
@@ -563,7 +586,8 @@ generate_blend(const struct pipe_blend_state *blend,
 static void
 generate_fragment(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
-                  struct lp_fragment_shader_variant *variant)
+                  struct lp_fragment_shader_variant *variant,
+                  unsigned do_tri_test)
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    const struct lp_fragment_shader_variant_key *key = &variant->key;
@@ -656,7 +680,7 @@ generate_fragment(struct llvmpipe_context *lp,
    function = LLVMAddFunction(screen->module, "shader", func_type);
    LLVMSetFunctionCallConv(function, LLVMCCallConv);
 
-   variant->function = function;
+   variant->function[do_tri_test] = function;
 
 
    /* XXX: need to propagate noalias down into color param now we are
@@ -738,6 +762,7 @@ generate_fragment(struct llvmpipe_context *lp,
                   &fs_mask[i], /* output */
                   out_color,
                   depth_ptr_i,
+                  do_tri_test,
                   c0, c1, c2,
                   step0_ptr, step1_ptr, step2_ptr);
 
@@ -812,10 +837,10 @@ generate_fragment(struct llvmpipe_context *lp,
    /*
     * Translate the LLVM IR into machine code.
     */
-   variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, function);
+   variant->jit_function[do_tri_test] = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, function);
 
    if (LP_DEBUG & DEBUG_ASM)
-      lp_disassemble(variant->jit_function);
+      lp_disassemble(variant->jit_function[do_tri_test]);
 }
 
 
@@ -887,7 +912,8 @@ generate_variant(struct llvmpipe_context *lp,
    variant->shader = shader;
    memcpy(&variant->key, key, sizeof *key);
 
-   generate_fragment(lp, shader, variant);
+   generate_fragment(lp, shader, variant, 0);
+   generate_fragment(lp, shader, variant, 1);
 
    /* insert new variant into linked list */
    variant->next = shader->variants;
@@ -947,11 +973,15 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
    variant = shader->variants;
    while(variant) {
       struct lp_fragment_shader_variant *next = variant->next;
+      unsigned i;
 
-      if(variant->function) {
-         if(variant->jit_function)
-            LLVMFreeMachineCodeForFunction(screen->engine, variant->function);
-         LLVMDeleteFunction(variant->function);
+      for (i = 0; i < Elements(variant->function); i++) {
+         if (variant->function[i]) {
+            if (variant->jit_function[i])
+               LLVMFreeMachineCodeForFunction(screen->engine,
+                                              variant->function[i]);
+            LLVMDeleteFunction(variant->function[i]);
+         }
       }
 
       FREE(variant);
@@ -1093,7 +1123,8 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
             !shader->info.uses_kill
             ? TRUE : FALSE;
 
-   lp_setup_set_fs_function(lp->setup, 
-                            shader->current->jit_function,
-                            opaque);
+   lp_setup_set_fs_functions(lp->setup, 
+                             shader->current->jit_function[0],
+                             shader->current->jit_function[1],
+                             opaque);
 }