From d4b64167b56f780d0dea73193c345622888fbc16 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 17 Jun 2010 21:19:09 +0100 Subject: [PATCH] llvmpipe: pass mask into fragment shader Move this code back out to C for now, will generate separately. Shader now takes a mask parameter instead of C0/C1/C2/etc. Shader does not currently use that parameter and rasterizes whole pixel stamps always. --- src/gallium/drivers/llvmpipe/lp_jit.c | 16 - src/gallium/drivers/llvmpipe/lp_jit.h | 26 +- src/gallium/drivers/llvmpipe/lp_perf.c | 39 +- src/gallium/drivers/llvmpipe/lp_perf.h | 1 + src/gallium/drivers/llvmpipe/lp_rast.c | 126 ++--- src/gallium/drivers/llvmpipe/lp_rast.h | 82 ++-- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 63 ++- src/gallium/drivers/llvmpipe/lp_rast_tri.c | 179 +------ src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 238 +++++++++ src/gallium/drivers/llvmpipe/lp_setup.c | 24 +- src/gallium/drivers/llvmpipe/lp_setup_context.h | 1 - src/gallium/drivers/llvmpipe/lp_setup_tri.c | 609 +++++++++++++++--------- src/gallium/drivers/llvmpipe/lp_state_fs.c | 286 +++-------- src/gallium/drivers/llvmpipe/lp_state_fs.h | 1 - 14 files changed, 901 insertions(+), 790 deletions(-) create mode 100644 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 23aa34d..8e6dfb2 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -103,10 +103,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen) elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatType(); elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = LLVMInt32Type(); elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32Type(); - elem_types[LP_JIT_CTX_SCISSOR_XMIN] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_YMIN] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_XMAX] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_YMAX] = LLVMFloatType(); elem_types[LP_JIT_CTX_BLEND_COLOR] = 
LLVMPointerType(LLVMInt8Type(), 0); elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); @@ -125,18 +121,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen) LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back, screen->target, context_type, LP_JIT_CTX_STENCIL_REF_BACK); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmin, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_XMIN); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymin, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_YMIN); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmax, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_XMAX); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymax, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_YMAX); LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color, screen->target, context_type, LP_JIT_CTX_BLEND_COLOR); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 8d06e65..c941894 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -89,9 +89,6 @@ struct lp_jit_context uint32_t stencil_ref_front, stencil_ref_back; - /** floats, not ints */ - float scissor_xmin, scissor_ymin, scissor_xmax, scissor_ymax; - /* FIXME: store (also?) 
in floats */ uint8_t *blend_color; @@ -108,10 +105,6 @@ enum { LP_JIT_CTX_ALPHA_REF, LP_JIT_CTX_STENCIL_REF_FRONT, LP_JIT_CTX_STENCIL_REF_BACK, - LP_JIT_CTX_SCISSOR_XMIN, - LP_JIT_CTX_SCISSOR_YMIN, - LP_JIT_CTX_SCISSOR_XMAX, - LP_JIT_CTX_SCISSOR_YMAX, LP_JIT_CTX_BLEND_COLOR, LP_JIT_CTX_TEXTURES, LP_JIT_CTX_COUNT @@ -130,18 +123,6 @@ enum { #define lp_jit_context_stencil_ref_back_value(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back") -#define lp_jit_context_scissor_xmin_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMIN, "scissor_xmin") - -#define lp_jit_context_scissor_ymin_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMIN, "scissor_ymin") - -#define lp_jit_context_scissor_xmax_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMAX, "scissor_xmax") - -#define lp_jit_context_scissor_ymax_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMAX, "scissor_ymax") - #define lp_jit_context_blend_color(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color") @@ -160,12 +141,7 @@ typedef void const void *dady, uint8_t **color, void *depth, - const int32_t c1, - const int32_t c2, - const int32_t c3, - const int32_t *step1, - const int32_t *step2, - const int32_t *step3, + uint32_t mask, uint32_t *counter); diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c index a316597..083e7e3 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.c +++ b/src/gallium/drivers/llvmpipe/lp_perf.c @@ -46,10 +46,10 @@ lp_print_counters(void) { if (LP_DEBUG & DEBUG_COUNTERS) { unsigned total_64, total_16, total_4; - float p1, p2, p3; + float p1, p2, p3, p4; - debug_printf("llvmpipe: nr_triangles: %9u\n", lp_count.nr_tris); - debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); + debug_printf("llvmpipe: nr_triangles: 
%9u\n", lp_count.nr_tris); + debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); total_64 = (lp_count.nr_empty_64 + lp_count.nr_fully_covered_64 + @@ -58,10 +58,13 @@ lp_print_counters(void) p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64; p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64; p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64; + p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; - debug_printf("llvmpipe: nr_empty_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); - debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); - debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); + debug_printf("llvmpipe: nr_64x64: %9u\n", total_64); + debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); + debug_printf("llvmpipe: nr_shade_opaque_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64); + debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); + debug_printf("llvmpipe: nr_empty_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); total_16 = (lp_count.nr_empty_16 + lp_count.nr_fully_covered_16 + @@ -71,25 +74,27 @@ lp_print_counters(void) p2 = 100.0 * (float) lp_count.nr_fully_covered_16 / (float) total_16; p3 = 100.0 * (float) lp_count.nr_partially_covered_16 / (float) total_16; - debug_printf("llvmpipe: nr_empty_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); - debug_printf("llvmpipe: nr_fully_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); - debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); + debug_printf("llvmpipe: 
nr_16x16: %9u\n", total_16); + debug_printf("llvmpipe: nr_fully_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); + debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); + debug_printf("llvmpipe: nr_empty_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4); p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4; p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4; - debug_printf("llvmpipe: nr_empty_4x4: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); - debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%2.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); + debug_printf("llvmpipe: nr_4x4: %9u\n", total_4); + debug_printf("llvmpipe: nr_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); + debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); - debug_printf("llvmpipe: nr_color_tile_clear: %9u\n", lp_count.nr_color_tile_clear); - debug_printf("llvmpipe: nr_color_tile_load: %9u\n", lp_count.nr_color_tile_load); - debug_printf("llvmpipe: nr_color_tile_store: %9u\n", lp_count.nr_color_tile_store); + debug_printf("llvmpipe: nr_color_tile_clear: %9u\n", lp_count.nr_color_tile_clear); + debug_printf("llvmpipe: nr_color_tile_load: %9u\n", lp_count.nr_color_tile_load); + debug_printf("llvmpipe: nr_color_tile_store: %9u\n", lp_count.nr_color_tile_store); - debug_printf("llvmpipe: nr_llvm_compiles: %u\n", lp_count.nr_llvm_compiles); - debug_printf("llvmpipe: total LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); - debug_printf("llvmpipe: average LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); + debug_printf("llvmpipe: nr_llvm_compiles: %u\n", lp_count.nr_llvm_compiles); + debug_printf("llvmpipe: total LLVM compile time: %.2f sec\n", 
lp_count.llvm_compile_time / 1000000.0); + debug_printf("llvmpipe: average LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); } } diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h index a9629da..4774f64 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.h +++ b/src/gallium/drivers/llvmpipe/lp_perf.h @@ -44,6 +44,7 @@ struct lp_counters unsigned nr_empty_64; unsigned nr_fully_covered_64; unsigned nr_partially_covered_64; + unsigned nr_shade_opaque_64; unsigned nr_empty_16; unsigned nr_fully_covered_16; unsigned nr_partially_covered_16; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 1dde327..0130e39 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -28,6 +28,7 @@ #include #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_rect.h" #include "util/u_surface.h" #include "lp_scene_queue.h" @@ -136,7 +137,6 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, struct lp_rasterizer *rast = task->rast; struct lp_scene *scene = rast->curr_scene; enum lp_texture_usage usage; - unsigned buf; LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); @@ -146,24 +146,8 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, task->x = x; task->y = y; - if (scene->has_color_clear) - usage = LP_TEX_USAGE_WRITE_ALL; - else - usage = LP_TEX_USAGE_READ_WRITE; - - /* get pointers to color tile(s) */ - for (buf = 0; buf < rast->state.nr_cbufs; buf++) { - struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; - struct llvmpipe_resource *lpt; - assert(cbuf); - lpt = llvmpipe_resource(cbuf->texture); - task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, - cbuf->face + cbuf->zslice, - cbuf->level, - usage, - x, y); - assert(task->color_tiles[buf]); - } + /* reset pointers to color tile(s) */ + memset(task->color_tiles, 0, sizeof(task->color_tiles)); /* get pointer to 
depth/stencil tile */ { @@ -222,7 +206,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, clear_color[2] == clear_color[3]) { /* clear to grayscale value {x, x, x, x} */ for (i = 0; i < rast->state.nr_cbufs; i++) { - uint8_t *ptr = task->color_tiles[i]; + uint8_t *ptr = + lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4); } } @@ -234,7 +219,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, */ const unsigned chunk = TILE_SIZE / 4; for (i = 0; i < rast->state.nr_cbufs; i++) { - uint8_t *c = task->color_tiles[i]; + uint8_t *c = + lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); unsigned j; for (j = 0; j < 4 * TILE_SIZE; j++) { @@ -378,8 +364,8 @@ lp_rast_load_color(struct lp_rasterizer_task *task, * This is a bin command which is stored in all bins. */ void -lp_rast_store_color( struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) +lp_rast_store_linear_color( struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) { struct lp_rasterizer *rast = task->rast; struct lp_scene *scene = rast->curr_scene; @@ -448,30 +434,54 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, /* run shader on 4x4 block */ variant->jit_function[RAST_WHOLE]( &state->jit_context, - tile_x + x, tile_y + y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - INT_MIN, INT_MIN, INT_MIN, - NULL, NULL, NULL, &task->vis_counter); + tile_x + x, tile_y + y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + 0xffff, + &task->vis_counter); } } } /** - * Compute shading for a 4x4 block of pixels. + * Run the shader on all blocks in a tile. This is used when a tile is + * completely contained inside a triangle, and the shader is opaque. + * This is a bin command called during bin processing. 
+ */ +void +lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + struct lp_rasterizer *rast = task->rast; + unsigned i; + + LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + + /* this will prevent converting the layout from tiled to linear */ + for (i = 0; i < rast->state.nr_cbufs; i++) { + (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + } + + lp_rast_shade_tile(task, arg); +} + + +/** + * Compute shading for a 4x4 block of pixels inside a triangle. * This is a bin command called during bin processing. * \param x X position of quad in window coords * \param y Y position of quad in window coords */ -void lp_rast_shade_quads( struct lp_rasterizer_task *task, - const struct lp_rast_shader_inputs *inputs, - unsigned x, unsigned y, - int32_t c1, int32_t c2, int32_t c3) +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask) { const struct lp_rast_state *state = task->current_state; struct lp_fragment_shader_variant *variant = state->variant; @@ -501,27 +511,21 @@ void lp_rast_shade_quads( struct lp_rasterizer_task *task, assert(lp_check_alignment(state->jit_context.blend_color, 16)); - assert(lp_check_alignment(inputs->step[0], 16)); - assert(lp_check_alignment(inputs->step[1], 16)); - assert(lp_check_alignment(inputs->step[2], 16)); - /* run shader on 4x4 block */ - variant->jit_function[RAST_EDGE_TEST]( &state->jit_context, - x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - c1, c2, c3, - inputs->step[0], - inputs->step[1], - inputs->step[2], - &task->vis_counter); + variant->jit_function[RAST_EDGE_TEST](&state->jit_context, + x, y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + mask, + &task->vis_counter); } + /** * Set top row and left column of the tile's pixels to white. For debugging. 
*/ @@ -717,10 +721,17 @@ static struct { { RAST(clear_color), RAST(clear_zstencil), - RAST(triangle), + RAST(triangle_1), + RAST(triangle_2), + RAST(triangle_3), + RAST(triangle_4), + RAST(triangle_5), + RAST(triangle_6), + RAST(triangle_7), RAST(shade_tile), + RAST(shade_tile_opaque), RAST(set_state), - RAST(store_color), + RAST(store_linear_color), RAST(fence), RAST(begin_query), RAST(end_query), @@ -775,7 +786,8 @@ is_empty_bin( const struct cmd_bin *bin ) } for (i = 0; i < head->count; i++) - if (head->cmd[i] != lp_rast_set_state) { + if (head->cmd[i] != lp_rast_set_state && + head->cmd[i] != lp_rast_store_linear_color) { return FALSE; } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 80ca68f..ae73e6d 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -83,9 +83,6 @@ struct lp_rast_shader_inputs { float (*a0)[4]; float (*dadx)[4]; float (*dady)[4]; - - /* edge/step info for 3 edges and 4x4 block of pixels */ - PIPE_ALIGN_VAR(16) int step[3][16]; }; struct lp_rast_clearzs { @@ -93,6 +90,22 @@ struct lp_rast_clearzs { unsigned clearzs_mask; }; +struct lp_rast_plane { + /* one-pixel sized trivial accept offsets for each plane */ + int ei; + + /* one-pixel sized trivial reject offsets for each plane */ + int eo; + + /* edge function values at minx,miny ?? */ + int c; + + int dcdx; + int dcdy; + + /* edge/step info for 3 edges and 4x4 block of pixels */ + const int *step; +}; /** * Rasterization information for a triangle known to be in this bin, @@ -101,35 +114,16 @@ struct lp_rast_clearzs { * Objects of this type are put into the lp_setup_context::data buffer. 
*/ struct lp_rast_triangle { + /* inputs for the shader */ + PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs; + + int step[3][16]; + #ifdef DEBUG float v[3][2]; #endif - /* one-pixel sized trivial accept offsets for each plane */ - int ei1; - int ei2; - int ei3; - - /* one-pixel sized trivial reject offsets for each plane */ - int eo1; - int eo2; - int eo3; - - /* y deltas for vertex pairs (in fixed pt) */ - int dy12; - int dy23; - int dy31; - - /* x deltas for vertex pairs (in fixed pt) */ - int dx12; - int dx23; - int dx31; - - /* edge function values at minx,miny ?? */ - int c1, c2, c3; - - /* inputs for the shader */ - PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs; + struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */ }; @@ -153,7 +147,10 @@ lp_rast_finish( struct lp_rasterizer *rast ); union lp_rast_cmd_arg { const struct lp_rast_shader_inputs *shade_tile; - const struct lp_rast_triangle *triangle; + struct { + const struct lp_rast_triangle *tri; + unsigned plane_mask; + } triangle; const struct lp_rast_state *set_state; uint8_t clear_color[4]; const struct lp_rast_clearzs *clear_zstencil; @@ -173,10 +170,12 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) } static INLINE union lp_rast_cmd_arg -lp_rast_arg_triangle( const struct lp_rast_triangle *triangle ) +lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, + unsigned plane_mask) { union lp_rast_cmd_arg arg; - arg.triangle = triangle; + arg.triangle.tri = triangle; + arg.triangle.plane_mask = plane_mask; return arg; } @@ -229,16 +228,31 @@ void lp_rast_clear_zstencil( struct lp_rasterizer_task *, void lp_rast_set_state( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); -void lp_rast_triangle( struct lp_rasterizer_task *, - const union lp_rast_cmd_arg ); +void lp_rast_triangle_1( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_2( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void 
lp_rast_triangle_3( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_4( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_5( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_6( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_7( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); void lp_rast_shade_tile( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); +void lp_rast_shade_tile_opaque( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + void lp_rast_fence( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); -void lp_rast_store_color( struct lp_rasterizer_task *, +void lp_rast_store_linear_color( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index eb4175d..024a28b 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -119,10 +119,12 @@ struct lp_rasterizer }; -void lp_rast_shade_quads( struct lp_rasterizer_task *task, - const struct lp_rast_shader_inputs *inputs, - unsigned x, unsigned y, - int32_t c1, int32_t c2, int32_t c3); +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask); + /** @@ -158,6 +160,40 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, /** + * Get pointer to the swizzled color tile + */ +static INLINE uint8_t * +lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, + unsigned buf, enum lp_texture_usage usage) +{ + struct lp_rasterizer *rast = task->rast; + + assert(task->x % TILE_SIZE == 0); + assert(task->y % TILE_SIZE == 0); + assert(buf < rast->state.nr_cbufs); + + if (!task->color_tiles[buf]) { + struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; + struct 
llvmpipe_resource *lpt; + assert(cbuf); + lpt = llvmpipe_resource(cbuf->texture); + task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, + cbuf->face + cbuf->zslice, + cbuf->level, + usage, + task->x, + task->y); + if (!task->color_tiles[buf]) { + /* out of memory - use dummy tile memory */ + return lp_get_dummy_tile(); + } + } + + return task->color_tiles[buf]; +} + + +/** * Get the pointer to a 4x4 color block (within a 64x64 tile). * We'll map the color buffer on demand here. * Note that this may be called even when there's no color buffers - return @@ -174,6 +210,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, assert((x % TILE_VECTOR_WIDTH) == 0); assert((y % TILE_VECTOR_HEIGHT) == 0); + color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE); color = task->color_tiles[buf]; if (!color) { /* out of memory - use dummy tile memory */ @@ -217,15 +254,15 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, /* run shader on 4x4 block */ variant->jit_function[RAST_WHOLE]( &state->jit_context, - x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - INT_MIN, INT_MIN, INT_MIN, - NULL, NULL, NULL, &task->vis_counter ); + x, y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + 0xffff, + &task->vis_counter ); } diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index a5f0d14..ebe9a8e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -113,168 +113,31 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } +#define TAG(x) x##_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" -/** - * Pass the 4x4 pixel block to the shader function. - * Determination of which of the 16 pixels lies inside the triangle - * will be done as part of the fragment shader. 
- */ -static void -do_block_4(struct lp_rasterizer_task *task, - const struct lp_rast_triangle *tri, - int x, int y, - int c1, int c2, int c3) -{ - assert(x >= 0); - assert(y >= 0); - - lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3); -} - - -/** - * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out - * of the triangle's bounds. - */ -static void -do_block_16(struct lp_rasterizer_task *task, - const struct lp_rast_triangle *tri, - int x, int y, - int c0, int c1, int c2) -{ - unsigned mask = 0; - int eo[3]; - int c[3]; - int i, j; - - assert(x >= 0); - assert(y >= 0); - assert(x % 16 == 0); - assert(y % 16 == 0); - - eo[0] = tri->eo1 * 4; - eo[1] = tri->eo2 * 4; - eo[2] = tri->eo3 * 4; - - c[0] = c0; - c[1] = c1; - c[2] = c2; - - for (j = 0; j < 3; j++) { - const int *step = tri->inputs.step[j]; - const int cx = c[j] + eo[j]; - - /* Mask has bits set whenever we are outside any of the edges. - */ - for (i = 0; i < 16; i++) { - int out = cx + step[i] * 4; - mask |= (out >> 31) & (1 << i); - } - } +#define TAG(x) x##_2 +#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" - mask = ~mask & 0xffff; - while (mask) { - int i = ffs(mask) - 1; - int px = x + pos_table4[i][0]; - int py = y + pos_table4[i][1]; - int cx1 = c0 + tri->inputs.step[0][i] * 4; - int cx2 = c1 + tri->inputs.step[1][i] * 4; - int cx3 = c2 + tri->inputs.step[2][i] * 4; +#define TAG(x) x##_3 +#define NR_PLANES 3 +#include "lp_rast_tri_tmp.h" - mask &= ~(1 << i); +#define TAG(x) x##_4 +#define NR_PLANES 4 +#include "lp_rast_tri_tmp.h" - /* Don't bother testing if the 4x4 block is entirely in/out of - * the triangle. It's a little faster to do it in the jit code. - */ - LP_COUNT(nr_non_empty_4); - do_block_4(task, tri, px, py, cx1, cx2, cx3); - } -} - - -/** - * Scan the tile in chunks and figure out which pixels to rasterize - * for this triangle. 
- */ -void -lp_rast_triangle(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - const struct lp_rast_triangle *tri = arg.triangle; - const int x = task->x, y = task->y; - int ei[3], eo[3], c[3]; - unsigned outmask, inmask, partial_mask; - unsigned i, j; - - c[0] = tri->c1 + tri->dx12 * y - tri->dy12 * x; - c[1] = tri->c2 + tri->dx23 * y - tri->dy23 * x; - c[2] = tri->c3 + tri->dx31 * y - tri->dy31 * x; - - eo[0] = tri->eo1 * 16; - eo[1] = tri->eo2 * 16; - eo[2] = tri->eo3 * 16; - - ei[0] = tri->ei1 * 16; - ei[1] = tri->ei2 * 16; - ei[2] = tri->ei3 * 16; - - outmask = 0; - inmask = 0xffff; +#define TAG(x) x##_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" - for (j = 0; j < 3; j++) { - const int *step = tri->inputs.step[j]; - const int cox = c[j] + eo[j]; - const int cio = ei[j]- eo[j]; +#define TAG(x) x##_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" - /* Outmask has bits set whenever we are outside any of the - * edges. - */ - /* Inmask has bits set whenever we are inside all of the edges. 
- */ - for (i = 0; i < 16; i++) { - int out = cox + step[i] * 16; - int in = out + cio; - outmask |= (out >> 31) & (1 << i); - inmask &= ~((in >> 31) & (1 << i)); - } - } +#define TAG(x) x##_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" - assert((outmask & inmask) == 0); - - if (outmask == 0xffff) - return; - - /* Invert mask, so that bits are set whenever we are at least - * partially inside all of the edges: - */ - partial_mask = ~inmask & ~outmask & 0xffff; - - /* Iterate over partials: - */ - while (partial_mask) { - int i = ffs(partial_mask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; - int cx1 = c[0] + tri->inputs.step[0][i] * 16; - int cx2 = c[1] + tri->inputs.step[1][i] * 16; - int cx3 = c[2] + tri->inputs.step[2][i] * 16; - - partial_mask &= ~(1 << i); - - LP_COUNT(nr_partially_covered_16); - do_block_16(task, tri, px, py, cx1, cx2, cx3); - } - - /* Iterate over fulls: - */ - while (inmask) { - int i = ffs(inmask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; - - inmask &= ~(1 << i); - - LP_COUNT(nr_fully_covered_16); - block_full_16(task, tri, px, py); - } -} diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h new file mode 100644 index 0000000..a410c61 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -0,0 +1,238 @@ +/************************************************************************** + * + * Copyright 2007-2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Rasterization for binned triangles within a tile + */ + + + +/** + * Prototype for a 7 plane rasterizer function. Will codegenerate + * several of these. + * + * XXX: Variants for more/fewer planes. + * XXX: Need ways of dropping planes as we descend. 
+ * XXX: SIMD + */ +static void +TAG(do_block_4)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int *c) +{ + unsigned mask = 0; + int i; + + for (i = 0; i < 16; i++) { + int any_negative = 0; + int j; + + for (j = 0; j < NR_PLANES; j++) + any_negative |= (c[j] - 1 + plane[j].step[i]); + + any_negative >>= 31; + + mask |= (~any_negative) & (1 << i); + } + + /* Now pass to the shader: + */ + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); +} + +/** + * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out + * of the triangle's bounds. + */ +static void +TAG(do_block_16)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int *c) +{ + unsigned outmask, inmask, partmask, partial_mask; + unsigned i, j; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < NR_PLANES; j++) { + const int *step = plane[j].step; + const int eo = plane[j].eo * 4; + const int ei = plane[j].ei * 4; + const int cox = c[j] + eo; + const int cio = ei - 1 - eo; + + for (i = 0; i < 16; i++) { + int out = cox + step[i] * 4; + int part = out + cio; + outmask |= (out >> 31) & (1 << i); + partmask |= (part >> 31) & (1 << i); + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + int cx[NR_PLANES]; + + for (j = 0; j < NR_PLANES; j++) + cx[j] = c[j] + 
plane[j].step[i] * 4; + + partial_mask &= ~(1 << i); + + TAG(do_block_4)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + + inmask &= ~(1 << i); + + block_full_4(task, tri, px, py); + } +} + + +/** + * Scan the tile in chunks and figure out which pixels to rasterize + * for this triangle. + */ +void +TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + unsigned plane_mask = arg.triangle.plane_mask; + const int x = task->x, y = task->y; + struct lp_rast_plane plane[NR_PLANES]; + int c[NR_PLANES]; + unsigned outmask, inmask, partmask, partial_mask; + unsigned i, j, nr_planes = 0; + + while (plane_mask) { + int i = ffs(plane_mask) - 1; + plane[nr_planes] = tri->plane[i]; + plane_mask &= ~(1 << i); + nr_planes++; + }; + + assert(nr_planes == NR_PLANES); + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < NR_PLANES; j++) { + const int *step = plane[j].step; + const int eo = plane[j].eo * 16; + const int ei = plane[j].ei * 16; + int cox, cio; + + c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + cox = c[j] + eo; + cio = ei - 1 - eo; + + for (i = 0; i < 16; i++) { + int out = cox + step[i] * 16; + int part = out + cio; + outmask |= (out >> 31) & (1 << i); + partmask |= (part >> 31) & (1 << i); + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int 
px = x + pos_table16[i][0]; + int py = y + pos_table16[i][1]; + int cx[NR_PLANES]; + + for (j = 0; j < NR_PLANES; j++) + cx[j] = c[j] + plane[j].step[i] * 16; + + partial_mask &= ~(1 << i); + + LP_COUNT(nr_partially_covered_16); + TAG(do_block_16)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int px = x + pos_table16[i][0]; + int py = y + pos_table16[i][1]; + + inmask &= ~(1 << i); + + LP_COUNT(nr_fully_covered_16); + block_full_16(task, tri, px, py); + } +} + +#undef TAG +#undef NR_PLANES + diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 3b83f4e..40959e6 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -287,7 +287,7 @@ lp_setup_flush( struct lp_setup_context *setup, * data to linear in the texture_unmap() function, which will * not be a parallel/threaded operation as here. */ - lp_scene_bin_everywhere(scene, lp_rast_store_color, dummy); + lp_scene_bin_everywhere(scene, lp_rast_store_linear_color, dummy); } @@ -752,28 +752,6 @@ lp_setup_update_state( struct lp_setup_context *setup ) setup->dirty |= LP_SETUP_NEW_FS; } - if (setup->dirty & LP_SETUP_NEW_SCISSOR) { - float *stored; - - stored = lp_scene_alloc_aligned(scene, 4 * sizeof(int32_t), 16); - - if (stored) { - stored[0] = (float) setup->scissor.current.minx; - stored[1] = (float) setup->scissor.current.miny; - stored[2] = (float) setup->scissor.current.maxx; - stored[3] = (float) setup->scissor.current.maxy; - - setup->scissor.stored = stored; - - setup->fs.current.jit_context.scissor_xmin = stored[0]; - setup->fs.current.jit_context.scissor_ymin = stored[1]; - setup->fs.current.jit_context.scissor_xmax = stored[2]; - setup->fs.current.jit_context.scissor_ymax = stored[3]; - } - - setup->dirty |= LP_SETUP_NEW_FS; - } - if(setup->dirty & LP_SETUP_NEW_CONSTANTS) { struct pipe_resource *buffer = setup->constants.current; diff --git 
a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 8f4e00f..0cea779 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -130,7 +130,6 @@ struct lp_setup_context struct { struct pipe_scissor_state current; - const void *stored; } scissor; unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 4e2e17f..036b549 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -38,12 +38,78 @@ #define NUM_CHANNELS 4 +struct tri_info { + + float pixel_offset; + + /* fixed point vertex coordinates */ + int x[3]; + int y[3]; + + /* float x,y deltas - all from the original coordinates + */ + float dy01, dy20; + float dx01, dx20; + float oneoverarea; + + const float (*v0)[4]; + const float (*v1)[4]; + const float (*v2)[4]; + + boolean frontfacing; +}; + + + +static const int step_scissor_minx[16] = { + 0, 1, 0, 1, + 2, 3, 2, 3, + 0, 1, 0, 1, + 2, 3, 2, 3 +}; + +static const int step_scissor_maxx[16] = { + 0, -1, 0, -1, + -2, -3, -2, -3, + 0, -1, 0, -1, + -2, -3, -2, -3 +}; + +static const int step_scissor_miny[16] = { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3 +}; + +static const int step_scissor_maxy[16] = { + 0, 0, -1, -1, + 0, 0, -1, -1, + -2, -2, -3, -3, + -2, -2, -3, -3 +}; + + + + +static INLINE int +subpixel_snap(float a) +{ + return util_iround(FIXED_ONE * a); +} + +static INLINE float +fixed_to_float(int a) +{ + return a * (1.0 / FIXED_ONE); +} + + /** * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
*/ -static void constant_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, +static void constant_coef( struct lp_rast_triangle *tri, unsigned slot, const float value, unsigned i ) @@ -54,28 +120,21 @@ static void constant_coef( struct lp_setup_context *setup, } -/** - * Compute a0, dadx and dady for a linearly interpolated coefficient, - * for a triangle. - */ -static void linear_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, + +static void linear_coef( struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned vert_attr, unsigned i) { - float a1 = v1[vert_attr][i]; - float a2 = v2[vert_attr][i]; - float a3 = v3[vert_attr][i]; + float a0 = info->v0[vert_attr][i]; + float a1 = info->v1[vert_attr][i]; + float a2 = info->v2[vert_attr][i]; - float da12 = a1 - a2; - float da31 = a3 - a1; - float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; - float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; + float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; tri->inputs.dadx[slot][i] = dadx; tri->inputs.dady[slot][i] = dady; @@ -92,9 +151,9 @@ static void linear_coef( struct lp_setup_context *setup, * to define a0 as the sample at a pixel center somewhere near vmin * instead - i'll switch to this later. 
*/ - tri->inputs.a0[slot][i] = (a1 - - (dadx * (v1[0][0] - setup->pixel_offset) + - dady * (v1[0][1] - setup->pixel_offset))); + tri->inputs.a0[slot][i] = (a0 - + (dadx * (info->v0[0][0] - info->pixel_offset) + + dady * (info->v0[0][1] - info->pixel_offset))); } @@ -106,31 +165,27 @@ static void linear_coef( struct lp_setup_context *setup, * Later, when we compute the value at a particular fragment position we'll * divide the interpolated value by the interpolated W at that fragment. */ -static void perspective_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, +static void perspective_coef( struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned vert_attr, unsigned i) { /* premultiply by 1/w (v[0][3] is always 1/w): */ - float a1 = v1[vert_attr][i] * v1[0][3]; - float a2 = v2[vert_attr][i] * v2[0][3]; - float a3 = v3[vert_attr][i] * v3[0][3]; - float da12 = a1 - a2; - float da31 = a3 - a1; - float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; - float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; + float a0 = info->v0[vert_attr][i] * info->v0[0][3]; + float a1 = info->v1[vert_attr][i] * info->v1[0][3]; + float a2 = info->v2[vert_attr][i] * info->v2[0][3]; + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; + float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; tri->inputs.dadx[slot][i] = dadx; tri->inputs.dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a1 - - (dadx * (v1[0][0] - setup->pixel_offset) + - dady * (v1[0][1] - setup->pixel_offset))); + tri->inputs.a0[slot][i] = (a0 - + (dadx * (info->v0[0][0] - info->pixel_offset) + + dady * (info->v0[0][1] - info->pixel_offset))); } @@ -141,13 +196,9 @@ static void perspective_coef( struct lp_setup_context *setup, * We could do a bit less work if we'd examine 
gl_FragCoord's swizzle mask. */ static void -setup_fragcoord_coef(struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, +setup_fragcoord_coef(struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned usage_mask) { /*X*/ @@ -166,12 +217,12 @@ setup_fragcoord_coef(struct lp_setup_context *setup, /*Z*/ if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 2); + linear_coef(tri, info, slot, 0, 2); } /*W*/ if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 3); + linear_coef(tri, info, slot, 0, 3); } } @@ -180,24 +231,23 @@ setup_fragcoord_coef(struct lp_setup_context *setup, * Setup the fragment input attribute with the front-facing value. * \param frontface is the triangle front facing? */ -static void setup_facing_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, +static void setup_facing_coef( struct lp_rast_triangle *tri, unsigned slot, boolean frontface, unsigned usage_mask) { /* convert TRUE to 1.0 and FALSE to -1.0 */ if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( setup, tri, slot, 2.0f * frontface - 1.0f, 0 ); + constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 ); if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( setup, tri, slot, 0.0f, 1 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 1 ); /* wasted */ if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( setup, tri, slot, 0.0f, 2 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 2 ); /* wasted */ if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( setup, tri, slot, 0.0f, 3 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 3 ); /* wasted */ } @@ -206,11 +256,7 @@ static void setup_facing_coef( struct lp_setup_context *setup, */ static void setup_tri_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *tri, - float oneoverarea, - const float 
(*v1)[4], - const float (*v2)[4], - const float (*v3)[4], - boolean frontface) + const struct tri_info *info) { unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; @@ -227,25 +273,25 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, if (setup->flatshade_first) { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, v1[vert_attr][i], i); + constant_coef(tri, slot+1, info->v0[vert_attr][i], i); } else { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, v3[vert_attr][i], i); + constant_coef(tri, slot+1, info->v2[vert_attr][i], i); } break; case LP_INTERP_LINEAR: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - linear_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); + linear_coef(tri, info, slot+1, vert_attr, i); break; case LP_INTERP_PERSPECTIVE: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - perspective_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); + perspective_coef(tri, info, slot+1, vert_attr, i); fragcoord_usage_mask |= TGSI_WRITEMASK_W; break; @@ -259,7 +305,7 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, break; case LP_INTERP_FACING: - setup_facing_coef(setup, tri, slot+1, frontface, usage_mask); + setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask); break; default: @@ -269,16 +315,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, /* The internal position input is in slot zero: */ - setup_fragcoord_coef(setup, tri, oneoverarea, 0, v1, v2, v3, - fragcoord_usage_mask); + setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask); } -static INLINE int subpixel_snap( float a ) -{ - return util_iround(FIXED_ONE * a - (FIXED_ONE / 2)); -} @@ -291,21 +332,25 @@ static INLINE int subpixel_snap( float a ) * \return pointer to triangle space */ static INLINE struct lp_rast_triangle * -alloc_triangle(struct lp_scene *scene, 
unsigned nr_inputs, unsigned *tri_size) +alloc_triangle(struct lp_scene *scene, + unsigned nr_inputs, + unsigned nr_planes, + unsigned *tri_size) { unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); struct lp_rast_triangle *tri; - unsigned bytes; + unsigned tri_bytes, bytes; char *inputs; assert(sizeof(*tri) % 16 == 0); - bytes = sizeof(*tri) + (3 * input_array_sz); + tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16); + bytes = tri_bytes + (3 * input_array_sz); tri = lp_scene_alloc_aligned( scene, bytes, 16 ); if (tri) { - inputs = (char *) (tri + 1); + inputs = ((char *)tri) + tri_bytes; tri->inputs.a0 = (float (*)[4]) inputs; tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz); tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz); @@ -329,52 +374,71 @@ print_triangle(struct lp_setup_context *setup, uint i; debug_printf("llvmpipe triangle\n"); - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v1[%d]: %f %f %f %f\n", i, v1[i][0], v1[i][1], v1[i][2], v1[i][3]); } - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v2[%d]: %f %f %f %f\n", i, v2[i][0], v2[i][1], v2[i][2], v2[i][3]); } - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v3[%d]: %f %f %f %f\n", i, v3[i][0], v3[i][1], v3[i][2], v3[i][3]); } } +lp_rast_cmd lp_rast_tri_tab[8] = { + NULL, /* should be impossible */ + lp_rast_triangle_1, + lp_rast_triangle_2, + lp_rast_triangle_3, + lp_rast_triangle_4, + lp_rast_triangle_5, + lp_rast_triangle_6, + lp_rast_triangle_7 +}; + /** * Do basic setup for triangle rasterization and determine which * framebuffer tiles are touched. Put the triangle in the scene's * bins for the tiles which we overlap. 
*/ -static void +static void do_triangle_ccw(struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4], const float (*v3)[4], boolean frontfacing ) { - /* x/y positions in fixed point */ - const int x1 = subpixel_snap(v1[0][0] + 0.5 - setup->pixel_offset); - const int x2 = subpixel_snap(v2[0][0] + 0.5 - setup->pixel_offset); - const int x3 = subpixel_snap(v3[0][0] + 0.5 - setup->pixel_offset); - const int y1 = subpixel_snap(v1[0][1] + 0.5 - setup->pixel_offset); - const int y2 = subpixel_snap(v2[0][1] + 0.5 - setup->pixel_offset); - const int y3 = subpixel_snap(v3[0][1] + 0.5 - setup->pixel_offset); struct lp_scene *scene = lp_setup_get_current_scene(setup); + struct lp_fragment_shader_variant *variant = setup->fs.current.variant; struct lp_rast_triangle *tri; + struct tri_info info; int area; - float oneoverarea; int minx, maxx, miny, maxy; + int ix0, ix1, iy0, iy1; unsigned tri_bytes; - + int i; + int nr_planes = 3; + if (0) print_triangle(setup, v1, v2, v3); - tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes); + if (setup->scissor_test) { + nr_planes = 7; + } + else { + nr_planes = 3; + } + + + tri = alloc_triangle(scene, + setup->fs.nr_inputs, + nr_planes, + &tri_bytes); if (!tri) return; @@ -387,15 +451,24 @@ do_triangle_ccw(struct lp_setup_context *setup, tri->v[2][1] = v3[0][1]; #endif - tri->dx12 = x1 - x2; - tri->dx23 = x2 - x3; - tri->dx31 = x3 - x1; + /* x/y positions in fixed point */ + info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset); + info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset); + info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset); + info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset); + info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset); + info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset); + + tri->plane[0].dcdy = info.x[0] - info.x[1]; + tri->plane[1].dcdy = info.x[1] - info.x[2]; + tri->plane[2].dcdy = info.x[2] - info.x[0]; - tri->dy12 = y1 - y2; - tri->dy23 = y2 - y3; 
- tri->dy31 = y3 - y1; + tri->plane[0].dcdx = info.y[0] - info.y[1]; + tri->plane[1].dcdx = info.y[1] - info.y[2]; + tri->plane[2].dcdx = info.y[2] - info.y[0]; - area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12); + area = (tri->plane[0].dcdy * tri->plane[2].dcdx - + tri->plane[2].dcdy * tri->plane[0].dcdx); LP_COUNT(nr_tris); @@ -410,20 +483,35 @@ do_triangle_ccw(struct lp_setup_context *setup, } /* Bounding rectangle (in pixels) */ - minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; - miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; - + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->pixel_offset != 0) ? 1 : 0; + + minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + } + if (setup->scissor_test) { minx = MAX2(minx, setup->scissor.current.minx); maxx = MIN2(maxx, setup->scissor.current.maxx); miny = MAX2(miny, setup->scissor.current.miny); maxy = MIN2(maxy, setup->scissor.current.maxy); } + else { + minx = MAX2(minx, 0); + miny = MAX2(miny, 0); + maxx = MIN2(maxx, scene->fb.width); + maxy = MIN2(maxy, scene->fb.height); + } + - if (miny == maxy || - minx == maxx) { + if (miny >= maxy || minx >= maxx) { lp_scene_putback_data( scene, tri_bytes ); LP_COUNT(nr_culled_tris); return; @@ -431,75 +519,87 @@ do_triangle_ccw(struct lp_setup_context *setup, /* */ - oneoverarea = ((float)FIXED_ONE) / (float)area; + info.pixel_offset = setup->pixel_offset; + 
info.v0 = v1; + info.v1 = v2; + info.v2 = v3; + info.dx01 = info.v0[0][0] - info.v1[0][0]; + info.dx20 = info.v2[0][0] - info.v0[0][0]; + info.dy01 = info.v0[0][1] - info.v1[0][1]; + info.dy20 = info.v2[0][1] - info.v0[0][1]; + info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01); + info.frontfacing = frontfacing; /* Setup parameter interpolants: */ - setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing ); + setup_tri_coefficients( setup, tri, &info ); tri->inputs.facing = frontfacing ? 1.0F : -1.0F; - /* half-edge constants, will be interated over the whole render target. - */ - tri->c1 = tri->dy12 * x1 - tri->dx12 * y1; - tri->c2 = tri->dy23 * x2 - tri->dx23 * y2; - tri->c3 = tri->dy31 * x3 - tri->dx31 * y3; - /* correct for top-left fill convention: - */ - if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++; - if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++; - if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++; - - tri->dy12 *= FIXED_ONE; - tri->dy23 *= FIXED_ONE; - tri->dy31 *= FIXED_ONE; - - tri->dx12 *= FIXED_ONE; - tri->dx23 *= FIXED_ONE; - tri->dx31 *= FIXED_ONE; - - /* find trivial reject offsets for each edge for a single-pixel - * sized block. These will be scaled up at each recursive level to - * match the active blocksize. Scaling in this way works best if - * the blocks are square. - */ - tri->eo1 = 0; - if (tri->dy12 < 0) tri->eo1 -= tri->dy12; - if (tri->dx12 > 0) tri->eo1 += tri->dx12; + + for (i = 0; i < 3; i++) { + struct lp_rast_plane *plane = &tri->plane[i]; - tri->eo2 = 0; - if (tri->dy23 < 0) tri->eo2 -= tri->dy23; - if (tri->dx23 > 0) tri->eo2 += tri->dx23; + /* half-edge constants, will be interated over the whole render + * target. + */ + plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i]; + + /* correct for top-left vs. bottom-left fill convention. 
+ * + * note that we're overloading gl_rasterization_rules to mean + * both (0.5,0.5) pixel centers *and* bottom-left filling + * convention. + * + * GL actually has a top-left filling convention, but GL's + * notion of "top" differs from gallium's... + * + * Also, sometimes (in FBO cases) GL will render upside down + * to its usual method, in which case it will probably want + * to use the opposite, top-left convention. + */ + if (plane->dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane->c++; + } + else if (plane->dcdx == 0) { + if (setup->pixel_offset == 0) { + /* correct for top-left fill convention: + */ + if (plane->dcdy > 0) plane->c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane->dcdy < 0) plane->c++; + } + } - tri->eo3 = 0; - if (tri->dy31 < 0) tri->eo3 -= tri->dy31; - if (tri->dx31 > 0) tri->eo3 += tri->dx31; + plane->dcdx *= FIXED_ONE; + plane->dcdy *= FIXED_ONE; - /* Calculate trivial accept offsets from the above. - */ - tri->ei1 = tri->dx12 - tri->dy12 - tri->eo1; - tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2; - tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3; + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane->eo = 0; + if (plane->dcdx < 0) plane->eo -= plane->dcdx; + if (plane->dcdy > 0) plane->eo += plane->dcdy; - /* Fill in the inputs.step[][] arrays. - * We've manually unrolled some loops here. 
- */ - { - const int xstep1 = -tri->dy12; - const int xstep2 = -tri->dy23; - const int xstep3 = -tri->dy31; - const int ystep1 = tri->dx12; - const int ystep2 = tri->dx23; - const int ystep3 = tri->dx31; - -#define SETUP_STEP(i, x, y) \ - do { \ - tri->inputs.step[0][i] = x * xstep1 + y * ystep1; \ - tri->inputs.step[1][i] = x * xstep2 + y * ystep2; \ - tri->inputs.step[2][i] = x * xstep3 + y * ystep3; \ - } while (0) + /* Calculate trivial accept offsets from the above. + */ + plane->ei = plane->dcdy - plane->dcdx - plane->eo; + plane->step = tri->step[i]; + + /* Fill in the inputs.step[][] arrays. + * We've manually unrolled some loops here. + */ +#define SETUP_STEP(j, x, y) \ + tri->step[i][j] = y * plane->dcdy - x * plane->dcdx + SETUP_STEP(0, 0, 0); SETUP_STEP(1, 1, 0); SETUP_STEP(2, 0, 1); @@ -522,63 +622,106 @@ do_triangle_ccw(struct lp_setup_context *setup, #undef STEP } + + /* + * When rasterizing scissored tris, use the intersection of the + * triangle bounding box and the scissor rect to generate the + * scissor planes. + * + * This permits us to cut off the triangle "tails" that are present + * in the intermediate recursive levels caused when two of the + * triangles edges don't diverge quickly enough to trivially reject + * exterior blocks from the triangle. + * + * It's not really clear if it's worth worrying about these tails, + * but since we generate the planes for each scissored tri, it's + * free to trim them in this case. + * + * Note that otherwise, the scissor planes only vary in 'C' value, + * and even then only on state-changes. Could alternatively store + * these planes elsewhere. 
+ */ + if (nr_planes == 7) { + tri->plane[3].step = step_scissor_minx; + tri->plane[3].dcdx = -1; + tri->plane[3].dcdy = 0; + tri->plane[3].c = 1-minx; + tri->plane[3].ei = 0; + tri->plane[3].eo = 1; + + tri->plane[4].step = step_scissor_maxx; + tri->plane[4].dcdx = 1; + tri->plane[4].dcdy = 0; + tri->plane[4].c = maxx; + tri->plane[4].ei = -1; + tri->plane[4].eo = 0; + + tri->plane[5].step = step_scissor_miny; + tri->plane[5].dcdx = 0; + tri->plane[5].dcdy = 1; + tri->plane[5].c = 1-miny; + tri->plane[5].ei = 0; + tri->plane[5].eo = 1; + + tri->plane[6].step = step_scissor_maxy; + tri->plane[6].dcdx = 0; + tri->plane[6].dcdy = -1; + tri->plane[6].c = maxy; + tri->plane[6].ei = -1; + tri->plane[6].eo = 0; + } + + /* * All fields of 'tri' are now set. The remaining code here is * concerned with binning. */ - /* Convert to tile coordinates: + /* Convert to tile coordinates, and inclusive ranges: */ - minx = minx / TILE_SIZE; - miny = miny / TILE_SIZE; - maxx = maxx / TILE_SIZE; - maxy = maxy / TILE_SIZE; + ix0 = minx / TILE_SIZE; + iy0 = miny / TILE_SIZE; + ix1 = (maxx-1) / TILE_SIZE; + iy1 = (maxy-1) / TILE_SIZE; /* * Clamp to framebuffer size */ - minx = MAX2(minx, 0); - miny = MAX2(miny, 0); - maxx = MIN2(maxx, scene->tiles_x - 1); - maxy = MIN2(maxy, scene->tiles_y - 1); + assert(ix0 == MAX2(ix0, 0)); + assert(iy0 == MAX2(iy0, 0)); + assert(ix1 == MIN2(ix1, scene->tiles_x - 1)); + assert(iy1 == MIN2(iy1, scene->tiles_y - 1)); /* Determine which tile(s) intersect the triangle's bounding box */ - if (miny == maxy && minx == maxx) + if (iy0 == iy1 && ix0 == ix1) { /* Triangle is contained in a single tile: */ - lp_scene_bin_command( scene, minx, miny, lp_rast_triangle, - lp_rast_arg_triangle(tri) ); + lp_scene_bin_command( scene, ix0, iy0, + lp_rast_tri_tab[nr_planes], + lp_rast_arg_triangle(tri, (1<c1 + - tri->dx12 * miny * TILE_SIZE - - tri->dy12 * minx * TILE_SIZE); - int c2 = (tri->c2 + - tri->dx23 * miny * TILE_SIZE - - tri->dy23 * minx * TILE_SIZE); - int c3 = 
(tri->c3 + - tri->dx31 * miny * TILE_SIZE - - tri->dy31 * minx * TILE_SIZE); - - int ei1 = tri->ei1 << TILE_ORDER; - int ei2 = tri->ei2 << TILE_ORDER; - int ei3 = tri->ei3 << TILE_ORDER; - - int eo1 = tri->eo1 << TILE_ORDER; - int eo2 = tri->eo2 << TILE_ORDER; - int eo3 = tri->eo3 << TILE_ORDER; - - int xstep1 = -(tri->dy12 << TILE_ORDER); - int xstep2 = -(tri->dy23 << TILE_ORDER); - int xstep3 = -(tri->dy31 << TILE_ORDER); - - int ystep1 = tri->dx12 << TILE_ORDER; - int ystep2 = tri->dx23 << TILE_ORDER; - int ystep3 = tri->dx31 << TILE_ORDER; + int c[7]; + int ei[7]; + int eo[7]; + int xstep[7]; + int ystep[7]; int x, y; + + for (i = 0; i < nr_planes; i++) { + c[i] = (tri->plane[i].c + + tri->plane[i].dcdy * iy0 * TILE_SIZE - + tri->plane[i].dcdx * ix0 * TILE_SIZE); + + ei[i] = tri->plane[i].ei << TILE_ORDER; + eo[i] = tri->plane[i].eo << TILE_ORDER; + xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER); + ystep[i] = tri->plane[i].dcdy << TILE_ORDER; + } + /* Test tile-sized blocks against the triangle. @@ -586,32 +729,49 @@ do_triangle_ccw(struct lp_setup_context *setup, * contained inside the tri, bin an lp_rast_shade_tile command. * Else, bin a lp_rast_triangle command. */ - for (y = miny; y <= maxy; y++) + for (y = iy0; y <= iy1; y++) { - int cx1 = c1; - int cx2 = c2; - int cx3 = c3; boolean in = FALSE; /* are we inside the triangle? 
*/ + int cx[7]; + + for (i = 0; i < nr_planes; i++) + cx[i] = c[i]; - for (x = minx; x <= maxx; x++) + for (x = ix0; x <= ix1; x++) { - if (cx1 + eo1 < 0 || - cx2 + eo2 < 0 || - cx3 + eo3 < 0) - { - /* do nothing */ + int out = 0; + int partial = 0; + + for (i = 0; i < nr_planes; i++) { + int planeout = cx[i] + eo[i]; + int planepartial = cx[i] + ei[i] - 1; + out |= (planeout >> 31); + partial |= (planepartial >> 31) & (1< 0 && - cx2 + ei2 > 0 && - cx3 + ei3 > 0) - { + } + else if (partial) { + /* Not trivially accepted by at least one plane - + * rasterize/shade partial tile + */ + int count = util_bitcount(partial); + in = TRUE; + lp_scene_bin_command( scene, x, y, + lp_rast_tri_tab[count], + lp_rast_arg_triangle(tri, partial) ); + + LP_COUNT(nr_partially_covered_64); + } + else { /* triangle covers the whole tile- shade whole tile */ LP_COUNT(nr_fully_covered_64); - in = TRUE; - if (setup->fs.current.variant->opaque && + in = TRUE; + if (variant->opaque && !setup->fb.zsbuf) { lp_scene_bin_reset( scene, x, y ); lp_scene_bin_command( scene, x, y, @@ -621,29 +781,18 @@ do_triangle_ccw(struct lp_setup_context *setup, lp_scene_bin_command( scene, x, y, lp_rast_shade_tile, lp_rast_arg_inputs(&tri->inputs) ); - } - else - { - /* rasterizer/shade partial tile */ - LP_COUNT(nr_partially_covered_64); - in = TRUE; - lp_scene_bin_command( scene, x, y, - lp_rast_triangle, - lp_rast_arg_triangle(tri) ); - } + } /* Iterate cx values across the region: */ - cx1 += xstep1; - cx2 += xstep2; - cx3 += xstep3; + for (i = 0; i < nr_planes; i++) + cx[i] += xstep[i]; } /* Iterate c values down the region: */ - c1 += ystep1; - c2 += ystep2; - c3 += ystep3; + for (i = 0; i < nr_planes; i++) + c[i] += ystep[i]; } } } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 6511505..5953d69 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -31,9 +31,6 @@ * Code generate the whole 
fragment pipeline. * * The fragment pipeline consists of the following stages: - * - triangle edge in/out testing - * - scissor test - * - stipple (TBI) * - early depth test * - fragment shader * - alpha test @@ -97,6 +94,7 @@ #include "lp_state.h" #include "lp_tex_sample.h" #include "lp_flush.h" +#include "lp_state_fs.h" #include @@ -170,177 +168,63 @@ generate_depth_stencil(LLVMBuilderRef builder, /** - * Generate the code to do inside/outside triangle testing for the + * Expand the relevent bits of mask_input to a 4-dword mask for the * four pixels in a 2x2 quad. This will set the four elements of the * quad mask vector to 0 or ~0. - * \param i which quad of the quad group to test, in [0,3] + * + * \param quad which quad of the quad group to test, in [0,3] + * \param mask_input bitwise mask for the whole 4x4 stamp */ -static void -generate_tri_edge_mask(LLVMBuilderRef builder, - unsigned i, - LLVMValueRef *mask, /* ivec4, out */ - LLVMValueRef c0, /* int32 */ - LLVMValueRef c1, /* int32 */ - LLVMValueRef c2, /* int32 */ - LLVMValueRef step0_ptr, /* ivec4 */ - LLVMValueRef step1_ptr, /* ivec4 */ - LLVMValueRef step2_ptr) /* ivec4 */ +static LLVMValueRef +generate_quad_mask(LLVMBuilderRef builder, + struct lp_type fs_type, + unsigned quad, + LLVMValueRef mask_input) /* int32 */ { -#define OPTIMIZE_IN_OUT_TEST 0 -#if OPTIMIZE_IN_OUT_TEST - struct lp_build_if_state ifctx; - LLVMValueRef not_draw_all; -#endif - struct lp_build_flow_context *flow; - struct lp_type i32_type; - LLVMTypeRef i32vec4_type; - LLVMValueRef c0_vec, c1_vec, c2_vec; - LLVMValueRef in_out_mask; - - assert(i < 4); - - /* int32 vector type */ - memset(&i32_type, 0, sizeof i32_type); - i32_type.floating = FALSE; /* values are integers */ - i32_type.sign = TRUE; /* values are signed */ - i32_type.norm = FALSE; /* values are not normalized */ - i32_type.width = 32; /* 32-bit int values */ - i32_type.length = 4; /* 4 elements per vector */ - - i32vec4_type = lp_build_int32_vec4_type(); + struct 
lp_type mask_type; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef bits[4]; + LLVMValueRef mask; /* - * Use a conditional here to do detailed pixel in/out testing. - * We only have to do this if c0 != INT_MIN. + * XXX: We'll need a different path for 16 x u8 */ - flow = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow); - - { -#if OPTIMIZE_IN_OUT_TEST - /* not_draw_all = (c0 != INT_MIN) */ - not_draw_all = LLVMBuildICmp(builder, - LLVMIntNE, - c0, - LLVMConstInt(LLVMInt32Type(), INT_MIN, 0), - ""); - - in_out_mask = lp_build_const_int_vec(i32_type, ~0); - - - lp_build_flow_scope_declare(flow, &in_out_mask); - - /* if (not_draw_all) {... */ - lp_build_if(&ifctx, flow, builder, not_draw_all); -#endif - { - LLVMValueRef step0_vec, step1_vec, step2_vec; - LLVMValueRef m0_vec, m1_vec, m2_vec; - LLVMValueRef index, m; - - /* c0_vec = {c0, c0, c0, c0} - * Note that we emit this code four times but LLVM optimizes away - * three instances of it. - */ - c0_vec = lp_build_broadcast(builder, i32vec4_type, c0); - c1_vec = lp_build_broadcast(builder, i32vec4_type, c1); - c2_vec = lp_build_broadcast(builder, i32vec4_type, c2); - lp_build_name(c0_vec, "edgeconst0vec"); - lp_build_name(c1_vec, "edgeconst1vec"); - lp_build_name(c2_vec, "edgeconst2vec"); - - /* load step0vec, step1, step2 vec from memory */ - index = LLVMConstInt(LLVMInt32Type(), i, 0); - step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), ""); - step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), ""); - step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), ""); - lp_build_name(step0_vec, "step0vec"); - lp_build_name(step1_vec, "step1vec"); - lp_build_name(step2_vec, "step2vec"); - - /* m0_vec = step0_ptr[i] > c0_vec */ - m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec); - m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec); - m2_vec = 
lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec); - - /* in_out_mask = m0_vec & m1_vec & m2_vec */ - m = LLVMBuildAnd(builder, m0_vec, m1_vec, ""); - in_out_mask = LLVMBuildAnd(builder, m, m2_vec, ""); - lp_build_name(in_out_mask, "inoutmaskvec"); - } -#if OPTIMIZE_IN_OUT_TEST - lp_build_endif(&ifctx); -#endif - - } - lp_build_flow_scope_end(flow); - lp_build_flow_destroy(flow); + assert(fs_type.width == 32); + assert(fs_type.length == 4); + mask_type = lp_int_type(fs_type); - /* This is the initial alive/dead pixel mask for a quad of four pixels. - * It's an int[4] vector with each word set to 0 or ~0. - * Words will get cleared when pixels faile the Z test, etc. + /* + * mask_input >>= (quad * 4) */ - *mask = in_out_mask; -} - - -static LLVMValueRef -generate_scissor_test(LLVMBuilderRef builder, - LLVMValueRef context_ptr, - const struct lp_build_interp_soa_context *interp, - struct lp_type type) -{ - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1]; - LLVMValueRef xmin, ymin, xmax, ymax; - LLVMValueRef m0, m1, m2, m3, m; - - /* xpos, ypos contain the window coords for the four pixels in the quad */ - assert(xpos); - assert(ypos); - - /* get the current scissor bounds, convert to vectors */ - xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr); - xmin = lp_build_broadcast(builder, vec_type, xmin); - - ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr); - ymin = lp_build_broadcast(builder, vec_type, ymin); - xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr); - xmax = lp_build_broadcast(builder, vec_type, xmax); + mask_input = LLVMBuildLShr(builder, + mask_input, + LLVMConstInt(i32t, quad * 4, 0), + ""); - ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr); - ymax = lp_build_broadcast(builder, vec_type, ymax); + /* + * mask = { mask_input & (1 << i), for i in [0,3] } + */ - /* compare the fragment's position coordinates against the 
scissor bounds */ - m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin); - m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin); - m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax); - m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax); + mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input); - /* AND all the masks together */ - m = LLVMBuildAnd(builder, m0, m1, ""); - m = LLVMBuildAnd(builder, m, m2, ""); - m = LLVMBuildAnd(builder, m, m3, ""); + bits[0] = LLVMConstInt(i32t, 1 << 0, 0); + bits[1] = LLVMConstInt(i32t, 1 << 1, 0); + bits[2] = LLVMConstInt(i32t, 1 << 2, 0); + bits[3] = LLVMConstInt(i32t, 1 << 3, 0); - lp_build_name(m, "scissormask"); + mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), ""); - return m; -} + /* + * mask = mask != 0 ? ~0 : 0 + */ + mask = lp_build_compare(builder, + mask_type, PIPE_FUNC_NOTEQUAL, + mask, + lp_build_const_int_vec(mask_type, 0)); -static LLVMValueRef -build_int32_vec_const(int value) -{ - struct lp_type i32_type; - - memset(&i32_type, 0, sizeof i32_type); - i32_type.floating = FALSE; /* values are integers */ - i32_type.sign = TRUE; /* values are signed */ - i32_type.norm = FALSE; /* values are not normalized */ - i32_type.width = 32; /* 32-bit int values */ - i32_type.length = 4; /* 4 elements per vector */ - return lp_build_const_int_vec(i32_type, value); + return mask; } @@ -348,7 +232,7 @@ build_int32_vec_const(int value) /** * Generate the fragment shader, depth/stencil test, and alpha tests. 
* \param i which quad in the tile, in range [0,3] - * \param do_tri_test if 1, do triangle edge in/out testing + * \param partial_mask if 1, do mask_input testing */ static void generate_fs(struct llvmpipe_context *lp, @@ -364,13 +248,8 @@ generate_fs(struct llvmpipe_context *lp, LLVMValueRef (*color)[4], LLVMValueRef depth_ptr, LLVMValueRef facing, - unsigned do_tri_test, - LLVMValueRef c0, - LLVMValueRef c1, - LLVMValueRef c2, - LLVMValueRef step0_ptr, - LLVMValueRef step1_ptr, - LLVMValueRef step2_ptr, + unsigned partial_mask, + LLVMValueRef mask_input, LLVMValueRef counter) { const struct tgsi_token *tokens = shader->base.tokens; @@ -411,23 +290,17 @@ generate_fs(struct llvmpipe_context *lp, lp_build_flow_scope_declare(flow, &z); /* do triangle edge testing */ - if (do_tri_test) { - generate_tri_edge_mask(builder, i, pmask, - c0, c1, c2, step0_ptr, step1_ptr, step2_ptr); + if (partial_mask) { + *pmask = generate_quad_mask(builder, type, + i, mask_input); } else { - *pmask = build_int32_vec_const(~0); + *pmask = lp_build_const_int_vec(type, ~0); } /* 'mask' will control execution based on quad's pixel alive/killed state */ lp_build_mask_begin(&mask, flow, type, *pmask); - if (key->scissor) { - LLVMValueRef smask = - generate_scissor_test(builder, context_ptr, interp, type); - lp_build_mask_update(&mask, smask); - } - early_depth_stencil_test = (key->depth.enabled || key->stencil[0].enabled) && !key->alpha.enabled && @@ -579,7 +452,7 @@ static void generate_fragment(struct llvmpipe_context *lp, struct lp_fragment_shader *shader, struct lp_fragment_shader_variant *variant, - unsigned do_tri_test) + unsigned partial_mask) { struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); const struct lp_fragment_shader_variant_key *key = &variant->key; @@ -589,9 +462,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMTypeRef fs_elem_type; LLVMTypeRef fs_int_vec_type; LLVMTypeRef blend_vec_type; - LLVMTypeRef arg_types[16]; + LLVMTypeRef arg_types[11]; 
LLVMTypeRef func_type; - LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type(); LLVMValueRef context_ptr; LLVMValueRef x; LLVMValueRef y; @@ -600,7 +472,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef dady_ptr; LLVMValueRef color_ptr_ptr; LLVMValueRef depth_ptr; - LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr, counter = NULL; + LLVMValueRef mask_input; + LLVMValueRef counter = NULL; LLVMBasicBlockRef block; LLVMBuilderRef builder; struct lp_build_sampler_soa *sampler; @@ -645,7 +518,7 @@ generate_fragment(struct llvmpipe_context *lp, blend_vec_type = lp_build_vec_type(blend_type); util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", - shader->no, variant->no, do_tri_test ? "edge" : "whole"); + shader->no, variant->no, partial_mask ? "partial" : "whole"); arg_types[0] = screen->context_ptr_type; /* context */ arg_types[1] = LLVMInt32Type(); /* x */ @@ -656,23 +529,15 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */ arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */ arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ - arg_types[9] = LLVMInt32Type(); /* c0 */ - arg_types[10] = LLVMInt32Type(); /* c1 */ - arg_types[11] = LLVMInt32Type(); /* c2 */ - /* Note: the step arrays are built as int32[16] but we interpret - * them here as int32_vec4[4]. 
- */ - arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step0 */ - arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step1 */ - arg_types[14] = LLVMPointerType(int32_vec4_type, 0);/* step2 */ - arg_types[15] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ + arg_types[9] = LLVMInt32Type(); /* mask_input */ + arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); function = LLVMAddFunction(screen->module, func_name, func_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); - variant->function[do_tri_test] = function; + variant->function[partial_mask] = function; /* XXX: need to propagate noalias down into color param now we are @@ -691,12 +556,7 @@ generate_fragment(struct llvmpipe_context *lp, dady_ptr = LLVMGetParam(function, 6); color_ptr_ptr = LLVMGetParam(function, 7); depth_ptr = LLVMGetParam(function, 8); - c0 = LLVMGetParam(function, 9); - c1 = LLVMGetParam(function, 10); - c2 = LLVMGetParam(function, 11); - step0_ptr = LLVMGetParam(function, 12); - step1_ptr = LLVMGetParam(function, 13); - step2_ptr = LLVMGetParam(function, 14); + mask_input = LLVMGetParam(function, 9); lp_build_name(context_ptr, "context"); lp_build_name(x, "x"); @@ -706,15 +566,10 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(dady_ptr, "dady"); lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); - lp_build_name(c0, "c0"); - lp_build_name(c1, "c1"); - lp_build_name(c2, "c2"); - lp_build_name(step0_ptr, "step0"); - lp_build_name(step1_ptr, "step1"); - lp_build_name(step2_ptr, "step2"); + lp_build_name(mask_input, "mask_input"); if (key->occlusion_count) { - counter = LLVMGetParam(function, 15); + counter = LLVMGetParam(function, 10); lp_build_name(counter, "counter"); } @@ -763,9 +618,9 @@ generate_fragment(struct llvmpipe_context *lp, out_color, depth_ptr_i, facing, - do_tri_test, - c0, c1, c2, - step0_ptr, step1_ptr, step2_ptr, 
counter); + partial_mask, + mask_input, + counter); for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) for(chan = 0; chan < NUM_CHANNELS; ++chan) @@ -792,9 +647,13 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); } - lp_build_conv_mask(builder, fs_type, blend_type, - fs_mask, num_fs, - &blend_mask, 1); + if (partial_mask || !variant->opaque) { + lp_build_conv_mask(builder, fs_type, blend_type, + fs_mask, num_fs, + &blend_mask, 1); + } else { + blend_mask = lp_build_const_int_vec(blend_type, ~0); + } color_ptr = LLVMBuildLoad(builder, LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""), @@ -832,8 +691,7 @@ generate_fragment(struct llvmpipe_context *lp, #endif /* Apply optimizations to LLVM IR */ - if (1) - LLVMRunFunctionPassManager(screen->pass, function); + LLVMRunFunctionPassManager(screen->pass, function); if (gallivm_debug & GALLIVM_DEBUG_IR) { /* Print the LLVM IR to stderr */ @@ -847,7 +705,7 @@ generate_fragment(struct llvmpipe_context *lp, { void *f = LLVMGetPointerToGlobal(screen->engine, function); - variant->jit_function[do_tri_test] = (lp_jit_frag_func)pointer_to_func(f); + variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f); if (gallivm_debug & GALLIVM_DEBUG_ASM) { lp_disassemble(f); @@ -963,7 +821,6 @@ generate_variant(struct llvmpipe_context *lp, !key->stencil[0].enabled && !key->alpha.enabled && !key->depth.enabled && - !key->scissor && !shader->info.uses_kill ? 
TRUE : FALSE; @@ -1182,7 +1039,6 @@ make_variant_key(struct llvmpipe_context *lp, /* alpha.ref_value is passed in jit_context */ key->flatshade = lp->rasterizer->flatshade; - key->scissor = lp->rasterizer->scissor; if (lp->active_query_count) { key->occlusion_count = TRUE; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 593cd4d..37900fc 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -54,7 +54,6 @@ struct lp_fragment_shader_variant_key enum pipe_format zsbuf_format; unsigned nr_cbufs:8; unsigned flatshade:1; - unsigned scissor:1; unsigned occlusion_count:1; struct { -- 2.7.4