From: Eric Anholt Date: Sat, 1 Dec 2012 00:34:09 +0000 (-0800) Subject: i965: Ask the register allocator to round-robin through registers. X-Git-Tag: mesa-9.2.1~1948 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b6e9b54d066d29d250c4502e0005b317589dd8b3;p=platform%2Fupstream%2Fmesa.git i965: Ask the register allocator to round-robin through registers. The way we were allocating registers before, packing into low register numbers for Ironlake, resulted in an overly-constrained dependency graph for instruction scheduling. Improves GLBenchmark 2.1 performance by 4.5% +/- 0.7% (n=26). No difference on my old GLSL demo (n=20). No difference on nexuiz (n=15). v2: Fix off-by-one bug that made the change only work for 16-wide on i965. Reviewed-by: Matt Turner Reviewed-by: Kenneth Graunke --- diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 4ee7bbc..b9b0303 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -108,6 +108,8 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width) uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count); struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count); + if (intel->gen >= 6) + ra_set_allocate_round_robin(regs); int *classes = ralloc_array(brw, int, class_count); int aligned_pairs_class = -1; diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c index a9064c3..2c826fc 100644 --- a/src/mesa/program/register_allocate.c +++ b/src/mesa/program/register_allocate.c @@ -70,6 +70,7 @@ * this during ra_set_finalize(). 
*/ +#include <stdbool.h> #include <ralloc.h> #include "main/imports.h" @@ -93,6 +94,8 @@ struct ra_regs { struct ra_class **classes; unsigned int class_count; + + bool round_robin; }; struct ra_class { @@ -185,6 +188,22 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count) return regs; } +/** + * The register allocator by default prefers to allocate low register numbers, + * since it was written for hardware (gen4/5 Intel) that is limited in its + * multithreadedness by the number of registers used in a given shader. + * + * However, for hardware without that restriction, densely packed register + * allocation can put serious constraints on instruction scheduling. This + * function tells the allocator to rotate around the registers if possible as + * it allocates the nodes. + */ +void +ra_set_allocate_round_robin(struct ra_regs *regs) +{ + regs->round_robin = true; +} + static void ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2) { @@ -436,16 +455,19 @@ GLboolean ra_select(struct ra_graph *g) { int i; + int start_search_reg = 0; while (g->stack_count != 0) { - unsigned int r; + unsigned int ri; + unsigned int r = -1; int n = g->stack[g->stack_count - 1]; struct ra_class *c = g->regs->classes[g->nodes[n].class]; /* Find the lowest-numbered reg which is not used by a member * of the graph adjacent to us. 
*/ - for (r = 0; r < g->regs->count; r++) { + for (ri = 0; ri < g->regs->count; ri++) { + r = (start_search_reg + ri) % g->regs->count; if (!c->regs[r]) continue; @@ -461,12 +483,15 @@ ra_select(struct ra_graph *g) if (i == g->nodes[n].adjacency_count) break; } - if (r == g->regs->count) + if (ri == g->regs->count) return GL_FALSE; g->nodes[n].reg = r; g->nodes[n].in_stack = GL_FALSE; g->stack_count--; + + if (g->regs->round_robin) + start_search_reg = r + 1; } return GL_TRUE; diff --git a/src/mesa/program/register_allocate.h b/src/mesa/program/register_allocate.h index 2a9d611..fa119e3 100644 --- a/src/mesa/program/register_allocate.h +++ b/src/mesa/program/register_allocate.h @@ -37,6 +37,7 @@ struct ra_regs; * two real registers from which they are composed. */ struct ra_regs *ra_alloc_reg_set(void *mem_ctx, unsigned int count); +void ra_set_allocate_round_robin(struct ra_regs *regs); unsigned int ra_alloc_reg_class(struct ra_regs *regs); void ra_add_reg_conflict(struct ra_regs *regs, unsigned int r1, unsigned int r2);