nvfx: simplify and correct fragment program update logic
authorLuca Barbieri <luca@luca-barbieri.com>
Sun, 22 Aug 2010 10:02:41 +0000 (12:02 +0200)
committerLuca Barbieri <luca@luca-barbieri.com>
Sun, 22 Aug 2010 13:42:17 +0000 (15:42 +0200)
This version should hopefully be much clearer and thus less likely
to be subtly broken.

Also fixes point sprites on nv40 and possibly some other bugs too.

src/gallium/drivers/nvfx/nvfx_context.c
src/gallium/drivers/nvfx/nvfx_context.h
src/gallium/drivers/nvfx/nvfx_fragprog.c
src/gallium/drivers/nvfx/nvfx_state.h
src/gallium/drivers/nvfx/nvfx_state_emit.c
src/gallium/drivers/nvfx/nvfx_vertprog.c

index 8e85201..e78fc14 100644 (file)
@@ -90,6 +90,8 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 
        /* set these to that we init them on first validation */
        nvfx->state.scissor_enabled = ~0;
+       nvfx->hw_pointsprite_control = -1;
+       nvfx->hw_vp_output = -1;
        nvfx->use_vertex_buffers = -1;
 
        LIST_INITHEAD(&nvfx->render_cache);
index 63fbce8..fb4a9da 100644 (file)
@@ -193,6 +193,8 @@ struct nvfx_context {
        uint32_t hw_txf[8];
        struct nvfx_render_target hw_rt[4];
        struct nvfx_render_target hw_zeta;
+       int hw_pointsprite_control;
+       int hw_vp_output;
 };
 
 static INLINE struct nvfx_context *
index 025989a..e40a814 100644 (file)
@@ -1101,6 +1101,35 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 #endif
 }
 
+/* The hardware only supports immediate constants inside the fragment program,
+ * and at least on nv30 doesn't support an indirect linkage table.
+ *
+ * Hence, we need to patch the fragment program itself both to update constants
+ * and update linkage.
+ *
+ * Using a single fragment program would entail unacceptable stalls if the GPU is
+ * already rendering with that fragment program.
+ * Thus, we instead use a "rotating queue" of buffer objects, each of which is
+ * packed with multiple versions of the same program.
+ *
+ * Whenever we need to patch something, we move to the next program and
+ * patch it. If all buffer objects are in use by the GPU, we allocate another one,
+ * expanding the queue.
+ *
+ * As an additional optimization, we record when all the programs have the
+ * current input slot configuration, and at that point we stop patching inputs.
+ * This happens, for instance, if a given fragment program is always used with
+ * the same vertex program (i.e. always with GLSL), or if the layouts match
+ * enough (non-GLSL).
+ *
+ * Note that instead of using multiple programs, we could push commands
+ * on the FIFO to patch a single program: it's not fully clear which option is
+ * faster, but my guess is that the current way is faster.
+ *
+ * We also track the previous slot assignments for each version and don't
+ * patch if they are the same (this could perhaps be removed).
+ */
+
 void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
@@ -1109,6 +1138,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
        int update = 0;
        struct nvfx_vertex_program* vp;
        unsigned sprite_coord_enable;
+       boolean update_pointsprite = !!(nvfx->dirty & NVFX_NEW_FRAGPROG);
 
        if (!fp->translated)
        {
@@ -1141,80 +1171,96 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
                fp->bo_prog_idx = fp->progs_per_bo - 1;
        }
 
-       /* we must update constants even on "just" fragprog changes, because
-          we don't check whether the current constant buffer matches the latest
-          one bound to this fragment program */
-       if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
-               update = TRUE;
-
        vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
-       if (fp->last_vp_id != vp->id) {
-               char* vp_sem_table = vp->generic_to_fp_input;
-               unsigned char* fp_semantics = fp->slot_to_generic;
-               unsigned diff = 0;
-               unsigned char* cur_slots;
-               fp->last_vp_id = nvfx->vertprog->id;
-               cur_slots = fp->slot_to_fp_input;
-               for(unsigned i = 0; i < fp->num_slots; ++i) {
-                       unsigned char slot_mask = vp_sem_table[fp_semantics[i]];
-                       diff |= (slot_mask >> 4) & (slot_mask ^ cur_slots[i]);
-               }
+        sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
+
+       if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
+               int sprite_input = -1;
+               unsigned i;
+               fp->last_vp_id = vp->id;
+               fp->last_sprite_coord_enable = sprite_coord_enable;
 
-               if(diff)
+               if(sprite_coord_enable)
                {
-                       for(unsigned i = 0; i < fp->num_slots; ++i) {
-                               /* if 0xff, then this will write to the dummy value at fp->last_layout_mask[0] */
-                               fp->slot_to_fp_input[i] = vp_sem_table[fp_semantics[i]] & 0xf;
-                               //printf("fp: GENERIC[%i] from fpreg %i\n", fp_semantics[i], fp->slot_to_fp_input[i]);
+                       sprite_input = vp->sprite_fp_input;
+                       if(sprite_input < 0)
+                       {
+                               unsigned used_texcoords = 0;
+                               for(unsigned i = 0; i < fp->num_slots; ++i) {
+                                       unsigned generic = fp->slot_to_generic[i];
+                                       if(!((1 << generic) & sprite_coord_enable))
+                                       {
+                                               unsigned char slot_mask = vp->generic_to_fp_input[generic];
+                                               if(slot_mask >= 0xf0)
+                                                       used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
+                                       }
+                               }
+
+                               sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
                        }
 
-                       fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-                       update = TRUE;
+                       fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
                }
-       }
+               else
+                       fp->point_sprite_control = 0;
 
-       // last_sprite_coord_enable
-       sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
-       if(fp->last_sprite_coord_enable != sprite_coord_enable)
-       {
-               unsigned texcoord_mask = vp->texcoord_ouput_mask;
-               fp->last_sprite_coord_enable = sprite_coord_enable;
-               fp->point_sprite_control = 0;
-               for(unsigned i = 0; i < fp->num_slots; ++i) {
-                       if((1 << fp->slot_to_generic[i]) & sprite_coord_enable)
+               for(i = 0; i < fp->num_slots; ++i) {
+                       unsigned generic = fp->slot_to_generic[i];
+                       if((1 << generic) & sprite_coord_enable)
                        {
-                               unsigned fpin = fp->slot_to_fp_input[i];
-                               //printf("sprite: slot %i generic %i had texcoord %i\n", i, fp->slot_to_generic[i], fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-                               if(fpin >= 0x0f)
-                               {
-                                       unsigned tc = __builtin_ctz(~texcoord_mask);
-                                       texcoord_mask |= (1 << tc);
-                                       fp->slot_to_fp_input[i] = fpin = NVFX_FP_OP_INPUT_SRC_TC(tc);
-
-                                       fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-                                       update = TRUE;
-                               }
-                               //printf("sprite: slot %i texcoord %i\n", i, fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-                               fp->point_sprite_control |= (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+                               if(fp->slot_to_fp_input[i] != sprite_input)
+                                       goto update_slots;
                        }
                        else
                        {
-                               unsigned fpin = fp->slot_to_fp_input[i];
-                               if(!(vp->texcoord_ouput_mask & (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0))))
-                               {
-                                       fp->slot_to_fp_input[i] = 0x0f;
+                               unsigned char slot_mask = vp->generic_to_fp_input[generic];
+                               if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
+                                       goto update_slots;
+                       }
+               }
+
+               if(0)
+               {
+update_slots:
+                       /* optimization: we start updating from the slot we found the first difference in */
+                       for(; i < fp->num_slots; ++i)
+                       {
+                               unsigned generic = fp->slot_to_generic[i];
+                               if((1 << generic) & sprite_coord_enable)
+                                       fp->slot_to_fp_input[i] = sprite_input;
+                               else
+                                       fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
+                       }
 
-                                       fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-                                       update = TRUE;
+                       if(nvfx->is_nv4x)
+                       {
+                               fp->or = 0;
+                               for(i = 0; i < fp->num_slots; ++i) {
+                                       unsigned fp_input = fp->slot_to_fp_input[i];
+                                       if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
+                                               fp->or |= (1 << 12);
+                                       else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
+                                               fp->or |= (1 << 13);
+                                       else if(fp_input != 0xf)
+                                               fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
                                }
                        }
+
+                       fp->progs_left_with_obsolete_slot_assignments = fp->progs;
+                       goto update;
                }
        }
 
-       if(update) {
+       /* We must update constants even on "just" fragprog changes, because
+         * we don't check whether the current constant buffer matches the latest
+         * one bound to this fragment program.
+         * Doing such a check would likely be a pessimization.
+         */
+       if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) {
                int offset;
                uint32_t* fpmap;
 
+update:
                ++fp->bo_prog_idx;
                if(fp->bo_prog_idx >= fp->progs_per_bo)
                {
@@ -1278,6 +1324,9 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
                        }
                }
 
+               /* we only do this if we aren't sure that all program versions have the
+                * current slot assignments, otherwise we just update constants for speed
+                */
                if(fp->progs_left_with_obsolete_slot_assignments) {
                        unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
                        for(unsigned i = 0; i < fp->num_slots; ++i) {
@@ -1296,10 +1345,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
                        }
                        --fp->progs_left_with_obsolete_slot_assignments;
                }
-       }
 
-       if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
-               int offset = fp->bo_prog_idx * fp->prog_size;
                MARK_RING(chan, 8, 1);
                OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
                OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -1316,11 +1362,28 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
                }
        }
 
-       if(nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SPRITE))
        {
-               WAIT_RING(chan, 2);
-               OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
-               OUT_RING(chan, fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization);
+               unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
+               if(pointsprite_control != nvfx->hw_pointsprite_control)
+               {
+                       WAIT_RING(chan, 2);
+                       OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
+                       OUT_RING(chan, pointsprite_control);
+                       nvfx->hw_pointsprite_control = pointsprite_control;
+               }
+       }
+
+       if(nvfx->is_nv4x)
+       {
+               unsigned vp_output = vp->or | fp->or;
+
+               if(vp_output != nvfx->hw_vp_output)
+               {
+                       WAIT_RING(chan, 2);
+                       OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+                       OUT_RING(chan, vp_output);
+                       nvfx->hw_vp_output = vp_output;
+               }
        }
 }
 
index 1247abc..05d41cf 100644 (file)
@@ -32,7 +32,7 @@ struct nvfx_vertex_program {
        unsigned nr_consts;
 
        char generic_to_fp_input[256];
-       unsigned texcoord_ouput_mask;
+       int sprite_fp_input;
 
        struct nouveau_resource *exec;
        unsigned exec_start;
@@ -67,6 +67,7 @@ struct nvfx_fragment_program {
        boolean translated;
        unsigned samplers;
        unsigned point_sprite_control;
+       unsigned or;
 
        uint32_t *insn;
        int       insn_len;
index 8e3c342..bd89a38 100644 (file)
@@ -17,6 +17,8 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
        {
                nvfx->dirty = ~0;
                nvfx->hw_vtxelt_nr = 16;
+               nvfx->hw_pointsprite_control = -1;
+               nvfx->hw_vp_output = -1;
                nvfx->screen->cur_ctx = nvfx;
        }
 
index 3c3521e..806f263 100644 (file)
@@ -235,24 +235,10 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
                        dst.index = NVFX_VP(INST_DEST_PSZ);
                        break;
                default:
-                       if(!nvfx->is_nv4x) {
-                               switch (dst.index) {
-                               case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-                               case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-                               case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-                               case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-                               case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-                               case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-                               case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-                               case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-                               case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-                               case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-                               case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-                               case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-                               case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-                               case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-                               }
-                       } else {
+                       if(nvfx->is_nv4x) {
+                               /* we don't need vp->or on nv3x
+                                * texcoords are handled by fragment program
+                                */
                                switch (dst.index) {
                                case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
                                case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
@@ -260,14 +246,6 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
                                case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
                                case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
                                case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-                               case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-                               case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-                               case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-                               case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-                               case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-                               case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-                               case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-                               case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
                                }
                        }
                        break;
@@ -817,13 +795,21 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 
        /* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
        memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
-       vpc->vp->texcoord_ouput_mask = 0;
        for(int i = 0; i < 8; ++i) {
                if(sem_layout[i] == 0xff)
                        continue;
-               vpc->vp->texcoord_ouput_mask |= (1 << i);
                //printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
-               vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | (NVFX_FP_OP_INPUT_SRC_TC(0) + i);
+               vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
+       }
+
+       vpc->vp->sprite_fp_input = -1;
+       for(int i = 0; i < 8; ++i)
+       {
+               if(sem_layout[i] == 0xff)
+               {
+                       vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
+                       break;
+               }
        }
 
        tgsi_parse_init(&p, vpc->vp->pipe.tokens);
@@ -1233,13 +1219,12 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
        if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
        {
-               WAIT_RING(chan, 7);
+               WAIT_RING(chan, 6);
                OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
                OUT_RING(chan, vp->exec->start);
                if(nvfx->is_nv4x) {
-                       OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+                       OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
                        OUT_RING(chan, vp->ir);
-                       OUT_RING(chan, vp->or);
                }
                OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
                OUT_RING(chan, vp->clip_ctrl);