radeonsi: completely rework updating descriptors without CP DMA
author Marek Olšák <marek.olsak@amd.com>
Fri, 24 Jul 2015 22:53:16 +0000 (00:53 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Fri, 31 Jul 2015 14:49:16 +0000 (16:49 +0200)
The patch itself contains a more detailed explanation (and a condensed sketch follows the file list below). Just a summary here:
- The CPU always uploads a whole descriptor array to previously-unused memory.
- CP DMA isn't used.
- No caches need to be flushed.
- All descriptors are always up-to-date in memory even after a hang, because
  CP DMA doesn't serve as a middle man to update them.

This should bring:
- better hang recovery (descriptors are always up-to-date)
- better GPU performance (no KCACHE and TC flushes)
- worse CPU performance for partial updates (only whole arrays are uploaded)
- less used IB space (no CP_DMA and WRITE_DATA packets)
- simpler code
- hopefully, some of the corruption issues with SI cards will go away.
  If not, we'll know the issue is not here.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c
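
For orientation before reading the diff, here is a condensed sketch of the new scheme. It reuses the types, fields, and helpers introduced or used by this patch (struct si_descriptors, u_upload_alloc); the function names sketch_bind_sampler_view and sketch_upload_list are illustrative only, and error/endian handling is simplified (the real code uses util_memcpy_cpu_to_le32 and also adds a buffer relocation).

/* Illustrative sketch, not the literal driver code; assumes the driver's
 * headers (si_pipe.h, util/u_upload_mgr.h) are included. */

/* Binding a resource only touches the malloc'd CPU copy and marks the
 * whole list dirty. Nothing is written to GPU memory at bind time. */
static void sketch_bind_sampler_view(struct si_sampler_views *views,
                                     unsigned slot, const uint32_t *view_desc)
{
        /* Image descriptors are 8 dwords each. */
        memcpy(views->desc.list + slot * 8, view_desc, 8 * 4);
        views->desc.list_dirty = true;
}

/* At draw time, a dirty list is uploaded as a whole to previously-unused
 * memory, so the copy read by in-flight IBs is never overwritten and no
 * KCACHE/TC flush is needed. The new GPU address forces the user-data
 * pointer to be re-emitted. */
static bool sketch_upload_list(struct si_context *sctx,
                               struct si_descriptors *desc)
{
        unsigned size = desc->num_elements * desc->element_dw_size * 4;
        void *ptr;

        if (!desc->list_dirty)
                return true;

        u_upload_alloc(sctx->b.uploader, 0, size, &desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, &ptr);
        if (!desc->buffer)
                return false;   /* out of memory: skip the draw call */

        memcpy(ptr, desc->list, size);  /* patch uses util_memcpy_cpu_to_le32 */

        desc->list_dirty = false;
        desc->pointer_dirty = true;     /* re-emit the pointer via SET_SH_REG */
        return true;
}

The CPU-cost trade-off named in the summary follows directly from sketch_upload_list: even a single-slot change re-uploads the whole list, but the GPU never sees a partially updated array.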

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 14bb6e1..48ec9b7 100644
  *      Marek Olšák <marek.olsak@amd.com>
  */
 
-/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
- * live in memory on SI.
+/* Resource binding slots and sampler states (each described with 8 or
+ * 4 dwords) are stored in lists in memory which is accessed by shaders
+ * using scalar load instructions.
  *
- * This file is responsible for managing lists of resources and sampler states
- * in memory and binding them, which means updating those structures in memory.
+ * This file is responsible for managing such lists. It keeps a copy of all
+ * descriptors in CPU memory and re-uploads a whole list if some slots have
+ * been changed.
  *
- * There is also code for updating shader pointers to resources and sampler
- * states. CP DMA functions are here too.
+ * This code is also responsible for updating shader pointers to those lists.
+ *
+ * Note that CP DMA can't be used for updating the lists, because a GPU hang
+ * could leave the list in a mid-IB state and the next IB would get wrong
+ * descriptors and the whole context would be unusable at that point.
+ * (Note: Register shadowing can't be used for the same reason.)
+ *
+ * Also, uploading descriptors to newly allocated memory doesn't require
+ * a KCACHE flush.
  */
 
 #include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-#define SI_NUM_CONTEXTS 16
 
 /* NULL image and buffer descriptor.
  *
@@ -139,159 +147,62 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
        }
 }
 
-static void si_init_descriptors(struct si_context *sctx,
-                               struct si_descriptors *desc,
+static void si_init_descriptors(struct si_descriptors *desc,
                                unsigned shader_userdata_index,
                                unsigned element_dw_size,
-                               unsigned num_elements,
-                               void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
+                               unsigned num_elements)
 {
+       int i;
+
        assert(num_elements <= sizeof(desc->enabled_mask)*8);
-       assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
-       desc->atom.emit = (void*)emit_func;
-       desc->shader_userdata_offset = shader_userdata_index * 4;
+       desc->list = CALLOC(num_elements, element_dw_size * 4);
        desc->element_dw_size = element_dw_size;
        desc->num_elements = num_elements;
-       desc->context_size = num_elements * element_dw_size * 4;
-
-       desc->buffer = (struct r600_resource*)
-               pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-                                  PIPE_USAGE_DEFAULT,
-                                  SI_NUM_CONTEXTS * desc->context_size);
-
-       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
-                             RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
+       desc->list_dirty = true; /* upload the list before the next draw */
+       desc->shader_userdata_offset = shader_userdata_index * 4;
 
-       /* We don't check for CS space here, because this should be called
-        * only once at context initialization. */
-       si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
-                                   desc->buffer->b.b.width0, 0,
-                                   R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
+       /* Initialize the array to NULL descriptors if the element size is 8. */
+       if (element_dw_size == 8)
+               for (i = 0; i < num_elements; i++)
+                       memcpy(desc->list + i*element_dw_size, null_descriptor,
+                              sizeof(null_descriptor));
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
        pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+       FREE(desc->list);
 }
 
-static void si_update_descriptors(struct si_context *sctx,
+static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc)
 {
-       if (desc->dirty_mask) {
-               desc->atom.num_dw =
-                       7 + /* copy */
-                       (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
-
-               desc->atom.dirty = true;
-               desc->pointer_dirty = true;
-               sctx->shader_userdata.atom.dirty = true;
-
-               /* TODO: Investigate if these flushes can be removed after
-                * adding CE support. */
-
-               /* The descriptors are read with the K cache. */
-               sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
-
-               /* Since SI uses uncached CP DMA to update descriptors,
-                * we have to flush TC L2, which is used to fetch constants
-                * along with KCACHE. */
-               if (sctx->b.chip_class == SI)
-                       sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
-       } else {
-               desc->atom.dirty = false;
-       }
-}
+       unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+       void *ptr;
 
-static void si_emit_descriptors(struct si_context *sctx,
-                               struct si_descriptors *desc,
-                               uint32_t **descriptors)
-{
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-       uint64_t va_base;
-       int packet_start = 0;
-       int packet_size = 0;
-       int last_index = desc->num_elements; /* point to a non-existing element */
-       uint64_t dirty_mask = desc->dirty_mask;
-       unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
-
-       assert(dirty_mask);
-
-       va_base = desc->buffer->gpu_address;
-
-       /* Copy the descriptors to a new context slot. */
-       si_emit_cp_dma_copy_buffer(sctx,
-                                  va_base + new_context_id * desc->context_size,
-                                  va_base + desc->current_context_id * desc->context_size,
-                                  desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
-
-       va_base += new_context_id * desc->context_size;
-
-       /* Update the descriptors.
-        * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
-        *
-        * XXX When unbinding lots of resources, consider clearing the memory
-        *     with CP DMA instead of emitting zeros.
-        */
-       while (dirty_mask) {
-               int i = u_bit_scan64(&dirty_mask);
-
-               assert(i < desc->num_elements);
+       if (!desc->list_dirty)
+               return true;
 
-               if (last_index+1 == i && packet_size) {
-                       /* Append new data at the end of the last packet. */
-                       packet_size += desc->element_dw_size;
-                       cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
-               } else {
-                       /* Start a new packet. */
-                       uint64_t va = va_base + i * desc->element_dw_size * 4;
-
-                       packet_start = cs->cdw;
-                       packet_size = 2 + desc->element_dw_size;
-
-                       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-                       radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
-                                               PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
-                                               PKT3_WRITE_DATA_DST_SEL_TC_L2) |
-                                            PKT3_WRITE_DATA_WR_CONFIRM |
-                                            PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
-                       radeon_emit(cs, va & 0xFFFFFFFFUL);
-                       radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
-               }
+       u_upload_alloc(sctx->b.uploader, 0, list_size,
+                      &desc->buffer_offset,
+                      (struct pipe_resource**)&desc->buffer, &ptr);
+       if (!desc->buffer)
+               return false; /* skip the draw call */
 
-               radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
+       util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-               last_index = i;
-       }
+       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+                             RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-       desc->dirty_mask = 0;
-       desc->current_context_id = new_context_id;
+       desc->list_dirty = false;
+       desc->pointer_dirty = true;
+       sctx->shader_userdata.atom.dirty = true;
+       return true;
 }
 
 /* SAMPLER VIEWS */
 
-static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
-{
-       struct si_sampler_views *views = (struct si_sampler_views*)atom;
-
-       si_emit_descriptors(sctx, &views->desc, views->desc_data);
-}
-
-static void si_init_sampler_views(struct si_context *sctx,
-                                 struct si_sampler_views *views)
-{
-       int i;
-
-       si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
-                           8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
-
-       for (i = 0; i < views->desc.num_elements; i++) {
-               views->desc_data[i] = null_descriptor;
-               views->desc.dirty_mask |= 1llu << i;
-       }
-       si_update_descriptors(sctx, &views->desc);
-}
-
 static void si_release_sampler_views(struct si_sampler_views *views)
 {
        int i;
@@ -332,6 +243,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
                                      si_get_resource_ro_priority(rview->resource));
        }
 
+       if (!views->desc.buffer)
+               return;
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -354,17 +267,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
                                rview->resource, RADEON_USAGE_READ,
                                si_get_resource_ro_priority(rview->resource));
 
-
                pipe_sampler_view_reference(&views->views[slot], view);
-               views->desc_data[slot] = view_desc;
+               memcpy(views->desc.list + slot*8, view_desc, 8*4);
                views->desc.enabled_mask |= 1llu << slot;
        } else {
                pipe_sampler_view_reference(&views->views[slot], NULL);
-               views->desc_data[slot] = null_descriptor;
+               memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
                views->desc.enabled_mask &= ~(1llu << slot);
        }
 
-       views->desc.dirty_mask |= 1llu << slot;
+       views->desc.list_dirty = true;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -423,22 +335,15 @@ static void si_set_sampler_views(struct pipe_context *ctx,
                                            NULL, NULL);
                }
        }
-
-       si_update_descriptors(sctx, &samplers->views.desc);
 }
 
 /* SAMPLER STATES */
 
-static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
-{
-       struct si_sampler_states *states = (struct si_sampler_states*)atom;
-
-       si_emit_descriptors(sctx, &states->desc, states->desc_data);
-}
-
 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
                                           struct si_sampler_states *states)
 {
+       if (!states->desc.buffer)
+               return;
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -460,64 +365,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
        for (i = 0; i < count; i++) {
                unsigned slot = start + i;
 
-               if (!sstates[i]) {
-                       samplers->desc.dirty_mask &= ~(1llu << slot);
+               if (!sstates[i])
                        continue;
-               }
 
-               samplers->desc_data[slot] = sstates[i]->val;
-               samplers->desc.dirty_mask |= 1llu << slot;
+               memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
+               samplers->desc.list_dirty = true;
        }
-
-       si_update_descriptors(sctx, &samplers->desc);
 }
 
 /* BUFFER RESOURCES */
 
-static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
-{
-       struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
-
-       si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
-}
-
-static void si_init_buffer_resources(struct si_context *sctx,
-                                    struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_buffer_resources *buffers,
                                     unsigned num_buffers,
                                     unsigned shader_userdata_index,
                                     enum radeon_bo_usage shader_usage,
                                     enum radeon_bo_priority priority)
 {
-       int i;
-
-       buffers->num_buffers = num_buffers;
        buffers->shader_usage = shader_usage;
        buffers->priority = priority;
        buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-       buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
 
-       /* si_emit_descriptors only accepts an array of arrays.
-        * This adds such an array. */
-       buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
-       for (i = 0; i < num_buffers; i++) {
-               buffers->desc_data[i] = &buffers->desc_storage[i*4];
-       }
-
-       si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
-                           num_buffers, si_emit_buffer_resources);
+       si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
+                           num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
 {
        int i;
 
-       for (i = 0; i < buffers->num_buffers; i++) {
+       for (i = 0; i < buffers->desc.num_elements; i++) {
                pipe_resource_reference(&buffers->buffers[i], NULL);
        }
 
        FREE(buffers->buffers);
-       FREE(buffers->desc_storage);
-       FREE(buffers->desc_data);
        si_release_descriptors(&buffers->desc);
 }
 
@@ -535,6 +415,8 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
                                      buffers->shader_usage, buffers->priority);
        }
 
+       if (!buffers->desc.buffer)
+               return;
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                              buffers->desc.buffer, RADEON_USAGE_READWRITE,
                              RADEON_PRIO_SHADER_DATA);
@@ -560,12 +442,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
                                      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
                                      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
        }
+
+       if (!desc->buffer)
+               return;
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                              desc->buffer, RADEON_USAGE_READ,
                              RADEON_PRIO_SHADER_DATA);
 }
 
-void si_update_vertex_buffers(struct si_context *sctx)
+static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
        struct si_descriptors *desc = &sctx->vertex_buffers;
        bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -573,8 +458,10 @@ void si_update_vertex_buffers(struct si_context *sctx)
        uint64_t va;
        uint32_t *ptr;
 
+       if (!sctx->vertex_buffers_dirty)
+               return true;
        if (!count || !sctx->vertex_elements)
-               return;
+               return true;
 
        /* Vertex buffer descriptors are the only ones which are uploaded
         * directly through a staging buffer and don't go through
@@ -582,13 +469,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
         */
        u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+       if (!desc->buffer)
+               return false;
 
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                              desc->buffer, RADEON_USAGE_READ,
                              RADEON_PRIO_SHADER_DATA);
 
        assert(count <= SI_NUM_VERTEX_BUFFERS);
-       assert(desc->current_context_id == 0);
 
        for (i = 0; i < count; i++) {
                struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -640,6 +528,8 @@ void si_update_vertex_buffers(struct si_context *sctx)
         * cache is needed. */
        desc->pointer_dirty = true;
        sctx->shader_userdata.atom.dirty = true;
+       sctx->vertex_buffers_dirty = false;
+       return true;
 }
 
 
@@ -664,7 +554,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
        if (shader >= SI_NUM_SHADERS)
                return;
 
-       assert(slot < buffers->num_buffers);
+       assert(slot < buffers->desc.num_elements);
        pipe_resource_reference(&buffers->buffers[slot], NULL);
 
        /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -691,7 +581,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
                }
 
                /* Set the descriptor. */
-               uint32_t *desc = buffers->desc_data[slot];
+               uint32_t *desc = buffers->desc.list + slot*4;
                desc[0] = va;
                desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
                          S_008F04_STRIDE(0);
@@ -710,12 +600,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
                buffers->desc.enabled_mask |= 1llu << slot;
        } else {
                /* Clear the descriptor. */
-               memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+               memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
                buffers->desc.enabled_mask &= ~(1llu << slot);
        }
 
-       buffers->desc.dirty_mask |= 1llu << slot;
-       si_update_descriptors(sctx, &buffers->desc);
+       buffers->desc.list_dirty = true;
 }
 
 /* RING BUFFERS */
@@ -735,7 +624,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
        /* The stride field in the resource descriptor has 14 bits */
        assert(stride < (1 << 14));
 
-       assert(slot < buffers->num_buffers);
+       assert(slot < buffers->desc.num_elements);
        pipe_resource_reference(&buffers->buffers[slot], NULL);
 
        if (buffer) {
@@ -780,7 +669,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
                }
 
                /* Set the descriptor. */
-               uint32_t *desc = buffers->desc_data[slot];
+               uint32_t *desc = buffers->desc.list + slot*4;
                desc[0] = va;
                desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
                          S_008F04_STRIDE(stride) |
@@ -803,12 +692,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
                buffers->desc.enabled_mask |= 1llu << slot;
        } else {
                /* Clear the descriptor. */
-               memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+               memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
                buffers->desc.enabled_mask &= ~(1llu << slot);
        }
 
-       buffers->desc.dirty_mask |= 1llu << slot;
-       si_update_descriptors(sctx, &buffers->desc);
+       buffers->desc.list_dirty = true;
 }
 
 /* STREAMOUT BUFFERS */
@@ -870,7 +758,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
                        uint64_t va = r600_resource(buffer)->gpu_address;
 
                        /* Set the descriptor. */
-                       uint32_t *desc = buffers->desc_data[bufidx];
+                       uint32_t *desc = buffers->desc.list + bufidx*4;
                        desc[0] = va;
                        desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
                        desc[2] = 0xffffffff;
@@ -888,24 +776,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
                        buffers->desc.enabled_mask |= 1llu << bufidx;
                } else {
                        /* Clear the descriptor and unset the resource. */
-                       memset(buffers->desc_data[bufidx], 0,
+                       memset(buffers->desc.list + bufidx*4, 0,
                               sizeof(uint32_t) * 4);
                        pipe_resource_reference(&buffers->buffers[bufidx],
                                                NULL);
                        buffers->desc.enabled_mask &= ~(1llu << bufidx);
                }
-               buffers->desc.dirty_mask |= 1llu << bufidx;
        }
        for (; i < old_num_targets; i++) {
                bufidx = SI_SO_BUF_OFFSET + i;
                /* Clear the descriptor and unset the resource. */
-               memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+               memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
                pipe_resource_reference(&buffers->buffers[bufidx], NULL);
                buffers->desc.enabled_mask &= ~(1llu << bufidx);
-               buffers->desc.dirty_mask |= 1llu << bufidx;
        }
 
-       si_update_descriptors(sctx, &buffers->desc);
+       buffers->desc.list_dirty = true;
 }
 
 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -974,22 +860,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
        /* Read/Write buffers. */
        for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
                struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
-               bool found = false;
                uint64_t mask = buffers->desc.enabled_mask;
 
                while (mask) {
                        i = u_bit_scan64(&mask);
                        if (buffers->buffers[i] == buf) {
-                               si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+                               si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
                                                            old_va, buf);
+                               buffers->desc.list_dirty = true;
 
                                r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                                                      rbuffer, buffers->shader_usage,
                                                      buffers->priority);
 
-                               buffers->desc.dirty_mask |= 1llu << i;
-                               found = true;
-
                                if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
                                        /* Update the streamout state. */
                                        if (sctx->b.streamout.begin_emitted) {
@@ -1001,34 +884,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
                                }
                        }
                }
-               if (found) {
-                       si_update_descriptors(sctx, &buffers->desc);
-               }
        }
 
        /* Constant buffers. */
        for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
                struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-               bool found = false;
                uint64_t mask = buffers->desc.enabled_mask;
 
                while (mask) {
                        unsigned i = u_bit_scan64(&mask);
                        if (buffers->buffers[i] == buf) {
-                               si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+                               si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
                                                            old_va, buf);
+                               buffers->desc.list_dirty = true;
 
                                r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                                                      rbuffer, buffers->shader_usage,
                                                      buffers->priority);
-
-                               buffers->desc.dirty_mask |= 1llu << i;
-                               found = true;
                        }
                }
-               if (found) {
-                       si_update_descriptors(sctx, &buffers->desc);
-               }
        }
 
        /* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1040,23 +914,20 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
        /* Texture buffers - update bindings. */
        for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
                struct si_sampler_views *views = &sctx->samplers[shader].views;
-               bool found = false;
                uint64_t mask = views->desc.enabled_mask;
 
                while (mask) {
                        unsigned i = u_bit_scan64(&mask);
                        if (views->views[i]->texture == buf) {
+                               si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
+                                                           old_va, buf);
+                               views->desc.list_dirty = true;
+
                                r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
                                                      rbuffer, RADEON_USAGE_READ,
                                                      RADEON_PRIO_SHADER_BUFFER_RO);
-
-                               views->desc.dirty_mask |= 1llu << i;
-                               found = true;
                        }
                }
-               if (found) {
-                       si_update_descriptors(sctx, &views->desc);
-               }
        }
 }
 
@@ -1297,11 +1168,10 @@ static void si_emit_shader_pointer(struct si_context *sctx,
        struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
        uint64_t va;
 
-       if (!desc->pointer_dirty)
+       if (!desc->pointer_dirty || !desc->buffer)
                return;
 
        va = desc->buffer->gpu_address +
-            desc->current_context_id * desc->context_size +
             desc->buffer_offset;
 
        radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
@@ -1351,34 +1221,28 @@ static void si_emit_shader_userdata(struct si_context *sctx,
        si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
 
-/* INIT/DEINIT */
+/* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
        int i;
 
        for (i = 0; i < SI_NUM_SHADERS; i++) {
-               si_init_buffer_resources(sctx, &sctx->const_buffers[i],
+               si_init_buffer_resources(&sctx->const_buffers[i],
                                         SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
                                         RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
-               si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
+               si_init_buffer_resources(&sctx->rw_buffers[i],
                                         SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
                                         RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-               si_init_sampler_views(sctx, &sctx->samplers[i].views);
-
-               si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-                                   SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
-                                   si_emit_sampler_states);
-
-               sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
-               sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
-               sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
-               sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
+               si_init_descriptors(&sctx->samplers[i].views.desc,
+                                   SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+               si_init_descriptors(&sctx->samplers[i].states.desc,
+                                   SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
        }
 
-       si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
-                           4, SI_NUM_VERTEX_BUFFERS, NULL);
+       si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+                           4, SI_NUM_VERTEX_BUFFERS);
 
        /* Set pipe_context functions. */
        sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1401,6 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx)
        si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
+bool si_upload_shader_descriptors(struct si_context *sctx)
+{
+       int i;
+
+       for (i = 0; i < SI_NUM_SHADERS; i++) {
+               if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
+                   !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+                   !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+                   !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
+                       return false;
+       }
+       return si_upload_vertex_buffer_descriptors(sctx);
+}
+
 void si_release_all_descriptors(struct si_context *sctx)
 {
        int i;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7b2263b..28cb4e9 100644
@@ -142,12 +142,6 @@ struct si_context {
        union {
                struct {
                        /* The order matters. */
-                       struct r600_atom *const_buffers[SI_NUM_SHADERS];
-                       struct r600_atom *rw_buffers[SI_NUM_SHADERS];
-                       struct r600_atom *sampler_views[SI_NUM_SHADERS];
-                       struct r600_atom *sampler_states[SI_NUM_SHADERS];
-                       /* Caches must be flushed after resource descriptors are
-                        * updated in memory. */
                        struct r600_atom *cache_flush;
                        struct r600_atom *streamout_begin;
                        struct r600_atom *streamout_enable; /* must be after streamout_begin */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index e4d859a..e6bacdf 100644
@@ -158,60 +158,48 @@ struct si_shader_data {
 #define SI_NUM_VERTEX_BUFFERS  16
 
 
-/* This represents resource descriptors in memory, such as buffer resources,
+/* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-       struct r600_atom atom;
-
-       /* The size of one resource descriptor. */
+       /* The list of descriptors in malloc'd memory. */
+       uint32_t *list;
+       /* The size of one descriptor. */
        unsigned element_dw_size;
-       /* The maximum number of resource descriptors. */
+       /* The maximum number of descriptors. */
        unsigned num_elements;
+       /* Whether the list has been changed and should be re-uploaded. */
+       bool list_dirty;
 
-       /* The buffer where resource descriptors are stored. */
+       /* The buffer where the descriptors have been uploaded. */
        struct r600_resource *buffer;
        unsigned buffer_offset;
 
-       /* The i-th bit is set if that element is dirty (changed but not emitted). */
-       uint64_t dirty_mask;
        /* The i-th bit is set if that element is enabled (non-NULL resource). */
        uint64_t enabled_mask;
 
-       /* We can't update descriptors directly because the GPU might be
-        * reading them at the same time, so we have to update them
-        * in a copy-on-write manner. Each such copy is called a context,
-        * which is just another array descriptors in the same buffer. */
-       unsigned current_context_id;
-       /* The size of a context, should be equal to 4*element_dw_size*num_elements. */
-       unsigned context_size;
-
        /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
         * array will be stored. */
        unsigned shader_userdata_offset;
+       /* Whether the pointer should be re-emitted. */
        bool pointer_dirty;
 };
 
 struct si_sampler_views {
        struct si_descriptors           desc;
        struct pipe_sampler_view        *views[SI_NUM_SAMPLER_VIEWS];
-       uint32_t                        *desc_data[SI_NUM_SAMPLER_VIEWS];
 };
 
 struct si_sampler_states {
        struct si_descriptors           desc;
-       uint32_t                        *desc_data[SI_NUM_SAMPLER_STATES];
        void                            *saved_states[2]; /* saved for u_blitter */
 };
 
 struct si_buffer_resources {
        struct si_descriptors           desc;
-       unsigned                        num_buffers;
        enum radeon_bo_usage            shader_usage; /* READ, WRITE, or READWRITE */
        enum radeon_bo_priority         priority;
        struct pipe_resource            **buffers; /* this has num_buffers elements */
-       uint32_t                        *desc_storage; /* this has num_buffers*4 elements */
-       uint32_t                        **desc_data; /* an array of pointers pointing to desc_storage */
 };
 
 #define si_pm4_block_idx(member) \
@@ -247,13 +235,13 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
                                unsigned start, unsigned count, void **states);
-void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
                        struct pipe_resource *buffer,
                        unsigned stride, unsigned num_records,
                        bool add_tid, bool swizzle,
                        unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
+bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
 void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ec8dd84..e8faf40 100644
@@ -743,11 +743,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                sctx->current_rast_prim = info->mode;
 
        si_update_shaders(sctx);
-
-       if (sctx->vertex_buffers_dirty) {
-               si_update_vertex_buffers(sctx);
-               sctx->vertex_buffers_dirty = false;
-       }
+       if (!si_upload_shader_descriptors(sctx))
+               return;
 
        if (info->indexed) {
                /* Initialize the index buffer struct. */