gallium,util: Pull u_indices and u_primconvert back into gallium
[platform/upstream/mesa.git] / src / mesa / vbo / vbo_save_api.c
index e2fd009..a41d518 100644 (file)
@@ -64,6 +64,39 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * This could be improved to fallback only when a mix of EvalCoord and
  * Vertex commands are issued within a single primitive.
+ *
+ * The compilation process works as follows. All vertex attributes
+ * except position are copied to vbo_save_context::attrptr (see ATTR_UNION).
+ * 'attrptr' are pointers to vbo_save_context::vertex ordered according to the enabled
+ * attributes (see upgrade_vertex).
+ * When the position attribute is received, all the attributes are then 
+ * copied to the vertex_store (see the end of ATTR_UNION).
+ * The vertex_store is simply an extensible float array.
+ * When the vertex list needs to be compiled (see compile_vertex_list),
+ * several transformations are performed:
+ *   - some primitives are merged together (eg: two consecutive GL_TRIANGLES
+ * with 3 vertices can be merged in a single GL_TRIANGLES with 6 vertices).
+ *   - an index buffer is built.
+ *   - identical vertices are detected and only one is kept.
+ * At the end of this transformation, the index buffer and the vertex buffer
+ * are uploaded in vRAM in the same buffer object.
+ * This buffer object is shared between multiple display lists to allow
+ * draw call merging later.
+ *
+ * The layout of this buffer for two display lists is:
+ *    V0A0|V0A1|V1A0|V1A1|P0I0|P0I1|V0A0V0A1V0A2|V1A0V1A1V1A2|...
+ *                                 ` new list starts
+ *        - VxAy: vertex x, attributes y
+ *        - PxIy: draw x, index y
+ *
+ * To allow draw call merging, display lists must use the same VAO, including
+ * the same Offset in the buffer object. To achieve this, the start values of
+ * the primitive are shifted and the indices adjusted (see offset_diff and
+ * start_offset in compile_vertex_list).
+ *
+ * Display lists using the loopback code (see vbo_save_playback_vertex_list_loopback)
+ * can't be drawn with an index buffer, so this transformation is disabled
+ * in this case.
  */
 
 
@@ -77,19 +110,20 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/draw_validate.h"
 #include "main/api_arrayelt.h"
-#include "main/vtxfmt.h"
 #include "main/dispatch.h"
 #include "main/state.h"
 #include "main/varray.h"
 #include "util/bitscan.h"
 #include "util/u_memory.h"
 #include "util/hash_table.h"
+#include "gallium/auxiliary/indices/u_indices.h"
+#include "util/u_prim.h"
 
 #include "gallium/include/pipe/p_state.h"
 
-#include "vbo_noop.h"
 #include "vbo_private.h"
-
+#include "api_exec_decl.h"
+#include "api_save.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -119,53 +153,21 @@ copy_vertices(struct gl_context *ctx,
    struct vbo_save_context *save = &vbo_context(ctx)->save;
    struct _mesa_prim *prim = &node->cold->prims[node->cold->prim_count - 1];
    GLuint sz = save->vertex_size;
-   const fi_type *src = src_buffer + prim->start * sz;
-   fi_type *dst = save->copied.buffer;
 
-   if (prim->end)
+   if (prim->end || !prim->count || !sz)
       return 0;
 
-   return vbo_copy_vertices(ctx, prim->mode, prim->start, &prim->count,
-                            prim->begin, sz, true, dst, src);
-}
-
-
-static struct vbo_save_vertex_store *
-alloc_vertex_store(struct gl_context *ctx, int vertex_count)
-{
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-   struct vbo_save_vertex_store *vertex_store =
-      CALLOC_STRUCT(vbo_save_vertex_store);
-
-   int size = MAX2(vertex_count * save->vertex_size, VBO_SAVE_BUFFER_SIZE);
-
-   /* obj->Name needs to be non-zero, but won't ever be examined more
-    * closely than that.  In particular these buffers won't be entered
-    * into the hash and can never be confused with ones visible to the
-    * user.  Perhaps there could be a special number for internal
-    * buffers:
-    */
-   vertex_store->buffer_in_ram_size = size * sizeof(GLfloat);
-   vertex_store->buffer_in_ram = malloc(vertex_store->buffer_in_ram_size);
-   save->out_of_memory = vertex_store->buffer_in_ram == NULL;
+   const fi_type *src = src_buffer + prim->start * sz;
+   assert(save->copied.buffer == NULL);
+   save->copied.buffer = malloc(sizeof(fi_type) * sz * prim->count);
 
-   if (save->out_of_memory) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "internal VBO allocation");
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
+   unsigned r = vbo_copy_vertices(ctx, prim->mode, prim->start, &prim->count,
+                                  prim->begin, sz, true, save->copied.buffer, src);
+   if (!r) {
+      free(save->copied.buffer);
+      save->copied.buffer = NULL;
    }
-
-   vertex_store->used = 0;
-
-   return vertex_store;
-}
-
-
-static void
-free_vertex_store(struct gl_context *ctx,
-                  struct vbo_save_vertex_store *vertex_store)
-{
-   free(vertex_store->buffer_in_ram);
-   free(vertex_store);
+   return r;
 }
 
 
@@ -174,11 +176,13 @@ realloc_prim_store(struct vbo_save_primitive_store *store, int prim_count)
 {
    if (store == NULL)
       store = CALLOC_STRUCT(vbo_save_primitive_store);
+
    uint32_t old_size = store->size;
-   store->size = MAX3(store->size, prim_count, VBO_SAVE_PRIM_SIZE);
+   store->size = prim_count;
+   assert (old_size < store->size);
    store->prims = realloc(store->prims, store->size * sizeof(struct _mesa_prim));
    memset(&store->prims[old_size], 0, (store->size - old_size) * sizeof(struct _mesa_prim));
-   store->used = 0;
+
    return store;
 }
 
@@ -188,18 +192,8 @@ reset_counters(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   save->buffer_map = save->vertex_store->buffer_in_ram + save->vertex_store->used;
-
-   assert(save->buffer_map == save->buffer_ptr);
-
-   if (save->vertex_size)
-      save->max_vert = (save->vertex_store->buffer_in_ram_size / sizeof(float) - save->vertex_store->used) /
-                        save->vertex_size;
-   else
-      save->max_vert = 0;
-
+   save->vertex_store->used = 0;
    save->prim_store->used = 0;
-   save->vert_count = 0;
    save->dangling_attr_ref = GL_FALSE;
 }
 
@@ -263,16 +257,14 @@ convert_line_loop_to_strip(struct vbo_save_context *save,
        */
       const GLuint sz = save->vertex_size;
       /* 0th vertex: */
-      const fi_type *src = save->buffer_map + prim->start * sz;
+      const fi_type *src = save->vertex_store->buffer_in_ram + prim->start * sz;
       /* end of buffer: */
-      fi_type *dst = save->buffer_map + (prim->start + prim->count) * sz;
+      fi_type *dst = save->vertex_store->buffer_in_ram + (prim->start + prim->count) * sz;
 
       memcpy(dst, src, sz * sizeof(float));
 
       prim->count++;
       node->cold->vertex_count++;
-      save->vert_count++;
-      save->buffer_ptr += sz;
       save->vertex_store->used += sz;
    }
 
@@ -400,29 +392,33 @@ update_vao(struct gl_context *ctx,
    _mesa_set_vao_immutable(ctx, *vao);
 }
 
+static void wrap_filled_vertex(struct gl_context *ctx);
 
+/* Grow the vertex storage to accommodate vertex_count new vertices */
 static void
-realloc_storage(struct gl_context *ctx, int prim_count, int vertex_count)
+grow_vertex_storage(struct gl_context *ctx, int vertex_count)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   if (vertex_count >= 0) {
-      /* Release old reference:
-       */
-      free_vertex_store(ctx, save->vertex_store);
-      save->vertex_store = NULL;
-      /* When we have a new vbo, we will for sure need a new vao */
-      for (gl_vertex_processing_mode vpm = 0; vpm < VP_MODE_MAX; ++vpm)
-         _mesa_reference_vao(ctx, &save->VAO[vpm], NULL);
+   assert (save->vertex_store);
 
-      /* Allocate and map new store:
-       */
-      save->vertex_store = alloc_vertex_store(ctx, vertex_count);
-      save->buffer_ptr = save->vertex_store->buffer_in_ram + save->vertex_store->used;
-      save->out_of_memory = save->buffer_ptr == NULL;
+   int new_size = (save->vertex_store->used +
+                   vertex_count * save->vertex_size) * sizeof(GLfloat);
+
+   /* Limit how much memory we allocate. */
+   if (save->prim_store->used > 0 &&
+       vertex_count > 0 &&
+       new_size > VBO_SAVE_BUFFER_SIZE) {
+      wrap_filled_vertex(ctx);
+      new_size = VBO_SAVE_BUFFER_SIZE;
    }
 
-   if (prim_count >= 0)
-      save->prim_store = realloc_prim_store(save->prim_store, prim_count);
+   if (new_size > save->vertex_store->buffer_in_ram_size) {
+      save->vertex_store->buffer_in_ram_size = new_size;
+      save->vertex_store->buffer_in_ram = realloc(save->vertex_store->buffer_in_ram,
+                                                  save->vertex_store->buffer_in_ram_size);
+      if (save->vertex_store->buffer_in_ram == NULL)
+         save->out_of_memory = true;
+   }
 }
 
 struct vertex_key {
@@ -466,7 +462,7 @@ add_vertex(struct vbo_save_context *save, struct hash_table *hash_to_index,
    if (!hash_to_index)
       return index;
 
-   fi_type *vert = save->buffer_map + save->vertex_size * index;
+   fi_type *vert = save->vertex_store->buffer_in_ram + save->vertex_size * index;
 
    struct vertex_key *key = malloc(sizeof(struct vertex_key));
    key->vertex_size = save->vertex_size;
@@ -499,6 +495,15 @@ add_vertex(struct vbo_save_context *save, struct hash_table *hash_to_index,
 }
 
 
+static uint32_t
+get_vertex_count(struct vbo_save_context *save)
+{
+   if (!save->vertex_size)
+      return 0;
+   return save->vertex_store->used / save->vertex_size;
+}
+
+
 /**
  * Insert the active immediate struct onto the display list currently
  * being built.
@@ -518,7 +523,6 @@ compile_vertex_list(struct gl_context *ctx)
    if (!node)
       return;
 
-   memset(node, 0, sizeof(struct vbo_save_vertex_list));
    node->cold = calloc(1, sizeof(*node->cold));
 
    /* Make sure the pointer is aligned to the size of a pointer */
@@ -526,7 +530,7 @@ compile_vertex_list(struct gl_context *ctx)
 
    const GLsizei stride = save->vertex_size*sizeof(GLfloat);
 
-   node->cold->vertex_count = save->vert_count;
+   node->cold->vertex_count = get_vertex_count(save);
    node->cold->wrap_count = save->copied.nr;
    node->cold->prims = malloc(sizeof(struct _mesa_prim) * save->prim_store->used);
    memcpy(node->cold->prims, save->prim_store->prims, sizeof(struct _mesa_prim) * save->prim_store->used);
@@ -543,7 +547,7 @@ compile_vertex_list(struct gl_context *ctx)
       if (current_size) {
          node->cold->current_data = malloc(current_size * sizeof(GLfloat));
          if (node->cold->current_data) {
-            const char *buffer = (const char *)save->buffer_map;
+            const char *buffer = (const char *)save->vertex_store->buffer_in_ram;
             unsigned attr_offset = save->attrsz[0] * sizeof(GLfloat);
             unsigned vertex_offset = 0;
 
@@ -554,6 +558,7 @@ compile_vertex_list(struct gl_context *ctx)
                    current_size * sizeof(GLfloat));
          } else {
             _mesa_error(ctx, GL_OUT_OF_MEMORY, "Current value allocation");
+            save->out_of_memory = true;
          }
       }
    }
@@ -563,11 +568,9 @@ compile_vertex_list(struct gl_context *ctx)
    if (save->dangling_attr_ref)
       ctx->ListState.Current.UseLoopback = true;
 
-   save->vertex_store->used += save->vertex_size * node->cold->vertex_count;
-
    /* Copy duplicated vertices
     */
-   save->copied.nr = copy_vertices(ctx, node, save->buffer_map);
+   save->copied.nr = copy_vertices(ctx, node, save->vertex_store->buffer_in_ram);
 
    if (node->cold->prims[node->cold->prim_count - 1].mode == GL_LINE_LOOP) {
       convert_line_loop_to_strip(save, node);
@@ -580,7 +583,7 @@ compile_vertex_list(struct gl_context *ctx)
 
    /* Create an index buffer. */
    node->cold->min_index = node->cold->max_index = 0;
-   if (save->vert_count == 0 || node->cold->prim_count == 0)
+   if (node->cold->vertex_count == 0 || node->cold->prim_count == 0)
       goto end;
 
    /* We won't modify node->prims, so use a const alias to avoid unintended
@@ -594,13 +597,11 @@ compile_vertex_list(struct gl_context *ctx)
    node->cold->min_index = node->cold->prims[0].start;
    node->cold->max_index = end - 1;
 
-   /* Estimate for the worst case: all prims are line strips (the +1 is because
-    * wrap_buffers may call use but the last primitive may not be complete) */
-   int max_indices_count = MAX2(total_vert_count * 2 - (node->cold->prim_count * 2) + 1,
-                                total_vert_count);
-
-   int size = max_indices_count * sizeof(uint32_t);
-   uint32_t* indices = (uint32_t*) malloc(size);
+   /* converting primitive types may result in many more indices */
+   bool all_prims_supported = (ctx->Const.DriverSupportedPrimMask & BITFIELD_MASK(PIPE_PRIM_MAX)) == BITFIELD_MASK(PIPE_PRIM_MAX);
+   int max_index_count = total_vert_count * (all_prims_supported ? 2 : 3);
+   uint32_t* indices = (uint32_t*) malloc(max_index_count * sizeof(uint32_t));
+   void *tmp_indices = all_prims_supported ? NULL : malloc(max_index_count * sizeof(uint32_t));
    struct _mesa_prim *merged_prims = NULL;
 
    int idx = 0;
@@ -622,16 +623,41 @@ compile_vertex_list(struct gl_context *ctx)
    for (unsigned i = 0; i < node->cold->prim_count; i++) {
       assert(original_prims[i].basevertex == 0);
       GLubyte mode = original_prims[i].mode;
+      bool converted_prim = false;
+      unsigned index_size;
 
       int vertex_count = original_prims[i].count;
       if (!vertex_count) {
          continue;
       }
 
+      /* Increase indices storage if the original estimation was too small. */
+      if (idx + 3 * vertex_count > max_index_count) {
+         max_index_count = max_index_count + 3 * vertex_count;
+         indices = (uint32_t*) realloc(indices, max_index_count * sizeof(uint32_t));
+         tmp_indices = all_prims_supported ? NULL : realloc(tmp_indices, max_index_count * sizeof(uint32_t));
+      }
+
       /* Line strips may get converted to lines */
       if (mode == GL_LINE_STRIP)
          mode = GL_LINES;
 
+      if (!(ctx->Const.DriverSupportedPrimMask & BITFIELD_BIT(mode))) {
+         unsigned new_count;
+         u_generate_func trans_func;
+         enum pipe_prim_type pmode = (enum pipe_prim_type)mode;
+         u_index_generator(ctx->Const.DriverSupportedPrimMask,
+                           pmode, original_prims[i].start, vertex_count,
+                           PV_LAST, PV_LAST,
+                           &pmode, &index_size, &new_count,
+                           &trans_func);
+         if (new_count > 0)
+            trans_func(original_prims[i].start, new_count, tmp_indices);
+         vertex_count = new_count;
+         mode = (GLubyte)pmode;
+         converted_prim = true;
+      }
+
       /* If 2 consecutive prims use the same mode => merge them. */
       bool merge_prims = last_valid_prim >= 0 &&
                          mode == merged_prims[last_valid_prim].mode &&
@@ -639,6 +665,8 @@ compile_vertex_list(struct gl_context *ctx)
                          mode != GL_QUAD_STRIP && mode != GL_POLYGON &&
                          mode != GL_PATCHES;
 
+/* index generation uses uint16_t if the index count is small enough */
+#define CAST_INDEX(BASE, SIZE, IDX) ((SIZE == 2 ? (uint32_t)(((uint16_t*)BASE)[IDX]) : ((uint32_t*)BASE)[IDX]))
       /* To be able to merge consecutive triangle strips we need to insert
        * a degenerate triangle.
        */
@@ -649,14 +677,16 @@ compile_vertex_list(struct gl_context *ctx)
          unsigned tri_count = merged_prims[last_valid_prim].count - 2;
 
          indices[idx] = indices[idx - 1];
-         indices[idx + 1] = add_vertex(save, vertex_to_index, original_prims[i].start,
+         indices[idx + 1] = add_vertex(save, vertex_to_index,
+                                       converted_prim ? CAST_INDEX(tmp_indices, index_size, 0) : original_prims[i].start,
                                        temp_vertices_buffer, &max_index);
          idx += 2;
          merged_prims[last_valid_prim].count += 2;
 
          if (tri_count % 2) {
             /* Add another index to preserve winding order */
-            indices[idx++] = add_vertex(save, vertex_to_index, original_prims[i].start,
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, 0) : original_prims[i].start,
                                         temp_vertices_buffer, &max_index);
             merged_prims[last_valid_prim].count++;
          }
@@ -674,24 +704,40 @@ compile_vertex_list(struct gl_context *ctx)
             (original_prims[i + 1].mode == GL_LINE_STRIP ||
              original_prims[i + 1].mode == GL_LINES)))) {
          for (unsigned j = 0; j < vertex_count; j++) {
-            indices[idx++] = add_vertex(save, vertex_to_index, original_prims[i].start + j,
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
                                         temp_vertices_buffer, &max_index);
             /* Repeat all but the first/last indices. */
             if (j && j != vertex_count - 1) {
-               indices[idx++] = add_vertex(save, vertex_to_index, original_prims[i].start + j,
+               indices[idx++] = add_vertex(save, vertex_to_index,
+                                           converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
                                            temp_vertices_buffer, &max_index);
             }
          }
       } else {
          /* We didn't convert to LINES, so restore the original mode */
-         mode = original_prims[i].mode;
+         if (!converted_prim)
+            mode = original_prims[i].mode;
 
          for (unsigned j = 0; j < vertex_count; j++) {
-            indices[idx++] = add_vertex(save, vertex_to_index, original_prims[i].start + j,
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
                                         temp_vertices_buffer, &max_index);
          }
       }
 
+      /* Duplicate the last vertex for incomplete primitives */
+      if (vertex_count > 0) {
+         unsigned min_vert = u_prim_vertex_count(mode)->min;
+         for (unsigned j = vertex_count; j < min_vert; j++) {
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                       converted_prim ? CAST_INDEX(tmp_indices, index_size, vertex_count - 1) :
+                                                         original_prims[i].start + vertex_count - 1,
+                                       temp_vertices_buffer, &max_index);
+         }
+      }
+
+#undef CAST_INDEX
       if (merge_prims) {
          /* Update vertex count. */
          merged_prims[last_valid_prim].count += idx - start;
@@ -705,9 +751,10 @@ compile_vertex_list(struct gl_context *ctx)
          merged_prims[last_valid_prim].count = idx - start;
       }
       merged_prims[last_valid_prim].mode = mode;
-   }
 
-   assert(idx > 0 && idx <= max_indices_count);
+      /* converted prims will filter incomplete primitives and may have no indices */
+      assert((idx > 0 || converted_prim) && idx <= max_index_count);
+   }
 
    unsigned merged_prim_count = last_valid_prim + 1;
    node->cold->ib.ptr = NULL;
@@ -737,16 +784,18 @@ compile_vertex_list(struct gl_context *ctx)
    if (total_bytes_needed > available_bytes) {
       if (save->current_bo)
          _mesa_reference_buffer_object(ctx, &save->current_bo, NULL);
-      save->current_bo = ctx->Driver.NewBufferObject(ctx, VBO_BUF_ID + 1);
-      bool success = ctx->Driver.BufferData(ctx,
-                                            GL_ELEMENT_ARRAY_BUFFER_ARB,
-                                            MAX2(total_bytes_needed, VBO_SAVE_BUFFER_SIZE * sizeof(uint32_t)),
-                                            NULL,
-                                            GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT,
-                                            save->current_bo);
+      save->current_bo = _mesa_bufferobj_alloc(ctx, VBO_BUF_ID + 1);
+      bool success = _mesa_bufferobj_data(ctx,
+                                          GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                          MAX2(total_bytes_needed, VBO_SAVE_BUFFER_SIZE),
+                                          NULL,
+                                          GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT |
+                                          MESA_GALLIUM_VERTEX_STATE_STORAGE,
+                                          save->current_bo);
       if (!success) {
          _mesa_reference_buffer_object(ctx, &save->current_bo, NULL);
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "IB allocation");
+         save->out_of_memory = true;
       } else {
          save->current_bo_bytes_used = 0;
          available_bytes = save->current_bo->Size;
@@ -791,33 +840,36 @@ compile_vertex_list(struct gl_context *ctx)
    _mesa_reference_buffer_object(ctx, &node->cold->ib.obj, save->current_bo);
 
    /* Upload the vertices first (see buffer_offset) */
-   ctx->Driver.BufferSubData(ctx,
-                             save->current_bo_bytes_used,
-                             total_vert_count * save->vertex_size * sizeof(fi_type),
-                             vertex_to_index ? temp_vertices_buffer : save->buffer_map,
-                             node->cold->ib.obj);
+   _mesa_bufferobj_subdata(ctx,
+                           save->current_bo_bytes_used,
+                           total_vert_count * save->vertex_size * sizeof(fi_type),
+                           vertex_to_index ? temp_vertices_buffer : save->vertex_store->buffer_in_ram,
+                           node->cold->ib.obj);
    save->current_bo_bytes_used += total_vert_count * save->vertex_size * sizeof(fi_type);
+   node->cold->bo_bytes_used = save->current_bo_bytes_used;
 
   if (vertex_to_index) {
       _mesa_hash_table_destroy(vertex_to_index, _free_entry);
       free(temp_vertices_buffer);
    }
 
-   /* Since we're append the indices to an existing buffer, we need to adjust the start value of each
+   /* Since we append the indices to an existing buffer, we need to adjust the start value of each
     * primitive (not the indices themselves). */
-   save->current_bo_bytes_used += align(save->current_bo_bytes_used, 4) - save->current_bo_bytes_used;
-   int indices_offset = save->current_bo_bytes_used / 4;
-   for (int i = 0; i < merged_prim_count; i++) {
-      merged_prims[i].start += indices_offset;
+   if (!ctx->ListState.Current.UseLoopback) {
+      save->current_bo_bytes_used += align(save->current_bo_bytes_used, 4) - save->current_bo_bytes_used;
+      int indices_offset = save->current_bo_bytes_used / 4;
+      for (int i = 0; i < merged_prim_count; i++) {
+         merged_prims[i].start += indices_offset;
+      }
    }
 
    /* Then upload the indices. */
    if (node->cold->ib.obj) {
-      ctx->Driver.BufferSubData(ctx,
-                                save->current_bo_bytes_used,
-                                idx * sizeof(uint32_t),
-                                indices,
-                                node->cold->ib.obj);
+      _mesa_bufferobj_subdata(ctx,
+                              save->current_bo_bytes_used,
+                              idx * sizeof(uint32_t),
+                              indices,
+                              node->cold->ib.obj);
       save->current_bo_bytes_used += idx * sizeof(uint32_t);
    } else {
       node->cold->vertex_count = 0;
@@ -825,54 +877,59 @@ compile_vertex_list(struct gl_context *ctx)
    }
 
    /* Prepare for DrawGallium */
-   memset(&node->merged.info, 0, sizeof(struct pipe_draw_info));
+   memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
    /* The other info fields will be updated in vbo_save_playback_vertex_list */
-   node->merged.info.index_size = 4;
-   node->merged.info.instance_count = 1;
-   node->merged.info.index.gl_bo = node->cold->ib.obj;
+   node->cold->info.index_size = 4;
+   node->cold->info.instance_count = 1;
+   node->cold->info.index.gl_bo = node->cold->ib.obj;
    if (merged_prim_count == 1) {
-      node->merged.info.mode = merged_prims[0].mode;
-      node->merged.start_count.start = merged_prims[0].start;
-      node->merged.start_count.count = merged_prims[0].count;
-      node->merged.start_count.index_bias = 0;
-      node->merged.mode = NULL;
+      node->cold->info.mode = merged_prims[0].mode;
+      node->start_count.start = merged_prims[0].start;
+      node->start_count.count = merged_prims[0].count;
+      node->start_count.index_bias = 0;
+      node->modes = NULL;
    } else {
-      node->merged.mode = malloc(merged_prim_count * sizeof(unsigned char));
-      node->merged.start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
+      node->modes = malloc(merged_prim_count * sizeof(unsigned char));
+      node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
       for (unsigned i = 0; i < merged_prim_count; i++) {
-         node->merged.start_counts[i].start = merged_prims[i].start;
-         node->merged.start_counts[i].count = merged_prims[i].count;
-         node->merged.start_counts[i].index_bias = 0;
-         node->merged.mode[i] = merged_prims[i].mode;
+         node->start_counts[i].start = merged_prims[i].start;
+         node->start_counts[i].count = merged_prims[i].count;
+         node->start_counts[i].index_bias = 0;
+         node->modes[i] = merged_prims[i].mode;
       }
    }
-   node->merged.num_draws = merged_prim_count;
-   if (node->merged.num_draws > 1) {
+   node->num_draws = merged_prim_count;
+   if (node->num_draws > 1) {
       bool same_mode = true;
-      for (unsigned i = 1; i < node->merged.num_draws && same_mode; i++) {
-         same_mode = node->merged.mode[i] == node->merged.mode[0];
+      for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
+         same_mode = node->modes[i] == node->modes[0];
       }
       if (same_mode) {
          /* All primitives use the same mode, so we can simplify a bit */
-         node->merged.info.mode = node->merged.mode[0];
-         free(node->merged.mode);
-         node->merged.mode = NULL;
+         node->cold->info.mode = node->modes[0];
+         free(node->modes);
+         node->modes = NULL;
       }
    }
 
    free(indices);
+   free(tmp_indices);
    free(merged_prims);
 
 end:
+   node->draw_begins = node->cold->prims[0].begin;
 
    if (!save->current_bo) {
-      save->current_bo = ctx->Driver.NewBufferObject(ctx, VBO_BUF_ID + 1);
-      bool success = ctx->Driver.BufferData(ctx,
-                                            GL_ELEMENT_ARRAY_BUFFER_ARB,
-                                            VBO_SAVE_BUFFER_SIZE * sizeof(uint32_t),
-                                            NULL,
-                                            GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT,
-                                            save->current_bo);
+      save->current_bo = _mesa_bufferobj_alloc(ctx, VBO_BUF_ID + 1);
+      bool success = _mesa_bufferobj_data(ctx,
+                                          GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                          VBO_SAVE_BUFFER_SIZE,
+                                          NULL,
+                                          GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT |
+                                          MESA_GALLIUM_VERTEX_STATE_STORAGE,
+                                          save->current_bo);
+      if (!success)
+         save->out_of_memory = true;
    }
 
    GLuint offsets[VBO_ATTRIB_MAX];
@@ -889,55 +946,45 @@ end:
                  save->current_bo, buffer_offset, stride,
                  save->enabled, save->attrsz, save->attrtype, offsets);
       /* Reference the vao in the dlist */
-      node->VAO[vpm] = NULL;
-      _mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
-   }
+      node->cold->VAO[vpm] = NULL;
+      _mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
+   }
+
+   /* Prepare for DrawGalliumVertexState */
+   if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
+      for (unsigned i = 0; i < VP_MODE_MAX; i++) {
+         uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
+                                    node->cold->VAO[i]->_EnabledWithMapMode;
+
+         node->state[i] =
+            ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
+                                                 node->cold->ib.obj,
+                                                 enabled_attribs);
+         node->private_refcount[i] = 0;
+         node->enabled_attribs[i] = enabled_attribs;
+      }
 
+      node->ctx = ctx;
+      node->mode = node->cold->info.mode;
+      assert(node->cold->info.index_size == 4);
+   }
 
    /* Deal with GL_COMPILE_AND_EXECUTE:
     */
    if (ctx->ExecuteFlag) {
-      struct _glapi_table *dispatch = GET_DISPATCH();
-
-      _glapi_set_dispatch(ctx->Exec);
-
       /* _vbo_loopback_vertex_list doesn't use the index buffer, so we have to
-       * use buffer_in_ram instead of current_bo which contains all vertices instead
-       * of the deduplicated vertices only in the !UseLoopback case.
+       * use buffer_in_ram (which contains all vertices) instead of current_bo
+       * (which contains deduplicated vertices *when* UseLoopback is false).
        *
        * The problem is that the VAO offset is based on current_bo's layout,
        * so we have to use a temp value.
        */
-      struct gl_vertex_array_object *vao = node->VAO[VP_MODE_SHADER];
+      struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
       GLintptr original = vao->BufferBinding[0].Offset;
-      if (!ctx->ListState.Current.UseLoopback) {
-         GLintptr new_offset = (save->buffer_map - save->vertex_store->buffer_in_ram) *
-                               sizeof(GLfloat);
-         /* 'start_offset' has been added to all primitives 'start', so undo it here. */
-         new_offset -= start_offset * stride;
-         vao->BufferBinding[0].Offset = new_offset;
-      }
+      /* 'start_offset' has been added to all primitives 'start', so undo it here. */
+      vao->BufferBinding[0].Offset = -(GLintptr)(start_offset * stride);
       _vbo_loopback_vertex_list(ctx, node, save->vertex_store->buffer_in_ram);
       vao->BufferBinding[0].Offset = original;
-
-      _glapi_set_dispatch(dispatch);
-   }
-
-   /* Decide whether the storage structs are full, or can be used for
-    * the next vertex lists as well.
-    */
-   if (save->vertex_store->used >
-       save->vertex_store->buffer_in_ram_size / sizeof(float) - 16 * (save->vertex_size + 4)) {
-      realloc_storage(ctx, -1, 0);
-   }
-   else {
-      /* update buffer_ptr for next vertex */
-      save->buffer_ptr = save->vertex_store->buffer_in_ram
-         + save->vertex_store->used;
-   }
-
-   if (save->prim_store->used > save->prim_store->size - 6) {
-      realloc_storage(ctx, 0, -1);
    }
 
    /* Reset our structures for the next run of vertices:
@@ -963,7 +1010,7 @@ wrap_buffers(struct gl_context *ctx)
 
    /* Close off in-progress primitive.
     */
-   save->prim_store->prims[i].count = (save->vert_count - save->prim_store->prims[i].start);
+   save->prim_store->prims[i].count = (get_vertex_count(save) - save->prim_store->prims[i].start);
    mode = save->prim_store->prims[i].mode;
 
    /* store the copied vertices, and allocate a new list.
@@ -995,16 +1042,22 @@ wrap_filled_vertex(struct gl_context *ctx)
     */
    wrap_buffers(ctx);
 
+   assert(save->vertex_store->used == 0); /* expected empty after wrap_buffers() */
+
    /* Copy stored stored vertices to start of new list.
     */
-   assert(save->max_vert - save->vert_count > save->copied.nr);
-
    numComponents = save->copied.nr * save->vertex_size;
-   memcpy(save->buffer_ptr,
-          save->copied.buffer,
-          numComponents * sizeof(fi_type));
-   save->buffer_ptr += numComponents;
-   save->vert_count += save->copied.nr;
+
+   fi_type *buffer_ptr = save->vertex_store->buffer_in_ram;
+   if (numComponents) {
+      assert(save->copied.buffer);
+      memcpy(buffer_ptr,
+             save->copied.buffer,
+             numComponents * sizeof(fi_type));
+      free(save->copied.buffer); /* contents were just copied into the store */
+      save->copied.buffer = NULL;
+   }
+   save->vertex_store->used = numComponents; /* counted in fi_type elements */
 }
 
 
@@ -1074,7 +1127,7 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
    /* Store the current run of vertices, and emit a GL_END.  Emit a
     * BEGIN in the new buffer.
     */
-   if (save->vert_count)
+   if (save->vertex_store->used)
       wrap_buffers(ctx);
    else
       assert(save->copied.nr == 0);
@@ -1092,10 +1145,6 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
    save->enabled |= BITFIELD64_BIT(attr);
 
    save->vertex_size += newsz - oldsz;
-   save->max_vert = ((save->vertex_store->buffer_in_ram_size / sizeof(float) -
-                      save->vertex_store->used) /
-                     save->vertex_size);
-   save->vert_count = 0;
 
    /* Recalculate all the attrptr[] values:
     */
@@ -1121,8 +1170,10 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
     * and will need fixup at runtime.
     */
    if (save->copied.nr) {
+      assert(save->copied.buffer);
       const fi_type *data = save->copied.buffer;
-      fi_type *dest = save->buffer_map;
+      grow_vertex_storage(ctx, save->copied.nr);
+      fi_type *dest = save->vertex_store->buffer_in_ram;
 
       /* Need to note this and fix up at runtime (or loopback):
        */
@@ -1137,28 +1188,43 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
             const int j = u_bit_scan64(&enabled);
             assert(save->attrsz[j]);
             if (j == attr) {
-               if (oldsz) {
-                  COPY_CLEAN_4V_TYPE_AS_UNION(dest, oldsz, data,
-                                              save->attrtype[j]);
-                  data += oldsz;
-                  dest += newsz;
-               }
-               else {
-                  COPY_SZ_4V(dest, newsz, save->current[attr]);
-                  dest += newsz;
+               int k;
+               const fi_type *src = oldsz ? data : save->current[attr];
+               int copy = oldsz ? oldsz : newsz;
+               for (k = 0; k < copy; k++)
+                  dest[k] = src[k];
+               for (; k < newsz; k++) {
+                  switch (save->attrtype[j]) {
+                     case GL_FLOAT:
+                        dest[k] = FLOAT_AS_UNION(k == 3);
+                        break;
+                     case GL_INT:
+                        dest[k] = INT_AS_UNION(k == 3);
+                        break;
+                     case GL_UNSIGNED_INT:
+                        dest[k] = UINT_AS_UNION(k == 3);
+                        break;
+                     default:
+                        dest[k] = FLOAT_AS_UNION(k == 3);
+                        assert(!"Unexpected type in upgrade_vertex");
+                        break;
+                  }
                }
-            }
-            else {
+               dest += newsz;
+               data += oldsz;
+            } else {
                GLint sz = save->attrsz[j];
-               COPY_SZ_4V(dest, sz, data);
+               for (int k = 0; k < sz; k++)
+                  dest[k] = data[k];
                data += sz;
                dest += sz;
             }
          }
       }
 
-      save->buffer_ptr = dest;
-      save->vert_count += save->copied.nr;
+      save->vertex_store->used += save->vertex_size * save->copied.nr;
+      free(save->copied.buffer);
+      save->copied.buffer = NULL;
    }
 }
 
@@ -1193,6 +1259,8 @@ fixup_vertex(struct gl_context *ctx, GLuint attr,
    }
 
    save->active_sz[attr] = sz;
+
+   grow_vertex_storage(ctx, 1);
 }
 
 
@@ -1241,34 +1309,39 @@ is_vertex_position(const struct gl_context *ctx, GLuint index)
  * 3f version won't otherwise set color[3] to 1.0 -- this is the job
  * of the chooser function when switching between Color4f and Color3f.
  */
-#define ATTR_UNION(A, N, T, C, V0, V1, V2, V3)                 \
-do {                                                           \
-   struct vbo_save_context *save = &vbo_context(ctx)->save;    \
-   int sz = (sizeof(C) / sizeof(GLfloat));                     \
-                                                               \
-   if (save->active_sz[A] != N)                                        \
-      fixup_vertex(ctx, A, N * sz, T);                         \
-                                                               \
-   {                                                           \
+#define ATTR_UNION(A, N, T, C, V0, V1, V2, V3)                  \
+do {                                                            \
+   struct vbo_save_context *save = &vbo_context(ctx)->save;     \
+   int sz = (sizeof(C) / sizeof(GLfloat));                      \
+                                                                \
+   if (save->active_sz[A] != N)                                 \
+      fixup_vertex(ctx, A, N * sz, T);                          \
+                                                                \
+   {                                                            \
       C *dest = (C *)save->attrptr[A];                          \
-      if (N>0) dest[0] = V0;                                   \
-      if (N>1) dest[1] = V1;                                   \
-      if (N>2) dest[2] = V2;                                   \
-      if (N>3) dest[3] = V3;                                   \
-      save->attrtype[A] = T;                                   \
-   }                                                           \
-                                                               \
-   if ((A) == 0) {                                             \
-      GLuint i;                                                        \
-                                                               \
-      for (i = 0; i < save->vertex_size; i++)                  \
-        save->buffer_ptr[i] = save->vertex[i];                 \
-                                                               \
-      save->buffer_ptr += save->vertex_size;                   \
-                                                               \
-      if (++save->vert_count >= save->max_vert)                        \
-        wrap_filled_vertex(ctx);                               \
-   }                                                           \
+      if (N>0) dest[0] = V0;                                    \
+      if (N>1) dest[1] = V1;                                    \
+      if (N>2) dest[2] = V2;                                    \
+      if (N>3) dest[3] = V3;                                    \
+      save->attrtype[A] = T;                                    \
+   }                                                            \
+                                                                \
+   if ((A) == VBO_ATTRIB_POS) {                                 \
+      fi_type *buffer_ptr = save->vertex_store->buffer_in_ram + \
+                            save->vertex_store->used;           \
+                                                                \
+      for (GLuint i = 0; i < save->vertex_size; i++)            \
+        buffer_ptr[i] = save->vertex[i];                        \
+                                                                \
+      save->vertex_store->used += save->vertex_size;            \
+      unsigned used_next = (save->vertex_store->used +          \
+                            save->vertex_size) * sizeof(float); \
+      if (used_next > save->vertex_store->buffer_in_ram_size) { \
+         grow_vertex_storage(ctx, get_vertex_count(save));      \
+         assert(used_next <=                                    \
+                save->vertex_store->buffer_in_ram_size);        \
+      }                                                         \
+   }                                                            \
 } while (0)
 
 #define TAG(x) _save_##x
@@ -1276,13 +1349,12 @@ do {                                                            \
 #include "vbo_attrib_tmp.h"
 
 
-
-#define MAT( ATTR, N, face, params )                   \
-do {                                                   \
-   if (face != GL_BACK)                                        \
-      MAT_ATTR( ATTR, N, params ); /* front */         \
-   if (face != GL_FRONT)                               \
-      MAT_ATTR( ATTR + 1, N, params ); /* back */      \
+#define MAT( ATTR, N, face, params )                            \
+do {                                                            \
+   if (face != GL_BACK)                                         \
+      MAT_ATTR( ATTR, N, params ); /* front */                  \
+   if (face != GL_FRONT)                                        \
+      MAT_ATTR( ATTR + 1, N, params ); /* back */               \
 } while (0)
 
 
@@ -1335,6 +1407,10 @@ _save_Materialfv(GLenum face, GLenum pname, const GLfloat *params)
 }
 
 
+static void
+vbo_init_dispatch_save_begin_end(struct gl_context *ctx);
+
+
 /* Cope with EvalCoord/CallList called within a begin/end object:
  *     -- Flush current buffer
  *     -- Fallback to opcodes for the rest of the begin/end object.
@@ -1344,11 +1420,14 @@ dlist_fallback(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   if (save->vert_count || save->prim_store->used) {
-      if (save->prim_store->used > 0) {
+   if (save->vertex_store->used || save->prim_store->used) {
+      if (save->prim_store->used > 0 && save->vertex_store->used > 0) {
+         assert(save->vertex_size);
          /* Close off in-progress primitive. */
          GLint i = save->prim_store->used - 1;
-         save->prim_store->prims[i].count = save->vert_count - save->prim_store->prims[i].start;
+         save->prim_store->prims[i].count =
+            get_vertex_count(save) -
+            save->prim_store->prims[i].start;
       }
 
       /* Need to replay this display list with loopback,
@@ -1362,12 +1441,11 @@ dlist_fallback(struct gl_context *ctx)
 
    copy_to_current(ctx);
    reset_vertex(ctx);
-   reset_counters(ctx);
    if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
+      vbo_install_save_vtxfmt_noop(ctx);
    }
    else {
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
@@ -1452,21 +1530,18 @@ vbo_save_NotifyBegin(struct gl_context *ctx, GLenum mode,
 
    ctx->Driver.CurrentSavePrimitive = mode;
 
-   assert(i < save->prim_store->size);
+   if (!save->prim_store || i >= save->prim_store->size) {
+      save->prim_store = realloc_prim_store(save->prim_store, i * 2);
+   }
    save->prim_store->prims[i].mode = mode & VBO_SAVE_PRIM_MODE_MASK;
    save->prim_store->prims[i].begin = 1;
    save->prim_store->prims[i].end = 0;
-   save->prim_store->prims[i].start = save->vert_count;
+   save->prim_store->prims[i].start = get_vertex_count(save);
    save->prim_store->prims[i].count = 0;
 
    save->no_current_update = no_current_update;
 
-   if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
-   }
-   else {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt);
-   }
+   vbo_init_dispatch_save_begin_end(ctx);
 
    /* We need to call vbo_save_SaveFlushVertices() if there's state change */
    ctx->Driver.SaveNeedFlush = GL_TRUE;
@@ -1482,22 +1557,17 @@ _save_End(void)
 
    ctx->Driver.CurrentSavePrimitive = PRIM_OUTSIDE_BEGIN_END;
    save->prim_store->prims[i].end = 1;
-   save->prim_store->prims[i].count = (save->vert_count - save->prim_store->prims[i].start);
-
-   if (i == (GLint) save->prim_store->size - 1) {
-      compile_vertex_list(ctx);
-      assert(save->copied.nr == 0);
-   }
+   save->prim_store->prims[i].count = (get_vertex_count(save) - save->prim_store->prims[i].start);
 
    /* Swap out this vertex format while outside begin/end.  Any color,
     * etc. received between here and the next begin will be compiled
     * as opcodes.
     */
    if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
+      vbo_install_save_vtxfmt_noop(ctx);
    }
    else {
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
 }
 
@@ -1535,13 +1605,8 @@ _save_PrimitiveRestartNV(void)
 }
 
 
-/* Unlike the functions above, these are to be hooked into the vtxfmt
- * maintained in ctx->ListState, active when the list is known or
- * suspected to be outside any begin/end primitive.
- * Note: OBE = Outside Begin/End
- */
-static void GLAPIENTRY
-_save_OBE_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
+void GLAPIENTRY
+save_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
@@ -1555,69 +1620,44 @@ _save_OBE_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
 }
 
 
-static void GLAPIENTRY
-_save_OBE_Rectd(GLdouble x1, GLdouble y1, GLdouble x2, GLdouble y2)
+void GLAPIENTRY
+save_Rectdv(const GLdouble *v1, const GLdouble *v2)
 {
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectdv(const GLdouble *v1, const GLdouble *v2)
+void GLAPIENTRY
+save_Rectfv(const GLfloat *v1, const GLfloat *v2)
 {
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
+   save_Rectf(v1[0], v1[1], v2[0], v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectfv(const GLfloat *v1, const GLfloat *v2)
+void GLAPIENTRY
+save_Recti(GLint x1, GLint y1, GLint x2, GLint y2)
 {
-   _save_OBE_Rectf(v1[0], v1[1], v2[0], v2[1]);
+   save_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
 }
 
-static void GLAPIENTRY
-_save_OBE_Recti(GLint x1, GLint y1, GLint x2, GLint y2)
+void GLAPIENTRY
+save_Rectiv(const GLint *v1, const GLint *v2)
 {
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectiv(const GLint *v1, const GLint *v2)
+void GLAPIENTRY
+save_Rects(GLshort x1, GLshort y1, GLshort x2, GLshort y2)
 {
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
+   save_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rects(GLshort x1, GLshort y1, GLshort x2, GLshort y2)
-{
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
-}
-
-static void GLAPIENTRY
-_save_OBE_Rectsv(const GLshort *v1, const GLshort *v2)
-{
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
-}
-
-static void
-_ensure_draws_fits_in_storage(struct gl_context *ctx, int primcount, int vertcount)
+void GLAPIENTRY
+save_Rectsv(const GLshort *v1, const GLshort *v2)
 {
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-
-   bool realloc_prim = save->prim_store->used + primcount > save->prim_store->size;
-   bool realloc_vert = save->vertex_size && (save->vert_count + vertcount >= save->max_vert);
-
-   if (realloc_prim || realloc_vert) {
-      if (save->vert_count || save->prim_store->used) {
-         /* TODO: this really isn't needed. We should realloc only the CPU-side memory. */
-         compile_vertex_list(ctx);
-      }
-      realloc_storage(ctx, realloc_prim ? primcount : -1, realloc_vert ? vertcount : -1);
-      reset_counters(ctx);
-   }
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-
-static void GLAPIENTRY
-_save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
+void GLAPIENTRY
+save_DrawArrays(GLenum mode, GLint start, GLsizei count)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_vertex_array_object *vao = ctx->Array.VAO;
@@ -1636,7 +1676,7 @@ _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
    if (save->out_of_memory)
       return;
 
-   _ensure_draws_fits_in_storage(ctx, 1, count);
+   grow_vertex_storage(ctx, count);
 
    /* Make sure to process any VBO binding changes */
    _mesa_update_state(ctx);
@@ -1653,9 +1693,9 @@ _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
 }
 
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawArrays(GLenum mode, const GLint *first,
-                          const GLsizei *count, GLsizei primcount)
+void GLAPIENTRY
+save_MultiDrawArrays(GLenum mode, const GLint *first,
+                      const GLsizei *count, GLsizei primcount)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLint i;
@@ -1681,11 +1721,11 @@ _save_OBE_MultiDrawArrays(GLenum mode, const GLint *first,
       vertcount += count[i];
    }
 
-   _ensure_draws_fits_in_storage(ctx, primcount, vertcount);
+   grow_vertex_storage(ctx, vertcount);
 
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-         _save_OBE_DrawArrays(mode, first[i], count[i]);
+         save_DrawArrays(mode, first[i], count[i]);
       }
    }
 }
@@ -1717,9 +1757,9 @@ array_element(struct gl_context *ctx,
 /* Could do better by copying the arrays and element list intact and
  * then emitting an indexed prim at runtime.
  */
-static void GLAPIENTRY
-_save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
-                                 const GLvoid * indices, GLint basevertex)
+void GLAPIENTRY
+save_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
+                             const GLvoid * indices, GLint basevertex)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct vbo_save_context *save = &vbo_context(ctx)->save;
@@ -1745,7 +1785,7 @@ _save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
    if (save->out_of_memory)
       return;
 
-   _ensure_draws_fits_in_storage(ctx, 1, count);
+   grow_vertex_storage(ctx, count);
 
    /* Make sure to process any VBO binding changes */
    _mesa_update_state(ctx);
@@ -1781,16 +1821,16 @@ _save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
    _mesa_vao_unmap(ctx, vao);
 }
 
-static void GLAPIENTRY
-_save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
-                       const GLvoid * indices)
+void GLAPIENTRY
+save_DrawElements(GLenum mode, GLsizei count, GLenum type,
+                   const GLvoid * indices)
 {
-   _save_OBE_DrawElementsBaseVertex(mode, count, type, indices, 0);
+   save_DrawElementsBaseVertex(mode, count, type, indices, 0);
 }
 
 
-static void GLAPIENTRY
-_save_OBE_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
+void GLAPIENTRY
+save_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
                             GLsizei count, GLenum type,
                             const GLvoid * indices)
 {
@@ -1821,13 +1861,28 @@ _save_OBE_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
    if (save->out_of_memory)
       return;
 
-   _save_OBE_DrawElements(mode, count, type, indices);
+   save_DrawElements(mode, count, type, indices);
 }
 
+void GLAPIENTRY
+save_DrawRangeElementsBaseVertex(GLenum mode, GLuint start, GLuint end,
+                                 GLsizei count, GLenum type,
+                                 const GLvoid *indices, GLint basevertex)
+{
+   GET_CURRENT_CONTEXT(ctx);
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type,
-                            const GLvoid * const *indices, GLsizei primcount)
+   if (end < start) {
+      _mesa_compile_error(ctx, GL_INVALID_VALUE,
+                          "glDrawRangeElementsBaseVertex(end < start)");
+      return;
+   }
+
+   save_DrawElementsBaseVertex(mode, count, type, indices, basevertex);
+}
+
+void GLAPIENTRY
+save_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type,
+                       const GLvoid * const *indices, GLsizei primcount)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
@@ -1837,22 +1892,22 @@ _save_OBE_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type,
    for (i = 0; i < primcount; i++) {
       vertcount += count[i];
    }
-   _ensure_draws_fits_in_storage(ctx, primcount, vertcount);
+   grow_vertex_storage(ctx, vertcount);
 
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-        CALL_DrawElements(dispatch, (mode, count[i], type, indices[i]));
+         CALL_DrawElements(dispatch, (mode, count[i], type, indices[i]));
       }
    }
 }
 
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
-                                      GLenum type,
-                                      const GLvoid * const *indices,
-                                      GLsizei primcount,
-                                      const GLint *basevertex)
+void GLAPIENTRY
+save_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
+                                  GLenum type,
+                                  const GLvoid * const *indices,
+                                  GLsizei primcount,
+                                  const GLint *basevertex)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
@@ -1862,62 +1917,31 @@ _save_OBE_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
    for (i = 0; i < primcount; i++) {
       vertcount += count[i];
    }
-   _ensure_draws_fits_in_storage(ctx, primcount, vertcount);
+   grow_vertex_storage(ctx, vertcount);
 
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-        CALL_DrawElementsBaseVertex(dispatch, (mode, count[i], type,
-                                                     indices[i],
-                                                     basevertex[i]));
+         CALL_DrawElementsBaseVertex(dispatch, (mode, count[i], type,
+                                     indices[i],
+                                     basevertex[i]));
       }
    }
 }
 
 
 static void
-vtxfmt_init(struct gl_context *ctx)
+vbo_init_dispatch_save_begin_end(struct gl_context *ctx)
 {
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-   GLvertexformat *vfmt = &save->vtxfmt;
-
-#define NAME_AE(x) _ae_##x
+#define NAME_AE(x) _mesa_##x
 #define NAME_CALLLIST(x) _save_##x
 #define NAME(x) _save_##x
-#define NAME_ES(x) _save_##x##ARB
+#define NAME_ES(x) _save_##x
 
-#include "vbo_init_tmp.h"
+   struct _glapi_table *tab = ctx->Save;
+   #include "api_beginend_init.h"
 }
 
 
-/**
- * Initialize the dispatch table with the VBO functions for display
- * list compilation.
- */
-void
-vbo_initialize_save_dispatch(const struct gl_context *ctx,
-                             struct _glapi_table *exec)
-{
-   SET_DrawArrays(exec, _save_OBE_DrawArrays);
-   SET_MultiDrawArrays(exec, _save_OBE_MultiDrawArrays);
-   SET_DrawElements(exec, _save_OBE_DrawElements);
-   SET_DrawElementsBaseVertex(exec, _save_OBE_DrawElementsBaseVertex);
-   SET_DrawRangeElements(exec, _save_OBE_DrawRangeElements);
-   SET_MultiDrawElementsEXT(exec, _save_OBE_MultiDrawElements);
-   SET_MultiDrawElementsBaseVertex(exec, _save_OBE_MultiDrawElementsBaseVertex);
-   SET_Rectf(exec, _save_OBE_Rectf);
-   SET_Rectd(exec, _save_OBE_Rectd);
-   SET_Rectdv(exec, _save_OBE_Rectdv);
-   SET_Rectfv(exec, _save_OBE_Rectfv);
-   SET_Recti(exec, _save_OBE_Recti);
-   SET_Rectiv(exec, _save_OBE_Rectiv);
-   SET_Rects(exec, _save_OBE_Rects);
-   SET_Rectsv(exec, _save_OBE_Rectsv);
-
-   /* Note: other glDraw functins aren't compiled into display lists */
-}
-
-
-
 void
 vbo_save_SaveFlushVertices(struct gl_context *ctx)
 {
@@ -1928,12 +1952,11 @@ vbo_save_SaveFlushVertices(struct gl_context *ctx)
    if (ctx->Driver.CurrentSavePrimitive <= PRIM_MAX)
       return;
 
-   if (save->vert_count || save->prim_store->used)
+   if (save->vertex_store->used || save->prim_store->used)
       compile_vertex_list(ctx);
 
    copy_to_current(ctx);
    reset_vertex(ctx);
-   reset_counters(ctx);
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
 
@@ -1953,12 +1976,9 @@ vbo_save_NewList(struct gl_context *ctx, GLuint list, GLenum mode)
       save->prim_store = realloc_prim_store(NULL, 8);
 
    if (!save->vertex_store)
-      save->vertex_store = alloc_vertex_store(ctx, 0);
-
-   save->buffer_ptr = save->vertex_store->buffer_in_ram + save->vertex_store->used;
+      save->vertex_store = CALLOC_STRUCT(vbo_save_vertex_store);
 
    reset_vertex(ctx);
-   reset_counters(ctx);
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
 
@@ -1978,7 +1998,7 @@ vbo_save_EndList(struct gl_context *ctx)
          GLint i = save->prim_store->used - 1;
          ctx->Driver.CurrentSavePrimitive = PRIM_OUTSIDE_BEGIN_END;
          save->prim_store->prims[i].end = 0;
-         save->prim_store->prims[i].count = save->vert_count - save->prim_store->prims[i].start;
+         save->prim_store->prims[i].count = get_vertex_count(save) - save->prim_store->prims[i].start;
       }
 
       /* Make sure this vertex list gets replayed by the "loopback"
@@ -1991,7 +2011,7 @@ vbo_save_EndList(struct gl_context *ctx)
        * etc. received between here and the next begin will be compiled
        * as opcodes.
        */
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
 
    assert(save->vertex_size == 0);
@@ -2030,7 +2050,5 @@ vbo_save_api_init(struct vbo_save_context *save)
 {
    struct gl_context *ctx = gl_context_from_vbo_save(save);
 
-   vtxfmt_init(ctx);
    current_init(ctx);
-   _mesa_noop_vtxfmt_init(ctx, &save->vtxfmt_noop);
 }