gallium,util: Pull u_indices and u_primconvert back into gallium
[platform/upstream/mesa.git] / src / mesa / vbo / vbo_save_api.c
index ac02674..a41d518 100644 (file)
@@ -64,6 +64,39 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * This could be improved to fallback only when a mix of EvalCoord and
  * Vertex commands are issued within a single primitive.
+ *
+ * The compilation process works as follows. All vertex attributes
+ * except position are copied to vbo_save_context::attrptr (see ATTR_UNION).
+ * 'attrptr' are pointers to vbo_save_context::vertex ordered according to the enabled
+ * attributes (see upgrade_vertex).
+ * When the position attribute is received, all the attributes are then 
+ * copied to the vertex_store (see the end of ATTR_UNION).
+ * The vertex_store is simply an extensible float array.
+ * When the vertex list needs to be compiled (see compile_vertex_list),
+ * several transformations are performed:
+ *   - some primitives are merged together (eg: two consecutive GL_TRIANGLES
+ * with 3 vertices can be merged in a single GL_TRIANGLES with 6 vertices).
+ *   - an index buffer is built.
+ *   - identical vertices are detected and only one is kept.
+ * At the end of this transformation, the index buffer and the vertex buffer
+ * are uploaded in vRAM in the same buffer object.
+ * This buffer object is shared between multiple display lists to allow
+ * draw call merging later.
+ *
+ * The layout of this buffer for two display lists is:
+ *    V0A0|V0A1|V1A0|V1A1|P0I0|P0I1|V0A0V0A1V0A2|V1A0V1A1V1A2|...
+ *                                 ` new list starts
+ *        - VxAy: vertex x, attributes y
+ *        - PxIy: draw x, index y
+ *
+ * To allow draw call merging, display lists must use the same VAO, including
+ * the same Offset in the buffer object. To achieve this, the start values of
+ * the primitive are shifted and the indices adjusted (see offset_diff and
+ * start_offset in compile_vertex_list).
+ *
+ * Display lists using the loopback code (see vbo_save_playback_vertex_list_loopback)
+ * can't be drawn with an index buffer, so this transformation is disabled
+ * in this case.
  */
 
 
@@ -77,27 +110,25 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/draw_validate.h"
 #include "main/api_arrayelt.h"
-#include "main/vtxfmt.h"
 #include "main/dispatch.h"
 #include "main/state.h"
 #include "main/varray.h"
 #include "util/bitscan.h"
 #include "util/u_memory.h"
+#include "util/hash_table.h"
+#include "gallium/auxiliary/indices/u_indices.h"
+#include "util/u_prim.h"
 
-#include "vbo_noop.h"
-#include "vbo_private.h"
+#include "gallium/include/pipe/p_state.h"
 
+#include "vbo_private.h"
+#include "api_exec_decl.h"
+#include "api_save.h"
 
 #ifdef ERROR
 #undef ERROR
 #endif
 
-/**
- * Display list flag only used by this VBO code.
- */
-#define DLIST_DANGLING_REFS     0x1
-
-
 /* An interesting VBO number/name to help with debugging */
 #define VBO_BUF_ID  12345
 
@@ -120,138 +151,38 @@ copy_vertices(struct gl_context *ctx,
               const fi_type * src_buffer)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   struct _mesa_prim *prim = &node->prims[node->prim_count - 1];
+   struct _mesa_prim *prim = &node->cold->prims[node->cold->prim_count - 1];
    GLuint sz = save->vertex_size;
-   const fi_type *src = src_buffer + prim->start * sz;
-   fi_type *dst = save->copied.buffer;
 
-   if (prim->end)
+   if (prim->end || !prim->count || !sz)
       return 0;
 
-   return vbo_copy_vertices(ctx, prim->mode, prim, sz, true, dst, src);
-}
-
-
-static struct vbo_save_vertex_store *
-alloc_vertex_store(struct gl_context *ctx)
-{
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-   struct vbo_save_vertex_store *vertex_store =
-      CALLOC_STRUCT(vbo_save_vertex_store);
-
-   /* obj->Name needs to be non-zero, but won't ever be examined more
-    * closely than that.  In particular these buffers won't be entered
-    * into the hash and can never be confused with ones visible to the
-    * user.  Perhaps there could be a special number for internal
-    * buffers:
-    */
-   vertex_store->bufferobj = ctx->Driver.NewBufferObject(ctx, VBO_BUF_ID);
-   if (vertex_store->bufferobj) {
-      save->out_of_memory =
-         !ctx->Driver.BufferData(ctx,
-                                 GL_ARRAY_BUFFER_ARB,
-                                 VBO_SAVE_BUFFER_SIZE * sizeof(GLfloat),
-                                 NULL, GL_STATIC_DRAW_ARB,
-                                 GL_MAP_WRITE_BIT |
-                                 GL_DYNAMIC_STORAGE_BIT,
-                                 vertex_store->bufferobj);
-   }
-   else {
-      save->out_of_memory = GL_TRUE;
-   }
-
-   if (save->out_of_memory) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "internal VBO allocation");
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
-   }
-
-   vertex_store->buffer_map = NULL;
-   vertex_store->used = 0;
-
-   return vertex_store;
-}
-
-
-static void
-free_vertex_store(struct gl_context *ctx,
-                  struct vbo_save_vertex_store *vertex_store)
-{
-   assert(!vertex_store->buffer_map);
+   const fi_type *src = src_buffer + prim->start * sz;
+   assert(save->copied.buffer == NULL);
+   save->copied.buffer = malloc(sizeof(fi_type) * sz * prim->count);
 
-   if (vertex_store->bufferobj) {
-      _mesa_reference_buffer_object(ctx, &vertex_store->bufferobj, NULL);
+   unsigned r = vbo_copy_vertices(ctx, prim->mode, prim->start, &prim->count,
+                                  prim->begin, sz, true, save->copied.buffer, src);
+   if (!r) {
+      free(save->copied.buffer);
+      save->copied.buffer = NULL;
    }
-
-   free(vertex_store);
+   return r;
 }
 
 
-fi_type *
-vbo_save_map_vertex_store(struct gl_context *ctx,
-                          struct vbo_save_vertex_store *vertex_store)
-{
-   const GLbitfield access = (GL_MAP_WRITE_BIT |
-                              GL_MAP_INVALIDATE_RANGE_BIT |
-                              GL_MAP_UNSYNCHRONIZED_BIT |
-                              GL_MAP_FLUSH_EXPLICIT_BIT |
-                              MESA_MAP_ONCE);
-
-   assert(vertex_store->bufferobj);
-   assert(!vertex_store->buffer_map);  /* the buffer should not be mapped */
-
-   if (vertex_store->bufferobj->Size > 0) {
-      /* Map the remaining free space in the VBO */
-      GLintptr offset = vertex_store->used * sizeof(GLfloat);
-      GLsizeiptr size = vertex_store->bufferobj->Size - offset;
-      fi_type *range = (fi_type *)
-         ctx->Driver.MapBufferRange(ctx, offset, size, access,
-                                    vertex_store->bufferobj,
-                                    MAP_INTERNAL);
-      if (range) {
-         /* compute address of start of whole buffer (needed elsewhere) */
-         vertex_store->buffer_map = range - vertex_store->used;
-         assert(vertex_store->buffer_map);
-         return range;
-      }
-      else {
-         vertex_store->buffer_map = NULL;
-         return NULL;
-      }
-   }
-   else {
-      /* probably ran out of memory for buffers */
-      return NULL;
-   }
-}
-
-
-void
-vbo_save_unmap_vertex_store(struct gl_context *ctx,
-                            struct vbo_save_vertex_store *vertex_store)
+/**
+ * Grow (or create) a primitive store so that it can hold prim_count prims.
+ *
+ * A NULL store is allocated with CALLOC_STRUCT first. The prims array is
+ * resized with realloc() and only the newly added tail
+ * [old size, prim_count) is zero-filled. prim_count must be strictly larger
+ * than the current size (asserted). Returns the (possibly new) store.
+ *
+ * NOTE(review): neither the CALLOC_STRUCT result nor the realloc() result
+ * is checked before being dereferenced.
+ */
+static struct vbo_save_primitive_store *
+realloc_prim_store(struct vbo_save_primitive_store *store, int prim_count)
 {
-   if (vertex_store->bufferobj->Size > 0) {
-      GLintptr offset = 0;
-      GLsizeiptr length = vertex_store->used * sizeof(GLfloat)
-         - vertex_store->bufferobj->Mappings[MAP_INTERNAL].Offset;
-
-      /* Explicitly flush the region we wrote to */
-      ctx->Driver.FlushMappedBufferRange(ctx, offset, length,
-                                         vertex_store->bufferobj,
-                                         MAP_INTERNAL);
-
-      ctx->Driver.UnmapBuffer(ctx, vertex_store->bufferobj, MAP_INTERNAL);
-   }
-   vertex_store->buffer_map = NULL;
-}
+   if (store == NULL)
+      store = CALLOC_STRUCT(vbo_save_primitive_store);
 
+   /* Remember the previous capacity so that only the new tail is cleared. */
+   uint32_t old_size = store->size;
+   store->size = prim_count;
+   assert (old_size < store->size);
+   store->prims = realloc(store->prims, store->size * sizeof(struct _mesa_prim));
+   memset(&store->prims[old_size], 0, (store->size - old_size) * sizeof(struct _mesa_prim));
 
-static struct vbo_save_primitive_store *
-alloc_prim_store(void)
-{
-   struct vbo_save_primitive_store *store =
-      CALLOC_STRUCT(vbo_save_primitive_store);
-   store->used = 0;
-   store->refcount = 1;
    return store;
 }
 
@@ -261,20 +192,8 @@ reset_counters(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   save->prims = save->prim_store->prims + save->prim_store->used;
-   save->buffer_map = save->vertex_store->buffer_map + save->vertex_store->used;
-
-   assert(save->buffer_map == save->buffer_ptr);
-
-   if (save->vertex_size)
-      save->max_vert = (VBO_SAVE_BUFFER_SIZE - save->vertex_store->used) /
-                        save->vertex_size;
-   else
-      save->max_vert = 0;
-
-   save->vert_count = 0;
-   save->prim_count = 0;
-   save->prim_max = VBO_SAVE_PRIM_SIZE - save->prim_store->used;
+   save->vertex_store->used = 0;
+   save->prim_store->used = 0;
    save->dangling_attr_ref = GL_FALSE;
 }
 
@@ -292,9 +211,15 @@ merge_prims(struct gl_context *ctx, struct _mesa_prim *prim_list,
    for (i = 1; i < *prim_count; i++) {
       struct _mesa_prim *this_prim = prim_list + i;
 
-      vbo_try_prim_conversion(this_prim);
+      vbo_try_prim_conversion(&this_prim->mode, &this_prim->count);
 
-      if (vbo_merge_draws(ctx, true, prev_prim, this_prim)) {
+      if (vbo_merge_draws(ctx, true,
+                          prev_prim->mode, this_prim->mode,
+                          prev_prim->start, this_prim->start,
+                          &prev_prim->count, this_prim->count,
+                          prev_prim->basevertex, this_prim->basevertex,
+                          &prev_prim->end,
+                          this_prim->begin, this_prim->end)) {
          /* We've found a prim that just extend the previous one.  Tack it
           * onto the previous one, and let this primitive struct get dropped.
           */
@@ -322,7 +247,7 @@ static void
 convert_line_loop_to_strip(struct vbo_save_context *save,
                            struct vbo_save_vertex_list *node)
 {
-   struct _mesa_prim *prim = &node->prims[node->prim_count - 1];
+   struct _mesa_prim *prim = &node->cold->prims[node->cold->prim_count - 1];
 
    assert(prim->mode == GL_LINE_LOOP);
 
@@ -332,16 +257,14 @@ convert_line_loop_to_strip(struct vbo_save_context *save,
        */
       const GLuint sz = save->vertex_size;
       /* 0th vertex: */
-      const fi_type *src = save->buffer_map + prim->start * sz;
+      const fi_type *src = save->vertex_store->buffer_in_ram + prim->start * sz;
       /* end of buffer: */
-      fi_type *dst = save->buffer_map + (prim->start + prim->count) * sz;
+      fi_type *dst = save->vertex_store->buffer_in_ram + (prim->start + prim->count) * sz;
 
       memcpy(dst, src, sz * sizeof(float));
 
       prim->count++;
-      node->vertex_count++;
-      save->vert_count++;
-      save->buffer_ptr += sz;
+      node->cold->vertex_count++;
       save->vertex_store->used += sz;
    }
 
@@ -469,6 +392,117 @@ update_vao(struct gl_context *ctx,
    _mesa_set_vao_immutable(ctx, *vao);
 }
 
+static void wrap_filled_vertex(struct gl_context *ctx);
+
+/**
+ * Grow the CPU-side vertex storage to accommodate vertex_count new vertices.
+ *
+ * The required size is the amount already used plus vertex_count vertices of
+ * save->vertex_size floats each. If primitives have already been recorded
+ * and the request would exceed VBO_SAVE_BUFFER_SIZE, wrap_filled_vertex() is
+ * called first and the target size is clamped to VBO_SAVE_BUFFER_SIZE.
+ *
+ * On allocation failure save->out_of_memory is set, the old storage is
+ * released and buffer_in_ram / buffer_in_ram_size are reset to NULL / 0, so
+ * the recorded size never describes memory that does not exist.
+ *
+ * \param ctx           GL context owning the vbo_save_context.
+ * \param vertex_count  number of additional vertices to make room for.
+ */
+static void
+grow_vertex_storage(struct gl_context *ctx, int vertex_count)
+{
+   struct vbo_save_context *save = &vbo_context(ctx)->save;
+   assert (save->vertex_store);
+
+   int new_size = (save->vertex_store->used +
+                   vertex_count * save->vertex_size) * sizeof(GLfloat);
+
+   /* Limit how much memory we allocate. */
+   if (save->prim_store->used > 0 &&
+       vertex_count > 0 &&
+       new_size > VBO_SAVE_BUFFER_SIZE) {
+      wrap_filled_vertex(ctx);
+      new_size = VBO_SAVE_BUFFER_SIZE;
+   }
+
+   if (new_size > save->vertex_store->buffer_in_ram_size) {
+      /* Keep the old pointer until realloc() is known to have succeeded;
+       * assigning the result directly would leak the old block on failure.
+       */
+      void *grown = realloc(save->vertex_store->buffer_in_ram, new_size);
+      if (grown == NULL) {
+         free(save->vertex_store->buffer_in_ram);
+         save->vertex_store->buffer_in_ram = NULL;
+         save->vertex_store->buffer_in_ram_size = 0;
+         save->out_of_memory = true;
+         return;
+      }
+
+      save->vertex_store->buffer_in_ram = grown;
+      save->vertex_store->buffer_in_ram_size = new_size;
+   }
+}
+
+/* Hash-table key describing one vertex by content (see add_vertex). */
+struct vertex_key {
+   /* Number of float-sized components in the vertex; the key covers
+    * vertex_size * sizeof(float) bytes of vertex_attributes. */
+   unsigned vertex_size;
+   /* Start of the vertex's attribute data. The key only references this
+    * memory; it is not freed together with the key. */
+   fi_type *vertex_attributes;
+};
+
+/* Hash callback for vertex_key: hashes the raw bytes of the attribute data
+ * the key points at (vertex_size floats). */
+static uint32_t _hash_vertex_key(const void *key)
+{
+   const struct vertex_key *vk = key;
+   const unsigned component_count = vk->vertex_size;
+
+   assert(component_count);
+
+   return _mesa_hash_data(vk->vertex_attributes,
+                          component_count * sizeof(float));
+}
+
+/* Equality callback for vertex_key: two keys match when the attribute data
+ * they reference is bytewise identical. */
+static bool _compare_vertex_key(const void *key1, const void *key2)
+{
+   const struct vertex_key *lhs = key1;
+   const struct vertex_key *rhs = key2;
+
+   /* All the compared vertices are going to be drawn with the same VAO,
+    * so they share one layout and their attributes can be compared. */
+   assert (lhs->vertex_size == rhs->vertex_size);
+
+   if (memcmp(lhs->vertex_attributes,
+              rhs->vertex_attributes,
+              lhs->vertex_size * sizeof(float)) != 0)
+      return false;
+
+   return true;
+}
+
+/* Release the key of a hash-table entry (keys are malloc'ed by add_vertex).
+ * The entry's data is a plain integer stored in the pointer and needs no
+ * cleanup. */
+static void _free_entry(struct hash_entry *entry)
+{
+   void *owned_key = (void *)entry->key;
+
+   free(owned_key);
+}
+
+/**
+ * Add the vertex at 'index' to the deduplicated vertex buffer and return the
+ * index to emit for it.
+ *
+ * When hash_to_index is NULL, deduplication is disabled and 'index' is
+ * returned unchanged. Otherwise the vertex is looked up by content: a hit
+ * returns the index stored for the first identical vertex; a miss copies the
+ * vertex into new_buffer at the next free slot (the current number of table
+ * entries), records it in the table, raises *max_index if needed and returns
+ * that slot. 'new_buffer' is created at each list compilation, so slots
+ * start at 0.
+ *
+ * Table keys point into save->vertex_store->buffer_in_ram, so that buffer
+ * must not move while the table is alive.
+ *
+ * If the key cannot be allocated or inserted, save->out_of_memory is set and
+ * 0 is returned as a placeholder index.
+ */
+static uint32_t
+add_vertex(struct vbo_save_context *save, struct hash_table *hash_to_index,
+           uint32_t index, fi_type *new_buffer, uint32_t *max_index)
+{
+   /* If vertex deduplication is disabled return the original index. */
+   if (!hash_to_index)
+      return index;
+
+   fi_type *vert = save->vertex_store->buffer_in_ram + save->vertex_size * index;
+
+   /* Search with a stack key: a heap key is only needed when inserting. */
+   const struct vertex_key lookup = {
+      .vertex_size = save->vertex_size,
+      .vertex_attributes = vert,
+   };
+
+   struct hash_entry *entry = _mesa_hash_table_search(hash_to_index, &lookup);
+   if (entry) {
+      /* An identical vertex was already emitted, reuse its index. */
+      return (uintptr_t) entry->data;
+   }
+
+   /* This is a new vertex: its slot is the number of vertices stored so far. */
+   uint32_t n = _mesa_hash_table_num_entries(hash_to_index);
+
+   struct vertex_key *key = malloc(sizeof(*key));
+   if (!key) {
+      save->out_of_memory = true;
+      return 0;
+   }
+   *key = lookup;
+
+   if (!_mesa_hash_table_insert(hash_to_index, key, (void*)(uintptr_t)(n))) {
+      /* The table does not own the key when the insertion fails. */
+      free(key);
+      save->out_of_memory = true;
+      return 0;
+   }
+
+   *max_index = MAX2(n, *max_index);
+
+   memcpy(&new_buffer[save->vertex_size * n],
+          vert,
+          save->vertex_size * sizeof(fi_type));
+
+   return n;
+}
+
+
+/* Number of whole vertices currently held in the vertex store: the used
+ * float count divided by the per-vertex float count, or 0 when vertex_size
+ * is 0 (which also avoids dividing by zero). */
+static uint32_t
+get_vertex_count(struct vbo_save_context *save)
+{
+   return save->vertex_size ? save->vertex_store->used / save->vertex_size : 0;
+}
+
 
 /**
  * Insert the active immediate struct onto the display list currently
@@ -484,245 +518,473 @@ compile_vertex_list(struct gl_context *ctx)
     * being compiled.
     */
    node = (struct vbo_save_vertex_list *)
-      _mesa_dlist_alloc_aligned(ctx, save->opcode_vertex_list, sizeof(*node));
+      _mesa_dlist_alloc_vertex_list(ctx, !save->dangling_attr_ref && !save->no_current_update);
 
    if (!node)
       return;
 
+   node->cold = calloc(1, sizeof(*node->cold));
+
    /* Make sure the pointer is aligned to the size of a pointer */
    assert((GLintptr) node % sizeof(void *) == 0);
 
-   /* Duplicate our template, increment refcounts to the storage structs:
-    */
-   GLintptr old_offset = 0;
-   if (save->VAO[0]) {
-      old_offset = save->VAO[0]->BufferBinding[0].Offset
-         + save->VAO[0]->VertexAttrib[VERT_ATTRIB_POS].RelativeOffset;
-   }
    const GLsizei stride = save->vertex_size*sizeof(GLfloat);
-   GLintptr buffer_offset =
-       (save->buffer_map - save->vertex_store->buffer_map) * sizeof(GLfloat);
-   assert(old_offset <= buffer_offset);
-   const GLintptr offset_diff = buffer_offset - old_offset;
-   GLuint start_offset = 0;
-   if (offset_diff > 0 && stride > 0 && offset_diff % stride == 0) {
-      /* The vertex size is an exact multiple of the buffer offset.
-       * This means that we can use zero-based vertex attribute pointers
-       * and specify the start of the primitive with the _mesa_prim::start
-       * field.  This results in issuing several draw calls with identical
-       * vertex attribute information.  This can result in fewer state
-       * changes in drivers.  In particular, the Gallium CSO module will
-       * filter out redundant vertex buffer changes.
-       */
-      /* We cannot immediately update the primitives as some methods below
-       * still need the uncorrected start vertices
-       */
-      start_offset = offset_diff/stride;
-      assert(old_offset == buffer_offset - offset_diff);
-      buffer_offset = old_offset;
-   }
-   GLuint offsets[VBO_ATTRIB_MAX];
-   for (unsigned i = 0, offset = 0; i < VBO_ATTRIB_MAX; ++i) {
-      offsets[i] = offset;
-      offset += save->attrsz[i] * sizeof(GLfloat);
-   }
-   node->vertex_count = save->vert_count;
-   node->wrap_count = save->copied.nr;
-   node->prims = save->prims;
-   node->prim_count = save->prim_count;
-   node->prim_store = save->prim_store;
-
-   /* Create a pair of VAOs for the possible VERTEX_PROCESSING_MODEs
-    * Note that this may reuse the previous one of possible.
-    */
-   for (gl_vertex_processing_mode vpm = VP_MODE_FF; vpm < VP_MODE_MAX; ++vpm) {
-      /* create or reuse the vao */
-      update_vao(ctx, vpm, &save->VAO[vpm],
-                 save->vertex_store->bufferobj, buffer_offset, stride,
-                 save->enabled, save->attrsz, save->attrtype, offsets);
-      /* Reference the vao in the dlist */
-      node->VAO[vpm] = NULL;
-      _mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
-   }
 
-   node->prim_store->refcount++;
+   node->cold->vertex_count = get_vertex_count(save);
+   node->cold->wrap_count = save->copied.nr;
+   node->cold->prims = malloc(sizeof(struct _mesa_prim) * save->prim_store->used);
+   memcpy(node->cold->prims, save->prim_store->prims, sizeof(struct _mesa_prim) * save->prim_store->used);
+   node->cold->ib.obj = NULL;
+   node->cold->prim_count = save->prim_store->used;
 
    if (save->no_current_update) {
-      node->current_data = NULL;
+      node->cold->current_data = NULL;
    }
    else {
       GLuint current_size = save->vertex_size - save->attrsz[0];
-      node->current_data = NULL;
+      node->cold->current_data = NULL;
 
       if (current_size) {
-         node->current_data = malloc(current_size * sizeof(GLfloat));
-         if (node->current_data) {
-            const char *buffer = (const char *)save->buffer_map;
+         node->cold->current_data = malloc(current_size * sizeof(GLfloat));
+         if (node->cold->current_data) {
+            const char *buffer = (const char *)save->vertex_store->buffer_in_ram;
             unsigned attr_offset = save->attrsz[0] * sizeof(GLfloat);
             unsigned vertex_offset = 0;
 
-            if (node->vertex_count)
-               vertex_offset = (node->vertex_count - 1) * stride;
+            if (node->cold->vertex_count)
+               vertex_offset = (node->cold->vertex_count - 1) * stride;
 
-            memcpy(node->current_data, buffer + vertex_offset + attr_offset,
+            memcpy(node->cold->current_data, buffer + vertex_offset + attr_offset,
                    current_size * sizeof(GLfloat));
          } else {
             _mesa_error(ctx, GL_OUT_OF_MEMORY, "Current value allocation");
+            save->out_of_memory = true;
          }
       }
    }
 
-   assert(save->attrsz[VBO_ATTRIB_POS] != 0 || node->vertex_count == 0);
+   assert(save->attrsz[VBO_ATTRIB_POS] != 0 || node->cold->vertex_count == 0);
 
    if (save->dangling_attr_ref)
-      ctx->ListState.CurrentList->Flags |= DLIST_DANGLING_REFS;
-
-   save->vertex_store->used += save->vertex_size * node->vertex_count;
-   save->prim_store->used += node->prim_count;
+      ctx->ListState.Current.UseLoopback = true;
 
    /* Copy duplicated vertices
     */
-   save->copied.nr = copy_vertices(ctx, node, save->buffer_map);
+   save->copied.nr = copy_vertices(ctx, node, save->vertex_store->buffer_in_ram);
 
-   if (node->prims[node->prim_count - 1].mode == GL_LINE_LOOP) {
+   if (node->cold->prims[node->cold->prim_count - 1].mode == GL_LINE_LOOP) {
       convert_line_loop_to_strip(save, node);
    }
 
-   merge_prims(ctx, node->prims, &node->prim_count);
+   merge_prims(ctx, node->cold->prims, &node->cold->prim_count);
 
-   /* Correct the primitive starts, we can only do this here as copy_vertices
-    * and convert_line_loop_to_strip above consume the uncorrected starts.
-    * On the other hand the _vbo_loopback_vertex_list call below needs the
-    * primitves to be corrected already.
+   GLintptr buffer_offset = 0;
+   GLuint start_offset = 0;
+
+   /* Create an index buffer. */
+   node->cold->min_index = node->cold->max_index = 0;
+   if (node->cold->vertex_count == 0 || node->cold->prim_count == 0)
+      goto end;
+
+   /* We won't modify node->cold->prims, so use a const alias to avoid
+    * unintended writes to it. */
+   const struct _mesa_prim *original_prims = node->cold->prims;
+
+   int end = original_prims[node->cold->prim_count - 1].start +
+             original_prims[node->cold->prim_count - 1].count;
+   int total_vert_count = end - original_prims[0].start;
+
+   node->cold->min_index = node->cold->prims[0].start;
+   node->cold->max_index = end - 1;
+
+   /* converting primitive types may result in many more indices */
+   bool all_prims_supported = (ctx->Const.DriverSupportedPrimMask & BITFIELD_MASK(PIPE_PRIM_MAX)) == BITFIELD_MASK(PIPE_PRIM_MAX);
+   int max_index_count = total_vert_count * (all_prims_supported ? 2 : 3);
+   uint32_t* indices = (uint32_t*) malloc(max_index_count * sizeof(uint32_t));
+   void *tmp_indices = all_prims_supported ? NULL : malloc(max_index_count * sizeof(uint32_t));
+   struct _mesa_prim *merged_prims = NULL;
+
+   int idx = 0;
+   struct hash_table *vertex_to_index = NULL;
+   fi_type *temp_vertices_buffer = NULL;
+
+   /* The loopback replay code doesn't use the index buffer, so we can't
+    * dedup vertices in this case.
     */
-   for (unsigned i = 0; i < node->prim_count; i++) {
-      node->prims[i].start += start_offset;
+   if (!ctx->ListState.Current.UseLoopback) {
+      vertex_to_index = _mesa_hash_table_create(NULL, _hash_vertex_key, _compare_vertex_key);
+      temp_vertices_buffer = malloc(save->vertex_store->buffer_in_ram_size);
    }
 
-   /* Create an index buffer. */
-   node->min_index = node->max_index = 0;
-   if (save->vert_count) {
-      int end = node->prims[node->prim_count - 1].start +
-                node->prims[node->prim_count - 1].count;
-      int total_vert_count = end - node->prims[0].start;
-      int max_indices_count = total_vert_count * 2;
-      int size = max_indices_count * sizeof(uint32_t);
-      uint32_t* indices = (uint32_t*) malloc(size);
-      uint32_t max_index = 0, min_index = 0xFFFFFFFF;
-
-      int idx = 0;
-
-      /* Construct indices array. */
-      for (unsigned i = 0; i < node->prim_count; i++) {
-         assert(node->prims[i].basevertex == 0);
-         int vertex_count = node->prims[i].count;
-         int start = idx;
-
-         /* Convert line strips to lines if it'll allow if the previous
-          * prim mode is GL_LINES or if the next primitive mode is
-          * GL_LINES or GL_LINE_LOOP.
-          */
-         if (node->prims[i].mode == GL_LINE_STRIP &&
-             ((i > 0 && node->prims[i - 1].mode == GL_LINES) ||
-              (i < node->prim_count - 1 &&
-               (node->prims[i + 1].mode == GL_LINE_STRIP ||
-                node->prims[i + 1].mode == GL_LINES)))) {
-            for (unsigned j = 0; j < vertex_count; j++) {
-               indices[idx++] = node->prims[i].start + j;
-               /* Repeat all but the first/last indices. */
-               if (j && j != vertex_count - 1) {
-                  indices[idx++] = node->prims[i].start + j;
-                  node->prims[i].count++;
-               }
-            }
-            node->prims[i].mode = GL_LINES;
-         } else {
-            for (unsigned j = 0; j < vertex_count; j++) {
-               indices[idx++] = node->prims[i].start + j;
-            }
-         }
+   uint32_t max_index = 0;
 
-         min_index = MIN2(min_index, indices[start]);
-         max_index = MAX2(max_index, indices[idx - 1]);
+   int last_valid_prim = -1;
+   /* Construct indices array. */
+   for (unsigned i = 0; i < node->cold->prim_count; i++) {
+      assert(original_prims[i].basevertex == 0);
+      GLubyte mode = original_prims[i].mode;
+      bool converted_prim = false;
+      unsigned index_size;
 
-         node->prims[i].start = start;
+      int vertex_count = original_prims[i].count;
+      if (!vertex_count) {
+         continue;
       }
 
-      assert(idx <= max_indices_count);
+      /* Increase indices storage if the original estimation was too small. */
+      if (idx + 3 * vertex_count > max_index_count) {
+         max_index_count = max_index_count + 3 * vertex_count;
+         indices = (uint32_t*) realloc(indices, max_index_count * sizeof(uint32_t));
+         tmp_indices = all_prims_supported ? NULL : realloc(tmp_indices, max_index_count * sizeof(uint32_t));
+      }
 
-      node->ib.ptr = NULL;
-      node->ib.count = idx;
-      node->ib.index_size_shift = (GL_UNSIGNED_INT - GL_UNSIGNED_BYTE) >> 1;
+      /* Line strips may get converted to lines */
+      if (mode == GL_LINE_STRIP)
+         mode = GL_LINES;
+
+      if (!(ctx->Const.DriverSupportedPrimMask & BITFIELD_BIT(mode))) {
+         unsigned new_count;
+         u_generate_func trans_func;
+         enum pipe_prim_type pmode = (enum pipe_prim_type)mode;
+         u_index_generator(ctx->Const.DriverSupportedPrimMask,
+                           pmode, original_prims[i].start, vertex_count,
+                           PV_LAST, PV_LAST,
+                           &pmode, &index_size, &new_count,
+                           &trans_func);
+         if (new_count > 0)
+            trans_func(original_prims[i].start, new_count, tmp_indices);
+         vertex_count = new_count;
+         mode = (GLubyte)pmode;
+         converted_prim = true;
+      }
 
-      node->min_index = min_index;
-      node->max_index = max_index;
+      /* If 2 consecutive prims use the same mode => merge them. */
+      bool merge_prims = last_valid_prim >= 0 &&
+                         mode == merged_prims[last_valid_prim].mode &&
+                         mode != GL_LINE_LOOP && mode != GL_TRIANGLE_FAN &&
+                         mode != GL_QUAD_STRIP && mode != GL_POLYGON &&
+                         mode != GL_PATCHES;
+
+/* index generation uses uint16_t if the index count is small enough */
+#define CAST_INDEX(BASE, SIZE, IDX) ((SIZE == 2 ? (uint32_t)(((uint16_t*)BASE)[IDX]) : ((uint32_t*)BASE)[IDX]))
+      /* To be able to merge consecutive triangle strips we need to insert
+       * a degenerate triangle.
+       */
+      if (merge_prims &&
+          mode == GL_TRIANGLE_STRIP) {
+         /* Insert a degenerate triangle */
+         assert(merged_prims[last_valid_prim].mode == GL_TRIANGLE_STRIP);
+         unsigned tri_count = merged_prims[last_valid_prim].count - 2;
+
+         indices[idx] = indices[idx - 1];
+         indices[idx + 1] = add_vertex(save, vertex_to_index,
+                                       converted_prim ? CAST_INDEX(tmp_indices, index_size, 0) : original_prims[i].start,
+                                       temp_vertices_buffer, &max_index);
+         idx += 2;
+         merged_prims[last_valid_prim].count += 2;
+
+         if (tri_count % 2) {
+            /* Add another index to preserve winding order */
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, 0) : original_prims[i].start,
+                                        temp_vertices_buffer, &max_index);
+            merged_prims[last_valid_prim].count++;
+         }
+      }
+
+      int start = idx;
+
+      /* Convert line strips to lines when that allows merging: either the
+       * previous prim mode is GL_LINES (so merge_prims is true) or the next
+       * primitive mode is GL_LINE_STRIP or GL_LINES.
+       */
+      if (original_prims[i].mode == GL_LINE_STRIP &&
+          (merge_prims ||
+           (i < node->cold->prim_count - 1 &&
+            (original_prims[i + 1].mode == GL_LINE_STRIP ||
+             original_prims[i + 1].mode == GL_LINES)))) {
+         for (unsigned j = 0; j < vertex_count; j++) {
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
+                                        temp_vertices_buffer, &max_index);
+            /* Repeat all but the first/last indices. */
+            if (j && j != vertex_count - 1) {
+               indices[idx++] = add_vertex(save, vertex_to_index,
+                                           converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
+                                           temp_vertices_buffer, &max_index);
+            }
+         }
+      } else {
+         /* We didn't convert to LINES, so restore the original mode */
+         if (!converted_prim)
+            mode = original_prims[i].mode;
+
+         for (unsigned j = 0; j < vertex_count; j++) {
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                        converted_prim ? CAST_INDEX(tmp_indices, index_size, j) : original_prims[i].start + j,
+                                        temp_vertices_buffer, &max_index);
+         }
+      }
 
-      node->ib.obj = ctx->Driver.NewBufferObject(ctx, VBO_BUF_ID + 1);
-      bool success = ctx->Driver.BufferData(ctx,
-                                            GL_ELEMENT_ARRAY_BUFFER_ARB,
-                                            idx * sizeof(uint32_t), indices,
-                                            GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT,
-                                            node->ib.obj);
-      assert(success);
+      /* Duplicate the last vertex for incomplete primitives */
+      if (vertex_count > 0) {
+         unsigned min_vert = u_prim_vertex_count(mode)->min;
+         for (unsigned j = vertex_count; j < min_vert; j++) {
+            indices[idx++] = add_vertex(save, vertex_to_index,
+                                       converted_prim ? CAST_INDEX(tmp_indices, index_size, vertex_count - 1) :
+                                                         original_prims[i].start + vertex_count - 1,
+                                       temp_vertices_buffer, &max_index);
+         }
+      }
+
+#undef CAST_INDEX
+      if (merge_prims) {
+         /* Update vertex count. */
+         merged_prims[last_valid_prim].count += idx - start;
+      } else {
+         /* Keep this primitive */
+         last_valid_prim += 1;
+         assert(last_valid_prim <= i);
+         merged_prims = realloc(merged_prims, (1 + last_valid_prim) * sizeof(struct _mesa_prim));
+         merged_prims[last_valid_prim] = original_prims[i];
+         merged_prims[last_valid_prim].start = start;
+         merged_prims[last_valid_prim].count = idx - start;
+      }
+      merged_prims[last_valid_prim].mode = mode;
+
+      /* converted prims will filter incomplete primitives and may have no indices */
+      assert((idx > 0 || converted_prim) && idx <= max_index_count);
+   }
+
+   unsigned merged_prim_count = last_valid_prim + 1;
+   node->cold->ib.ptr = NULL;
+   node->cold->ib.count = idx;
+   node->cold->ib.index_size_shift = (GL_UNSIGNED_INT - GL_UNSIGNED_BYTE) >> 1;
+
+   /* How many bytes do we need to store the indices and the vertices */
+   total_vert_count = vertex_to_index ? (max_index + 1) : idx;
+   unsigned total_bytes_needed = idx * sizeof(uint32_t) +
+                                 total_vert_count * save->vertex_size * sizeof(fi_type);
+
+   const GLintptr old_offset = save->VAO[0] ?
+      save->VAO[0]->BufferBinding[0].Offset + save->VAO[0]->VertexAttrib[VERT_ATTRIB_POS].RelativeOffset : 0;
+   if (old_offset != save->current_bo_bytes_used && stride > 0) {
+      GLintptr offset_diff = save->current_bo_bytes_used - old_offset;
+      while (offset_diff > 0 &&
+             save->current_bo_bytes_used < save->current_bo->Size &&
+             offset_diff % stride != 0) {
+         save->current_bo_bytes_used++;
+         offset_diff = save->current_bo_bytes_used - old_offset;
+      }
+   }
+   buffer_offset = save->current_bo_bytes_used;
+
+   /* Can we reuse the previous bo or should we allocate a new one? */
+   int available_bytes = save->current_bo ? save->current_bo->Size - save->current_bo_bytes_used : 0;
+   if (total_bytes_needed > available_bytes) {
+      if (save->current_bo)
+         _mesa_reference_buffer_object(ctx, &save->current_bo, NULL);
+      save->current_bo = _mesa_bufferobj_alloc(ctx, VBO_BUF_ID + 1);
+      bool success = _mesa_bufferobj_data(ctx,
+                                          GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                          MAX2(total_bytes_needed, VBO_SAVE_BUFFER_SIZE),
+                                          NULL,
+                                          GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT |
+                                          MESA_GALLIUM_VERTEX_STATE_STORAGE,
+                                          save->current_bo);
       if (!success) {
-         node->min_index = node->max_index = 0;
-         ctx->Driver.DeleteBuffer(ctx, node->ib.obj);
-         node->ib.obj = NULL;
-         node->vertex_count = 0;
+         _mesa_reference_buffer_object(ctx, &save->current_bo, NULL);
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "IB allocation");
+         save->out_of_memory = true;
+      } else {
+         save->current_bo_bytes_used = 0;
+         available_bytes = save->current_bo->Size;
       }
-      free(indices);
+      buffer_offset = 0;
    } else {
-      node->ib.obj = NULL;
+      assert(old_offset <= buffer_offset);
+      const GLintptr offset_diff = buffer_offset - old_offset;
+      if (offset_diff > 0 && stride > 0 && offset_diff % stride == 0) {
+         /* The buffer offset difference is an exact multiple of the vertex stride.
+          * This means that we can use zero-based vertex attribute pointers
+          * and specify the start of the primitive with the _mesa_prim::start
+          * field.  This results in issuing several draw calls with identical
+          * vertex attribute information.  This can result in fewer state
+          * changes in drivers.  In particular, the Gallium CSO module will
+          * filter out redundant vertex buffer changes.
+          */
+         /* We cannot immediately update the primitives as some methods below
+          * still need the uncorrected start vertices
+          */
+         start_offset = offset_diff/stride;
+         assert(old_offset == buffer_offset - offset_diff);
+         buffer_offset = old_offset;
+      }
+
+      /* Correct the primitive starts, we can only do this here as copy_vertices
+       * and convert_line_loop_to_strip above consume the uncorrected starts.
+       * On the other hand the _vbo_loopback_vertex_list call below needs the
+       * primitives to be corrected already.
+       */
+      for (unsigned i = 0; i < node->cold->prim_count; i++) {
+         node->cold->prims[i].start += start_offset;
+      }
+      /* start_offset shifts vertices (so v[0] becomes v[start_offset]), so we have
+       * to apply this transformation to all indices and max_index.
+       */
+      for (unsigned i = 0; i < idx; i++)
+         indices[i] += start_offset;
+      max_index += start_offset;
+   }
+
+   _mesa_reference_buffer_object(ctx, &node->cold->ib.obj, save->current_bo);
+
+   /* Upload the vertices first (see buffer_offset) */
+   _mesa_bufferobj_subdata(ctx,
+                           save->current_bo_bytes_used,
+                           total_vert_count * save->vertex_size * sizeof(fi_type),
+                           vertex_to_index ? temp_vertices_buffer : save->vertex_store->buffer_in_ram,
+                           node->cold->ib.obj);
+   save->current_bo_bytes_used += total_vert_count * save->vertex_size * sizeof(fi_type);
+   node->cold->bo_bytes_used = save->current_bo_bytes_used;
+
+  if (vertex_to_index) {
+      _mesa_hash_table_destroy(vertex_to_index, _free_entry);
+      free(temp_vertices_buffer);
+   }
+
+   /* Since we append the indices to an existing buffer, we need to adjust the start value of each
+    * primitive (not the indices themselves). */
+   if (!ctx->ListState.Current.UseLoopback) {
+      save->current_bo_bytes_used += align(save->current_bo_bytes_used, 4) - save->current_bo_bytes_used;
+      int indices_offset = save->current_bo_bytes_used / 4;
+      for (int i = 0; i < merged_prim_count; i++) {
+         merged_prims[i].start += indices_offset;
+      }
    }
 
-   /* Deal with GL_COMPILE_AND_EXECUTE:
-    */
-   if (ctx->ExecuteFlag) {
-      struct _glapi_table *dispatch = GET_DISPATCH();
+   /* Then upload the indices. */
+   if (node->cold->ib.obj) {
+      _mesa_bufferobj_subdata(ctx,
+                              save->current_bo_bytes_used,
+                              idx * sizeof(uint32_t),
+                              indices,
+                              node->cold->ib.obj);
+      save->current_bo_bytes_used += idx * sizeof(uint32_t);
+   } else {
+      node->cold->vertex_count = 0;
+      node->cold->prim_count = 0;
+   }
+
+   /* Prepare for DrawGallium */
+   memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
+   /* The other info fields will be updated in vbo_save_playback_vertex_list */
+   node->cold->info.index_size = 4;
+   node->cold->info.instance_count = 1;
+   node->cold->info.index.gl_bo = node->cold->ib.obj;
+   if (merged_prim_count == 1) {
+      node->cold->info.mode = merged_prims[0].mode;
+      node->start_count.start = merged_prims[0].start;
+      node->start_count.count = merged_prims[0].count;
+      node->start_count.index_bias = 0;
+      node->modes = NULL;
+   } else {
+      node->modes = malloc(merged_prim_count * sizeof(unsigned char));
+      node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
+      for (unsigned i = 0; i < merged_prim_count; i++) {
+         node->start_counts[i].start = merged_prims[i].start;
+         node->start_counts[i].count = merged_prims[i].count;
+         node->start_counts[i].index_bias = 0;
+         node->modes[i] = merged_prims[i].mode;
+      }
+   }
+   node->num_draws = merged_prim_count;
+   if (node->num_draws > 1) {
+      bool same_mode = true;
+      for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
+         same_mode = node->modes[i] == node->modes[0];
+      }
+      if (same_mode) {
+         /* All primitives use the same mode, so we can simplify a bit */
+         node->cold->info.mode = node->modes[0];
+         free(node->modes);
+         node->modes = NULL;
+      }
+   }
 
-      _glapi_set_dispatch(ctx->Exec);
+   free(indices);
+   free(tmp_indices);
+   free(merged_prims);
 
-      /* Note that the range of referenced vertices must be mapped already */
-      _vbo_loopback_vertex_list(ctx, node);
+end:
+   node->draw_begins = node->cold->prims[0].begin;
 
-      _glapi_set_dispatch(dispatch);
+   if (!save->current_bo) {
+      save->current_bo = _mesa_bufferobj_alloc(ctx, VBO_BUF_ID + 1);
+      bool success = _mesa_bufferobj_data(ctx,
+                                          GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                          VBO_SAVE_BUFFER_SIZE,
+                                          NULL,
+                                          GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT |
+                                          MESA_GALLIUM_VERTEX_STATE_STORAGE,
+                                          save->current_bo);
+      if (!success)
+         save->out_of_memory = true;
    }
 
-   /* Decide whether the storage structs are full, or can be used for
-    * the next vertex lists as well.
+   GLuint offsets[VBO_ATTRIB_MAX];
+   for (unsigned i = 0, offset = 0; i < VBO_ATTRIB_MAX; ++i) {
+      offsets[i] = offset;
+      offset += save->attrsz[i] * sizeof(GLfloat);
+   }
+   /* Create a pair of VAOs for the possible VERTEX_PROCESSING_MODEs
+    * Note that this may reuse the previous one if possible.
     */
-   if (save->vertex_store->used >
-       VBO_SAVE_BUFFER_SIZE - 16 * (save->vertex_size + 4)) {
-
-      /* Unmap old store:
-       */
-      vbo_save_unmap_vertex_store(ctx, save->vertex_store);
-
-      /* Release old reference:
-       */
-      free_vertex_store(ctx, save->vertex_store);
-      save->vertex_store = NULL;
-      /* When we have a new vbo, we will for sure need a new vao */
-      for (gl_vertex_processing_mode vpm = 0; vpm < VP_MODE_MAX; ++vpm)
-         _mesa_reference_vao(ctx, &save->VAO[vpm], NULL);
+   for (gl_vertex_processing_mode vpm = VP_MODE_FF; vpm < VP_MODE_MAX; ++vpm) {
+      /* create or reuse the vao */
+      update_vao(ctx, vpm, &save->VAO[vpm],
+                 save->current_bo, buffer_offset, stride,
+                 save->enabled, save->attrsz, save->attrtype, offsets);
+      /* Reference the vao in the dlist */
+      node->cold->VAO[vpm] = NULL;
+      _mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
+   }
+
+   /* Prepare for DrawGalliumVertexState */
+   if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
+      for (unsigned i = 0; i < VP_MODE_MAX; i++) {
+         uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
+                                    node->cold->VAO[i]->_EnabledWithMapMode;
+
+         node->state[i] =
+            ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
+                                                 node->cold->ib.obj,
+                                                 enabled_attribs);
+         node->private_refcount[i] = 0;
+         node->enabled_attribs[i] = enabled_attribs;
+      }
 
-      /* Allocate and map new store:
-       */
-      save->vertex_store = alloc_vertex_store(ctx);
-      save->buffer_ptr = vbo_save_map_vertex_store(ctx, save->vertex_store);
-      save->out_of_memory = save->buffer_ptr == NULL;
-   }
-   else {
-      /* update buffer_ptr for next vertex */
-      save->buffer_ptr = save->vertex_store->buffer_map
-         + save->vertex_store->used;
+      node->ctx = ctx;
+      node->mode = node->cold->info.mode;
+      assert(node->cold->info.index_size == 4);
    }
 
-   if (save->prim_store->used > VBO_SAVE_PRIM_SIZE - 6) {
-      save->prim_store->refcount--;
-      assert(save->prim_store->refcount != 0);
-      save->prim_store = alloc_prim_store();
+   /* Deal with GL_COMPILE_AND_EXECUTE:
+    */
+   if (ctx->ExecuteFlag) {
+      /* _vbo_loopback_vertex_list doesn't use the index buffer, so we have to
+       * use buffer_in_ram (which contains all vertices) instead of current_bo
+       * (which contains deduplicated vertices *when* UseLoopback is false).
+       *
+       * The problem is that the VAO offset is based on current_bo's layout,
+       * so we have to use a temp value.
+       */
+      struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
+      GLintptr original = vao->BufferBinding[0].Offset;
+      /* 'start_offset' has been added to all primitives 'start', so undo it here. */
+      vao->BufferBinding[0].Offset = -(GLintptr)(start_offset * stride);
+      _vbo_loopback_vertex_list(ctx, node, save->vertex_store->buffer_in_ram);
+      vao->BufferBinding[0].Offset = original;
    }
 
    /* Reset our structures for the next run of vertices:
@@ -740,16 +1002,16 @@ static void
 wrap_buffers(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   GLint i = save->prim_count - 1;
+   GLint i = save->prim_store->used - 1;
    GLenum mode;
 
-   assert(i < (GLint) save->prim_max);
+   assert(i < (GLint) save->prim_store->size);
    assert(i >= 0);
 
    /* Close off in-progress primitive.
     */
-   save->prims[i].count = (save->vert_count - save->prims[i].start);
-   mode = save->prims[i].mode;
+   save->prim_store->prims[i].count = (get_vertex_count(save) - save->prim_store->prims[i].start);
+   mode = save->prim_store->prims[i].mode;
 
    /* store the copied vertices, and allocate a new list.
     */
@@ -757,12 +1019,12 @@ wrap_buffers(struct gl_context *ctx)
 
    /* Restart interrupted primitive
     */
-   save->prims[0].mode = mode;
-   save->prims[0].begin = 0;
-   save->prims[0].end = 0;
-   save->prims[0].start = 0;
-   save->prims[0].count = 0;
-   save->prim_count = 1;
+   save->prim_store->prims[0].mode = mode;
+   save->prim_store->prims[0].begin = 0;
+   save->prim_store->prims[0].end = 0;
+   save->prim_store->prims[0].start = 0;
+   save->prim_store->prims[0].count = 0;
+   save->prim_store->used = 1;
 }
 
 
@@ -780,16 +1042,22 @@ wrap_filled_vertex(struct gl_context *ctx)
     */
    wrap_buffers(ctx);
 
+   assert(save->vertex_store->used == 0 && save->vertex_store->used == 0);
+
    /* Copy stored stored vertices to start of new list.
     */
-   assert(save->max_vert - save->vert_count > save->copied.nr);
-
    numComponents = save->copied.nr * save->vertex_size;
-   memcpy(save->buffer_ptr,
-          save->copied.buffer,
-          numComponents * sizeof(fi_type));
-   save->buffer_ptr += numComponents;
-   save->vert_count += save->copied.nr;
+
+   fi_type *buffer_ptr = save->vertex_store->buffer_in_ram;
+   if (numComponents) {
+      assert(save->copied.buffer);
+      memcpy(buffer_ptr,
+             save->copied.buffer,
+             numComponents * sizeof(fi_type));
+      free(save->copied.buffer);
+      save->copied.buffer = NULL;
+   }
+   save->vertex_store->used = numComponents;
 }
 
 
@@ -859,7 +1127,7 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
    /* Store the current run of vertices, and emit a GL_END.  Emit a
     * BEGIN in the new buffer.
     */
-   if (save->vert_count)
+   if (save->vertex_store->used)
       wrap_buffers(ctx);
    else
       assert(save->copied.nr == 0);
@@ -877,9 +1145,6 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
    save->enabled |= BITFIELD64_BIT(attr);
 
    save->vertex_size += newsz - oldsz;
-   save->max_vert = ((VBO_SAVE_BUFFER_SIZE - save->vertex_store->used) /
-                     save->vertex_size);
-   save->vert_count = 0;
 
    /* Recalculate all the attrptr[] values:
     */
@@ -905,8 +1170,10 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
     * and will need fixup at runtime.
     */
    if (save->copied.nr) {
+      assert(save->copied.buffer);
       const fi_type *data = save->copied.buffer;
-      fi_type *dest = save->buffer_map;
+      grow_vertex_storage(ctx, save->copied.nr);
+      fi_type *dest = save->vertex_store->buffer_in_ram;
 
       /* Need to note this and fix up at runtime (or loopback):
        */
@@ -921,28 +1188,43 @@ upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
             const int j = u_bit_scan64(&enabled);
             assert(save->attrsz[j]);
             if (j == attr) {
-               if (oldsz) {
-                  COPY_CLEAN_4V_TYPE_AS_UNION(dest, oldsz, data,
-                                              save->attrtype[j]);
-                  data += oldsz;
-                  dest += newsz;
-               }
-               else {
-                  COPY_SZ_4V(dest, newsz, save->current[attr]);
-                  dest += newsz;
+               int k;
+               const fi_type *src = oldsz ? data : save->current[attr];
+               int copy = oldsz ? oldsz : newsz;
+               for (k = 0; k < copy; k++)
+                  dest[k] = src[k];
+               for (; k < newsz; k++) {
+                  switch (save->attrtype[j]) {
+                     case GL_FLOAT:
+                        dest[k] = FLOAT_AS_UNION(k == 3);
+                        break;
+                     case GL_INT:
+                        dest[k] = INT_AS_UNION(k == 3);
+                        break;
+                     case GL_UNSIGNED_INT:
+                        dest[k] = UINT_AS_UNION(k == 3);
+                        break;
+                     default:
+                        dest[k] = FLOAT_AS_UNION(k == 3);
+                        assert(!"Unexpected type in upgrade_vertex");
+                        break;
+                  }
                }
-            }
-            else {
+               dest += newsz;
+               data += oldsz;
+            } else {
                GLint sz = save->attrsz[j];
-               COPY_SZ_4V(dest, sz, data);
+               for (int k = 0; k < sz; k++)
+                  dest[k] = data[k];
                data += sz;
                dest += sz;
             }
          }
       }
 
-      save->buffer_ptr = dest;
-      save->vert_count += save->copied.nr;
+      save->vertex_store->used += save->vertex_size * save->copied.nr;
+      free(save->copied.buffer);
+      save->copied.buffer = NULL;
    }
 }
 
@@ -977,6 +1259,8 @@ fixup_vertex(struct gl_context *ctx, GLuint attr,
    }
 
    save->active_sz[attr] = sz;
+
+   grow_vertex_storage(ctx, 1);
 }
 
 
@@ -1025,34 +1309,39 @@ is_vertex_position(const struct gl_context *ctx, GLuint index)
  * 3f version won't otherwise set color[3] to 1.0 -- this is the job
  * of the chooser function when switching between Color4f and Color3f.
  */
-#define ATTR_UNION(A, N, T, C, V0, V1, V2, V3)                 \
-do {                                                           \
-   struct vbo_save_context *save = &vbo_context(ctx)->save;    \
-   int sz = (sizeof(C) / sizeof(GLfloat));                     \
-                                                               \
-   if (save->active_sz[A] != N)                                        \
-      fixup_vertex(ctx, A, N * sz, T);                         \
-                                                               \
-   {                                                           \
+#define ATTR_UNION(A, N, T, C, V0, V1, V2, V3)                  \
+do {                                                            \
+   struct vbo_save_context *save = &vbo_context(ctx)->save;     \
+   int sz = (sizeof(C) / sizeof(GLfloat));                      \
+                                                                \
+   if (save->active_sz[A] != N)                                 \
+      fixup_vertex(ctx, A, N * sz, T);                          \
+                                                                \
+   {                                                            \
       C *dest = (C *)save->attrptr[A];                          \
-      if (N>0) dest[0] = V0;                                   \
-      if (N>1) dest[1] = V1;                                   \
-      if (N>2) dest[2] = V2;                                   \
-      if (N>3) dest[3] = V3;                                   \
-      save->attrtype[A] = T;                                   \
-   }                                                           \
-                                                               \
-   if ((A) == 0) {                                             \
-      GLuint i;                                                        \
-                                                               \
-      for (i = 0; i < save->vertex_size; i++)                  \
-        save->buffer_ptr[i] = save->vertex[i];                 \
-                                                               \
-      save->buffer_ptr += save->vertex_size;                   \
-                                                               \
-      if (++save->vert_count >= save->max_vert)                        \
-        wrap_filled_vertex(ctx);                               \
-   }                                                           \
+      if (N>0) dest[0] = V0;                                    \
+      if (N>1) dest[1] = V1;                                    \
+      if (N>2) dest[2] = V2;                                    \
+      if (N>3) dest[3] = V3;                                    \
+      save->attrtype[A] = T;                                    \
+   }                                                            \
+                                                                \
+   if ((A) == VBO_ATTRIB_POS) {                                 \
+      fi_type *buffer_ptr = save->vertex_store->buffer_in_ram + \
+                            save->vertex_store->used;           \
+                                                                \
+      for (int i = 0; i < save->vertex_size; i++)               \
+        buffer_ptr[i] = save->vertex[i];                        \
+                                                                \
+      save->vertex_store->used += save->vertex_size;            \
+      unsigned used_next = (save->vertex_store->used +          \
+                            save->vertex_size) * sizeof(float); \
+      if (used_next > save->vertex_store->buffer_in_ram_size) { \
+         grow_vertex_storage(ctx, get_vertex_count(save));      \
+         assert(used_next <=                                    \
+                save->vertex_store->buffer_in_ram_size);        \
+      }                                                         \
+   }                                                            \
 } while (0)
 
 #define TAG(x) _save_##x
@@ -1060,13 +1349,12 @@ do {                                                            \
 #include "vbo_attrib_tmp.h"
 
 
-
-#define MAT( ATTR, N, face, params )                   \
-do {                                                   \
-   if (face != GL_BACK)                                        \
-      MAT_ATTR( ATTR, N, params ); /* front */         \
-   if (face != GL_FRONT)                               \
-      MAT_ATTR( ATTR + 1, N, params ); /* back */      \
+#define MAT( ATTR, N, face, params )                            \
+do {                                                            \
+   if (face != GL_BACK)                                         \
+      MAT_ATTR( ATTR, N, params ); /* front */                  \
+   if (face != GL_FRONT)                                        \
+      MAT_ATTR( ATTR + 1, N, params ); /* back */               \
 } while (0)
 
 
@@ -1119,6 +1407,10 @@ _save_Materialfv(GLenum face, GLenum pname, const GLfloat *params)
 }
 
 
+static void
+vbo_init_dispatch_save_begin_end(struct gl_context *ctx);
+
+
 /* Cope with EvalCoord/CallList called within a begin/end object:
  *     -- Flush current buffer
  *     -- Fallback to opcodes for the rest of the begin/end object.
@@ -1128,11 +1420,14 @@ dlist_fallback(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   if (save->vert_count || save->prim_count) {
-      if (save->prim_count > 0) {
+   if (save->vertex_store->used || save->prim_store->used) {
+      if (save->prim_store->used > 0 && save->vertex_store->used > 0) {
+         assert(save->vertex_size);
          /* Close off in-progress primitive. */
-         GLint i = save->prim_count - 1;
-         save->prims[i].count = save->vert_count - save->prims[i].start;
+         GLint i = save->prim_store->used - 1;
+         save->prim_store->prims[i].count =
+            get_vertex_count(save) -
+            save->prim_store->prims[i].start;
       }
 
       /* Need to replay this display list with loopback,
@@ -1146,12 +1441,11 @@ dlist_fallback(struct gl_context *ctx)
 
    copy_to_current(ctx);
    reset_vertex(ctx);
-   reset_counters(ctx);
    if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
+      vbo_install_save_vtxfmt_noop(ctx);
    }
    else {
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
@@ -1232,25 +1526,22 @@ vbo_save_NotifyBegin(struct gl_context *ctx, GLenum mode,
                      bool no_current_update)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   const GLuint i = save->prim_count++;
+   const GLuint i = save->prim_store->used++;
 
    ctx->Driver.CurrentSavePrimitive = mode;
 
-   assert(i < save->prim_max);
-   save->prims[i].mode = mode & VBO_SAVE_PRIM_MODE_MASK;
-   save->prims[i].begin = 1;
-   save->prims[i].end = 0;
-   save->prims[i].start = save->vert_count;
-   save->prims[i].count = 0;
+   if (!save->prim_store || i >= save->prim_store->size) {
+      save->prim_store = realloc_prim_store(save->prim_store, i * 2);
+   }
+   save->prim_store->prims[i].mode = mode & VBO_SAVE_PRIM_MODE_MASK;
+   save->prim_store->prims[i].begin = 1;
+   save->prim_store->prims[i].end = 0;
+   save->prim_store->prims[i].start = get_vertex_count(save);
+   save->prim_store->prims[i].count = 0;
 
    save->no_current_update = no_current_update;
 
-   if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
-   }
-   else {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt);
-   }
+   vbo_init_dispatch_save_begin_end(ctx);
 
    /* We need to call vbo_save_SaveFlushVertices() if there's state change */
    ctx->Driver.SaveNeedFlush = GL_TRUE;
@@ -1262,26 +1553,21 @@ _save_End(void)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   const GLint i = save->prim_count - 1;
+   const GLint i = save->prim_store->used - 1;
 
    ctx->Driver.CurrentSavePrimitive = PRIM_OUTSIDE_BEGIN_END;
-   save->prims[i].end = 1;
-   save->prims[i].count = (save->vert_count - save->prims[i].start);
-
-   if (i == (GLint) save->prim_max - 1) {
-      compile_vertex_list(ctx);
-      assert(save->copied.nr == 0);
-   }
+   save->prim_store->prims[i].end = 1;
+   save->prim_store->prims[i].count = (get_vertex_count(save) - save->prim_store->prims[i].start);
 
    /* Swap out this vertex format while outside begin/end.  Any color,
     * etc. received between here and the next begin will be compiled
     * as opcodes.
     */
    if (save->out_of_memory) {
-      _mesa_install_save_vtxfmt(ctx, &save->vtxfmt_noop);
+      vbo_install_save_vtxfmt_noop(ctx);
    }
    else {
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
 }
 
@@ -1301,7 +1587,7 @@ _save_PrimitiveRestartNV(void)
    GET_CURRENT_CONTEXT(ctx);
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   if (save->prim_count == 0) {
+   if (save->prim_store->used == 0) {
       /* We're not inside a glBegin/End pair, so calling glPrimitiverRestartNV
        * is an error.
        */
@@ -1309,7 +1595,7 @@ _save_PrimitiveRestartNV(void)
                           "glPrimitiveRestartNV called outside glBegin/End");
    } else {
       /* get current primitive mode */
-      GLenum curPrim = save->prims[save->prim_count - 1].mode;
+      GLenum curPrim = save->prim_store->prims[save->prim_store->used - 1].mode;
       bool no_current_update = save->no_current_update;
 
       /* restart primitive */
@@ -1319,13 +1605,8 @@ _save_PrimitiveRestartNV(void)
 }
 
 
-/* Unlike the functions above, these are to be hooked into the vtxfmt
- * maintained in ctx->ListState, active when the list is known or
- * suspected to be outside any begin/end primitive.
- * Note: OBE = Outside Begin/End
- */
-static void GLAPIENTRY
-_save_OBE_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
+void GLAPIENTRY
+save_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
@@ -1339,51 +1620,44 @@ _save_OBE_Rectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2)
 }
 
 
-static void GLAPIENTRY
-_save_OBE_Rectd(GLdouble x1, GLdouble y1, GLdouble x2, GLdouble y2)
-{
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
-}
-
-static void GLAPIENTRY
-_save_OBE_Rectdv(const GLdouble *v1, const GLdouble *v2)
+void GLAPIENTRY
+save_Rectdv(const GLdouble *v1, const GLdouble *v2)
 {
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectfv(const GLfloat *v1, const GLfloat *v2)
+void GLAPIENTRY
+save_Rectfv(const GLfloat *v1, const GLfloat *v2)
 {
-   _save_OBE_Rectf(v1[0], v1[1], v2[0], v2[1]);
+   save_Rectf(v1[0], v1[1], v2[0], v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Recti(GLint x1, GLint y1, GLint x2, GLint y2)
+void GLAPIENTRY
+save_Recti(GLint x1, GLint y1, GLint x2, GLint y2)
 {
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
+   save_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectiv(const GLint *v1, const GLint *v2)
+void GLAPIENTRY
+save_Rectiv(const GLint *v1, const GLint *v2)
 {
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rects(GLshort x1, GLshort y1, GLshort x2, GLshort y2)
+void GLAPIENTRY
+save_Rects(GLshort x1, GLshort y1, GLshort x2, GLshort y2)
 {
-   _save_OBE_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
+   save_Rectf((GLfloat) x1, (GLfloat) y1, (GLfloat) x2, (GLfloat) y2);
 }
 
-static void GLAPIENTRY
-_save_OBE_Rectsv(const GLshort *v1, const GLshort *v2)
+void GLAPIENTRY
+save_Rectsv(const GLshort *v1, const GLshort *v2)
 {
-   _save_OBE_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
+   save_Rectf((GLfloat) v1[0], (GLfloat) v1[1], (GLfloat) v2[0], (GLfloat) v2[1]);
 }
 
-
-static void GLAPIENTRY
-_save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
+void GLAPIENTRY
+save_DrawArrays(GLenum mode, GLint start, GLsizei count)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_vertex_array_object *vao = ctx->Array.VAO;
@@ -1402,6 +1676,8 @@ _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
    if (save->out_of_memory)
       return;
 
+   grow_vertex_storage(ctx, count);
+
    /* Make sure to process any VBO binding changes */
    _mesa_update_state(ctx);
 
@@ -1417,9 +1693,9 @@ _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
 }
 
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawArrays(GLenum mode, const GLint *first,
-                          const GLsizei *count, GLsizei primcount)
+void GLAPIENTRY
+save_MultiDrawArrays(GLenum mode, const GLint *first,
+                      const GLsizei *count, GLsizei primcount)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLint i;
@@ -1435,17 +1711,21 @@ _save_OBE_MultiDrawArrays(GLenum mode, const GLint *first,
       return;
    }
 
+   unsigned vertcount = 0;
    for (i = 0; i < primcount; i++) {
       if (count[i] < 0) {
          _mesa_compile_error(ctx, GL_INVALID_VALUE,
                              "glMultiDrawArrays(count[i]<0)");
          return;
       }
+      vertcount += count[i];
    }
 
+   grow_vertex_storage(ctx, vertcount);
+
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-         _save_OBE_DrawArrays(mode, first[i], count[i]);
+         save_DrawArrays(mode, first[i], count[i]);
       }
    }
 }
@@ -1477,9 +1757,9 @@ array_element(struct gl_context *ctx,
 /* Could do better by copying the arrays and element list intact and
  * then emitting an indexed prim at runtime.
  */
-static void GLAPIENTRY
-_save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
-                                 const GLvoid * indices, GLint basevertex)
+void GLAPIENTRY
+save_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
+                             const GLvoid * indices, GLint basevertex)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct vbo_save_context *save = &vbo_context(ctx)->save;
@@ -1505,6 +1785,8 @@ _save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
    if (save->out_of_memory)
       return;
 
+   grow_vertex_storage(ctx, count);
+
    /* Make sure to process any VBO binding changes */
    _mesa_update_state(ctx);
 
@@ -1539,16 +1821,16 @@ _save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
    _mesa_vao_unmap(ctx, vao);
 }
 
-static void GLAPIENTRY
-_save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
-                       const GLvoid * indices)
+void GLAPIENTRY
+save_DrawElements(GLenum mode, GLsizei count, GLenum type,
+                   const GLvoid * indices)
 {
-   _save_OBE_DrawElementsBaseVertex(mode, count, type, indices, 0);
+   save_DrawElementsBaseVertex(mode, count, type, indices, 0);
 }
 
 
-static void GLAPIENTRY
-_save_OBE_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
+void GLAPIENTRY
+save_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
                             GLsizei count, GLenum type,
                             const GLvoid * indices)
 {
@@ -1579,91 +1861,87 @@ _save_OBE_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
    if (save->out_of_memory)
       return;
 
-   _save_OBE_DrawElements(mode, count, type, indices);
+   save_DrawElements(mode, count, type, indices);
 }
 
+void GLAPIENTRY
+save_DrawRangeElementsBaseVertex(GLenum mode, GLuint start, GLuint end,
+                                 GLsizei count, GLenum type,
+                                 const GLvoid *indices, GLint basevertex)
+{
+   GET_CURRENT_CONTEXT(ctx);
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type,
-                            const GLvoid * const *indices, GLsizei primcount)
+   if (end < start) {
+      _mesa_compile_error(ctx, GL_INVALID_VALUE,
+                          "glDrawRangeElementsBaseVertex(end < start)");
+      return;
+   }
+
+   save_DrawElementsBaseVertex(mode, count, type, indices, basevertex);
+}
+
+void GLAPIENTRY
+save_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type,
+                       const GLvoid * const *indices, GLsizei primcount)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
    GLsizei i;
 
+   int vertcount = 0;
+   for (i = 0; i < primcount; i++) {
+      vertcount += count[i];
+   }
+   grow_vertex_storage(ctx, vertcount);
+
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-        CALL_DrawElements(dispatch, (mode, count[i], type, indices[i]));
+         CALL_DrawElements(dispatch, (mode, count[i], type, indices[i]));
       }
    }
 }
 
 
-static void GLAPIENTRY
-_save_OBE_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
-                                      GLenum type,
-                                      const GLvoid * const *indices,
-                                      GLsizei primcount,
-                                      const GLint *basevertex)
+void GLAPIENTRY
+save_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
+                                  GLenum type,
+                                  const GLvoid * const *indices,
+                                  GLsizei primcount,
+                                  const GLint *basevertex)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct _glapi_table *dispatch = ctx->CurrentServerDispatch;
    GLsizei i;
 
+   int vertcount = 0;
+   for (i = 0; i < primcount; i++) {
+      vertcount += count[i];
+   }
+   grow_vertex_storage(ctx, vertcount);
+
    for (i = 0; i < primcount; i++) {
       if (count[i] > 0) {
-        CALL_DrawElementsBaseVertex(dispatch, (mode, count[i], type,
-                                                     indices[i],
-                                                     basevertex[i]));
+         CALL_DrawElementsBaseVertex(dispatch, (mode, count[i], type,
+                                     indices[i],
+                                     basevertex[i]));
       }
    }
 }
 
 
 static void
-vtxfmt_init(struct gl_context *ctx)
+vbo_init_dispatch_save_begin_end(struct gl_context *ctx)
 {
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-   GLvertexformat *vfmt = &save->vtxfmt;
-
-#define NAME_AE(x) _ae_##x
+#define NAME_AE(x) _mesa_##x
 #define NAME_CALLLIST(x) _save_##x
 #define NAME(x) _save_##x
-#define NAME_ES(x) _save_##x##ARB
-
-#include "vbo_init_tmp.h"
-}
+#define NAME_ES(x) _save_##x
 
-
-/**
- * Initialize the dispatch table with the VBO functions for display
- * list compilation.
- */
-void
-vbo_initialize_save_dispatch(const struct gl_context *ctx,
-                             struct _glapi_table *exec)
-{
-   SET_DrawArrays(exec, _save_OBE_DrawArrays);
-   SET_MultiDrawArrays(exec, _save_OBE_MultiDrawArrays);
-   SET_DrawElements(exec, _save_OBE_DrawElements);
-   SET_DrawElementsBaseVertex(exec, _save_OBE_DrawElementsBaseVertex);
-   SET_DrawRangeElements(exec, _save_OBE_DrawRangeElements);
-   SET_MultiDrawElementsEXT(exec, _save_OBE_MultiDrawElements);
-   SET_MultiDrawElementsBaseVertex(exec, _save_OBE_MultiDrawElementsBaseVertex);
-   SET_Rectf(exec, _save_OBE_Rectf);
-   SET_Rectd(exec, _save_OBE_Rectd);
-   SET_Rectdv(exec, _save_OBE_Rectdv);
-   SET_Rectfv(exec, _save_OBE_Rectfv);
-   SET_Recti(exec, _save_OBE_Recti);
-   SET_Rectiv(exec, _save_OBE_Rectiv);
-   SET_Rects(exec, _save_OBE_Rects);
-   SET_Rectsv(exec, _save_OBE_Rectsv);
-
-   /* Note: other glDraw functins aren't compiled into display lists */
+   struct _glapi_table *tab = ctx->Save;
+   #include "api_beginend_init.h"
 }
 
 
-
 void
 vbo_save_SaveFlushVertices(struct gl_context *ctx)
 {
@@ -1674,12 +1952,11 @@ vbo_save_SaveFlushVertices(struct gl_context *ctx)
    if (ctx->Driver.CurrentSavePrimitive <= PRIM_MAX)
       return;
 
-   if (save->vert_count || save->prim_count)
+   if (save->vertex_store->used || save->prim_store->used)
       compile_vertex_list(ctx);
 
    copy_to_current(ctx);
    reset_vertex(ctx);
-   reset_counters(ctx);
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
 
@@ -1696,15 +1973,12 @@ vbo_save_NewList(struct gl_context *ctx, GLuint list, GLenum mode)
    (void) mode;
 
    if (!save->prim_store)
-      save->prim_store = alloc_prim_store();
+      save->prim_store = realloc_prim_store(NULL, 8);
 
    if (!save->vertex_store)
-      save->vertex_store = alloc_vertex_store(ctx);
-
-   save->buffer_ptr = vbo_save_map_vertex_store(ctx, save->vertex_store);
+      save->vertex_store = CALLOC_STRUCT(vbo_save_vertex_store);
 
    reset_vertex(ctx);
-   reset_counters(ctx);
    ctx->Driver.SaveNeedFlush = GL_FALSE;
 }
 
@@ -1720,11 +1994,11 @@ vbo_save_EndList(struct gl_context *ctx)
    /* EndList called inside a (saved) Begin/End pair?
     */
    if (_mesa_inside_dlist_begin_end(ctx)) {
-      if (save->prim_count > 0) {
-         GLint i = save->prim_count - 1;
+      if (save->prim_store->used > 0) {
+         GLint i = save->prim_store->used - 1;
          ctx->Driver.CurrentSavePrimitive = PRIM_OUTSIDE_BEGIN_END;
-         save->prims[i].end = 0;
-         save->prims[i].count = save->vert_count - save->prims[i].start;
+         save->prim_store->prims[i].end = 0;
+         save->prim_store->prims[i].count = get_vertex_count(save) - save->prim_store->prims[i].start;
       }
 
       /* Make sure this vertex list gets replayed by the "loopback"
@@ -1737,88 +2011,12 @@ vbo_save_EndList(struct gl_context *ctx)
        * etc. received between here and the next begin will be compiled
        * as opcodes.
        */
-      _mesa_install_save_vtxfmt(ctx, &ctx->ListState.ListVtxfmt);
+      _mesa_init_dispatch_save_begin_end(ctx);
    }
 
-   vbo_save_unmap_vertex_store(ctx, save->vertex_store);
-
    assert(save->vertex_size == 0);
 }
 
-
-/**
- * Called from the display list code when we're about to execute a
- * display list.
- */
-void
-vbo_save_BeginCallList(struct gl_context *ctx, struct gl_display_list *dlist)
-{
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-   save->replay_flags |= dlist->Flags;
-}
-
-
-/**
- * Called from the display list code when we're finished executing a
- * display list.
- */
-void
-vbo_save_EndCallList(struct gl_context *ctx)
-{
-   struct vbo_save_context *save = &vbo_context(ctx)->save;
-
-   if (ctx->ListState.CallDepth == 1)
-      save->replay_flags = 0;
-}
-
-
-/**
- * Called by display list code when a display list is being deleted.
- */
-static void
-vbo_destroy_vertex_list(struct gl_context *ctx, void *data)
-{
-   struct vbo_save_vertex_list *node = (struct vbo_save_vertex_list *) data;
-
-   for (gl_vertex_processing_mode vpm = VP_MODE_FF; vpm < VP_MODE_MAX; ++vpm)
-      _mesa_reference_vao(ctx, &node->VAO[vpm], NULL);
-
-   if (--node->prim_store->refcount == 0)
-      free(node->prim_store);
-
-   _mesa_reference_buffer_object(ctx, &node->ib.obj, NULL);
-   free(node->current_data);
-   node->current_data = NULL;
-}
-
-
-static void
-vbo_print_vertex_list(struct gl_context *ctx, void *data, FILE *f)
-{
-   struct vbo_save_vertex_list *node = (struct vbo_save_vertex_list *) data;
-   GLuint i;
-   struct gl_buffer_object *buffer = node->VAO[0]->BufferBinding[0].BufferObj;
-   const GLuint vertex_size = _vbo_save_get_stride(node)/sizeof(GLfloat);
-   (void) ctx;
-
-   fprintf(f, "VBO-VERTEX-LIST, %u vertices, %d primitives, %d vertsize, "
-           "buffer %p\n",
-           node->vertex_count, node->prim_count, vertex_size,
-           buffer);
-
-   for (i = 0; i < node->prim_count; i++) {
-      struct _mesa_prim *prim = &node->prims[i];
-      fprintf(f, "   prim %d: %s %d..%d %s %s\n",
-             i,
-             _mesa_lookup_prim_by_nr(prim->mode),
-             prim->start,
-             prim->start + prim->count,
-             (prim->begin) ? "BEGIN" : "(wrap)",
-             (prim->end) ? "END" : "(wrap)");
-   }
-}
-
-
 /**
  * Called during context creation/init.
  */
@@ -1828,7 +2026,7 @@ current_init(struct gl_context *ctx)
    struct vbo_save_context *save = &vbo_context(ctx)->save;
    GLint i;
 
-   for (i = VBO_ATTRIB_POS; i <= VBO_ATTRIB_GENERIC15; i++) {
+   for (i = VBO_ATTRIB_POS; i <= VBO_ATTRIB_EDGEFLAG; i++) {
       const GLuint j = i - VBO_ATTRIB_POS;
       assert(j < VERT_ATTRIB_MAX);
       save->currentsz[i] = &ctx->ListState.ActiveAttribSize[j];
@@ -1852,14 +2050,5 @@ vbo_save_api_init(struct vbo_save_context *save)
 {
    struct gl_context *ctx = gl_context_from_vbo_save(save);
 
-   save->opcode_vertex_list =
-      _mesa_dlist_alloc_opcode(ctx,
-                               sizeof(struct vbo_save_vertex_list),
-                               vbo_save_playback_vertex_list,
-                               vbo_destroy_vertex_list,
-                               vbo_print_vertex_list);
-
-   vtxfmt_init(ctx);
    current_init(ctx);
-   _mesa_noop_vtxfmt_init(ctx, &save->vtxfmt_noop);
 }