Freelist cleanup/streamlining
author: Hallvard Furuseth <hallvard@openldap.org>
Wed, 23 Jan 2013 14:47:35 +0000 (15:47 +0100)
committer: Howard Chu <hyc@openldap.org>
Fri, 15 Feb 2013 13:27:02 +0000 (13:27 +0000)
Drop unneeded definitions, redundant code.

libraries/liblmdb/mdb.c

index 68f6083..ef41458 100644 (file)
@@ -911,18 +911,6 @@ typedef struct MDB_xcursor {
        unsigned char mx_dbflag;
 } MDB_xcursor;
 
-       /** A set of pages freed by an earlier transaction. */
-typedef struct MDB_oldpages {
-       /** Usually we only read one record from the FREEDB at a time, but
-        *      in case we read more, this will chain them together.
-        */
-       struct MDB_oldpages *mo_next;
-       /**     The ID of the transaction in which these pages were freed. */
-       txnid_t         mo_txnid;
-       /** An #MDB_IDL of the pages */
-       pgno_t          mo_pages[1];    /* dynamic */
-} MDB_oldpages;
-
        /** The database environment. */
 struct MDB_env {
        HANDLE          me_fd;          /**< The main data file */
@@ -949,12 +937,10 @@ struct MDB_env {
        size_t          me_mapsize;             /**< size of the data memory map */
        off_t           me_size;                /**< current file size */
        pgno_t          me_maxpg;               /**< me_mapsize / me_psize */
-       txnid_t         me_pgfirst;             /**< ID of first old page record we used */
        txnid_t         me_pglast;              /**< ID of last old page record we used */
        MDB_dbx         *me_dbxs;               /**< array of static DB info */
        uint16_t        *me_dbflags;    /**< array of flags from MDB_db.md_flags */
-       MDB_oldpages *me_pghead;        /**< list of old page records */
-       MDB_oldpages *me_pgfree;        /**< list of page records to free */
+       pgno_t          *me_pghead;     /**< old pages reclaimed from freelist */
        pthread_key_t   me_txkey;       /**< thread-key for readers */
        MDB_page        *me_dpages;             /**< list of malloc'd blocks for re-use */
        /** IDL of pages that became unused in a write txn */
@@ -1287,7 +1273,6 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
         * after txn 3 commits, and so will be safe to re-use in txn 4.
         */
        if (txn->mt_txnid > 3) {
-
                if (!txn->mt_env->me_pghead &&
                        txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
                        /* See if there's anything in the free DB */
@@ -1298,7 +1283,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
                        txnid_t *kptr;
 
                        mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
-                       if (!txn->mt_env->me_pgfirst) {
+                       if (!txn->mt_env->me_pglast) {
                                mdb_page_search(&m2, NULL, 0);
                                leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0);
                                kptr = (txnid_t *)NODEKEY(leaf);
@@ -1335,10 +1320,9 @@ again:
                        if (oldest > last) {
                                /* It's usable, grab it.
                                 */
-                               MDB_oldpages *mop;
-                               pgno_t *idl;
+                               pgno_t *idl, *mop;
 
-                               if (!txn->mt_env->me_pgfirst) {
+                               if (!txn->mt_env->me_pglast) {
                                        mdb_node_read(txn, leaf, &data);
                                }
                                idl = (MDB_ID *) data.mv_data;
@@ -1347,26 +1331,20 @@ again:
                                 */
                                if (!idl[0]) {
                                        txn->mt_env->me_pglast = last;
-                                       if (!txn->mt_env->me_pgfirst)
-                                               txn->mt_env->me_pgfirst = last;
                                        goto again;
                                }
-                               mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
+                               mop = malloc(MDB_IDL_SIZEOF(idl));
                                if (!mop)
                                        return ENOMEM;
-                               mop->mo_next = txn->mt_env->me_pghead;
-                               mop->mo_txnid = last;
                                txn->mt_env->me_pglast = last;
-                               if (!txn->mt_env->me_pgfirst)
-                                       txn->mt_env->me_pgfirst = last;
                                txn->mt_env->me_pghead = mop;
-                               memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));
+                               memcpy(mop, idl, MDB_IDL_SIZEOF(idl));
 
 #if MDB_DEBUG > 1
                                {
                                        unsigned int i;
                                        DPRINTF("IDL read txn %zu root %zu num %zu",
-                                               mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
+                                               last, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
                                        for (i=0; i<idl[0]; i++) {
                                                DPRINTF("IDL %zu", idl[i+1]);
                                        }
@@ -1376,14 +1354,14 @@ again:
                }
 none:
                if (txn->mt_env->me_pghead) {
-                       MDB_oldpages *mop = txn->mt_env->me_pghead;
+                       pgno_t *mop = txn->mt_env->me_pghead;
                        if (num > 1) {
                                MDB_cursor m2;
                                int retry = 500, readit = 0, n2 = num-1;
                                unsigned int i, j, k;
 
                                /* If current list is too short, must fetch more and coalesce */
-                               if (mop->mo_pages[0] < (unsigned)num)
+                               if (mop[0] < (unsigned)num)
                                        readit = 1;
 
                                mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
@@ -1398,11 +1376,10 @@ none:
                                        }
                                        if (readit) {
                                                MDB_val key, data;
-                                               MDB_oldpages *mop2;
-                                               pgno_t *idl;
+                                               pgno_t *idl, *mop2;
                                                int exact;
 
-                                               last = mop->mo_txnid + 1;
+                                               last = txn->mt_env->me_pglast + 1;
 
                                                /* We haven't hit the readers list yet? */
                                                if (!oldest) {
@@ -1432,39 +1409,37 @@ none:
                                                if (rc)
                                                        return rc;
                                                idl = (MDB_ID *) data.mv_data;
-                                               mop2 = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - 2*sizeof(pgno_t) + MDB_IDL_SIZEOF(mop->mo_pages));
+                                               mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop));
                                                if (!mop2)
                                                        return ENOMEM;
                                                /* merge in sorted order */
-                                               i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j;
-                                               mop->mo_pages[0] = P_INVALID;
+                                               i = idl[0]; j = mop[0]; mop2[0] = k = i+j;
+                                               mop[0] = P_INVALID;
                                                while (i>0  || j>0) {
-                                                       if (i && idl[i] < mop->mo_pages[j])
-                                                               mop2->mo_pages[k--] = idl[i--];
+                                                       if (i && idl[i] < mop[j])
+                                                               mop2[k--] = idl[i--];
                                                        else
-                                                               mop2->mo_pages[k--] = mop->mo_pages[j--];
+                                                               mop2[k--] = mop[j--];
                                                }
                                                txn->mt_env->me_pglast = last;
-                                               mop2->mo_txnid = last;
-                                               mop2->mo_next = mop->mo_next;
                                                txn->mt_env->me_pghead = mop2;
                                                free(mop);
                                                mop = mop2;
                                                /* Keep trying to read until we have enough */
-                                               if (mop->mo_pages[0] < (unsigned)num) {
+                                               if (mop[0] < (unsigned)num) {
                                                        continue;
                                                }
                                        }
 
                                        /* current list has enough pages, but are they contiguous? */
-                                       for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) {
-                                               if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) {
-                                                       pgno = mop->mo_pages[i];
+                                       for (i=mop[0]; i>=(unsigned)num; i--) {
+                                               if (mop[i-n2] == mop[i] + n2) {
+                                                       pgno = mop[i];
                                                        i -= n2;
                                                        /* move any stragglers down */
-                                                       for (j=i+num; j<=mop->mo_pages[0]; j++)
-                                                               mop->mo_pages[i++] = mop->mo_pages[j];
-                                                       mop->mo_pages[0] -= num;
+                                                       for (j=i+num; j<=mop[0]; j++)
+                                                               mop[i++] = mop[j];
+                                                       mop[0] -= num;
                                                        break;
                                                }
                                        }
@@ -1478,17 +1453,12 @@ none:
                                } while (1);
                        } else {
                                /* peel pages off tail, so we only have to truncate the list */
-                               pgno = MDB_IDL_LAST(mop->mo_pages);
-                               mop->mo_pages[0]--;
+                               pgno = MDB_IDL_LAST(mop);
+                               mop[0]--;
                        }
-                       if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
-                               txn->mt_env->me_pghead = mop->mo_next;
-                               if (mc->mc_dbi == FREE_DBI) {
-                                       mop->mo_next = txn->mt_env->me_pgfree;
-                                       txn->mt_env->me_pgfree = mop;
-                               } else {
-                                       free(mop);
-                               }
+                       if (MDB_IDL_IS_ZERO(mop)) {
+                               txn->mt_env->me_pghead = NULL;
+                               free(mop);
                        }
                }
        }
@@ -1961,7 +1931,7 @@ mdb_txn_reset0(MDB_txn *txn)
                if (!(env->me_flags & MDB_ROFS))
                        txn->mt_u.reader->mr_txnid = (txnid_t)-1;
        } else {
-               MDB_oldpages *mop;
+               pgno_t *mop;
                MDB_page *dp;
                unsigned int i;
 
@@ -2001,11 +1971,10 @@ mdb_txn_reset0(MDB_txn *txn)
                                env->me_free_pgs = txn->mt_free_pgs;
                }
 
-               while ((mop = txn->mt_env->me_pghead)) {
-                       txn->mt_env->me_pghead = mop->mo_next;
+               if ((mop = txn->mt_env->me_pghead) != NULL) {
+                       txn->mt_env->me_pghead = NULL;
                        free(mop);
                }
-               txn->mt_env->me_pgfirst = 0;
                txn->mt_env->me_pglast = 0;
 
                env->me_txn = NULL;
@@ -2054,6 +2023,7 @@ mdb_txn_commit(MDB_txn *txn)
        MDB_page        *dp;
        MDB_env *env;
        pgno_t  next, freecnt;
+       txnid_t oldpg_txnid, id;
        MDB_cursor mc;
 
        assert(txn != NULL);
@@ -2165,10 +2135,21 @@ mdb_txn_commit(MDB_txn *txn)
                }
        }
 
+       /* Save the freelist as of this transaction to the freeDB. This
+        * can change the freelist, so keep trying until it stabilizes.
+        *
+        * env->me_pglast and the length of txn->mt_free_pgs cannot decrease.
+        * Page numbers cannot disappear from txn->mt_free_pgs.  New pages
+        * can only appear in env->me_pghead when env->me_pglast increases.
+        * Until then, the me_pghead pointer won't move but can become NULL.
+        */
+
        mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
+       oldpg_txnid = id = 0;
+       freecnt = 0;
 
        /* should only be one record now */
-       if (env->me_pghead || env->me_pgfirst) {
+       if (env->me_pghead || env->me_pglast) {
                /* make sure first page of freeDB is touched and on freelist */
                rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
                if (rc && rc != MDB_NOTFOUND) {
@@ -2179,28 +2160,27 @@ fail:
        }
 
        /* Delete IDLs we used from the free list */
-       if (env->me_pgfirst) {
-               txnid_t cur;
+       if (env->me_pglast) {
                MDB_val key;
-               int exact = 0;
-
-               key.mv_size = sizeof(cur);
-               for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) {
-                       key.mv_data = &cur;
 
-                       mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact);
+               do {
+free_pgfirst:
+                       rc = mdb_cursor_first(&mc, &key, NULL);
+                       if (rc)
+                               goto fail;
+                       oldpg_txnid = *(txnid_t *)key.mv_data;
+again:
+                       assert(oldpg_txnid <= env->me_pglast);
+                       id = 0;
                        rc = mdb_cursor_del(&mc, 0);
                        if (rc)
                                goto fail;
-               }
-               env->me_pgfirst = 0;
-               env->me_pglast = 0;
+               } while (oldpg_txnid < env->me_pglast);
        }
 
-       /* save to free list */
+       /* Save IDL of pages freed by this txn, to freeDB */
 free2:
-       freecnt = txn->mt_free_pgs[0];
-       if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
+       if (freecnt != txn->mt_free_pgs[0]) {
                MDB_val key, data;
 
                /* make sure last page of freeDB is touched and on freelist */
@@ -2225,61 +2205,50 @@ free2:
                /* write to last page of freeDB */
                key.mv_size = sizeof(pgno_t);
                key.mv_data = &txn->mt_txnid;
-               data.mv_data = txn->mt_free_pgs;
                /* The free list can still grow during this call,
-                * despite the pre-emptive touches above. So check
-                * and make sure the entire thing got written.
+                * despite the pre-emptive touches above. So retry
+                * until the reserved space remains big enough.
                 */
                do {
+                       assert(freecnt < txn->mt_free_pgs[0]);
                        freecnt = txn->mt_free_pgs[0];
                        data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
-                       mdb_midl_sort(txn->mt_free_pgs);
-                       rc = mdb_cursor_put(&mc, &key, &data, 0);
+                       rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
                        if (rc)
                                goto fail;
                } while (freecnt != txn->mt_free_pgs[0]);
+               mdb_midl_sort(txn->mt_free_pgs);
+               memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size);
+               if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id))
+                       goto free_pgfirst;      /* used up freeDB[oldpg_txnid] */
        }
-       /* should only be one record now */
-again:
+
+       /* Put back page numbers we took from freeDB but did not use */
        if (env->me_pghead) {
                MDB_val key, data;
-               MDB_oldpages *mop;
-               pgno_t orig;
-               txnid_t id;
+               pgno_t orig, *mop;
 
                mop = env->me_pghead;
-               id = mop->mo_txnid;
+               id = env->me_pglast;
                key.mv_size = sizeof(id);
                key.mv_data = &id;
-               data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
-               data.mv_data = mop->mo_pages;
-               orig = mop->mo_pages[0];
                /* These steps may grow the freelist again
                 * due to freed overflow pages...
                 */
-               rc = mdb_cursor_put(&mc, &key, &data, 0);
-               if (rc)
-                       goto fail;
-               if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) {
-                       /* could have been used again here */
-                       if (mop->mo_pages[0] != orig) {
-                               data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
-                               data.mv_data = mop->mo_pages;
-                               id = mop->mo_txnid;
-                               rc = mdb_cursor_put(&mc, &key, &data, 0);
-                               if (rc)
-                                       goto fail;
-                       }
-               } else {
-                       /* was completely used up */
-                       rc = mdb_cursor_del(&mc, 0);
+               i = 2;
+               do {
+                       orig = mop[0];
+                       data.mv_size = MDB_IDL_SIZEOF(mop);
+                       rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
                        if (rc)
                                goto fail;
-                       if (env->me_pghead)
-                               goto again;
-               }
-               env->me_pgfirst = 0;
-               env->me_pglast = 0;
+                       assert(!env->me_pghead || env->me_pglast);
+                       /* mop could have been used again here */
+                       if (id != env->me_pglast || env->me_pghead == NULL)
+                               goto again;             /* was completely used up */
+                       assert(mop == env->me_pghead && mop[0] <= orig);
+               } while (mop[0] != orig && --i);
+               memcpy(data.mv_data, mop, data.mv_size);
        }
 
        /* Check for growth of freelist again */
@@ -2291,12 +2260,6 @@ again:
                env->me_pghead = NULL;
        }
 
-       while (env->me_pgfree) {
-               MDB_oldpages *mop = env->me_pgfree;
-               env->me_pgfree = mop->mo_next;
-               free(mop);
-       }
-
        if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
                if (mdb_midl_shrink(&txn->mt_free_pgs))
                        env->me_free_pgs = txn->mt_free_pgs;
@@ -2431,6 +2394,7 @@ sync:
        }
 
 done:
+       env->me_pglast = 0;
        env->me_txn = NULL;
        if (txn->mt_numdbs > env->me_numdbs) {
                /* update the DB flags */