2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1999-2009 Oracle. All rights reserved.
12 #include "dbinc/db_page.h"
13 #include "dbinc/btree.h"
14 #include "dbinc/lock.h"
15 #include "dbinc/log.h"
17 #include "dbinc/txn.h"
19 static int __bam_compact_dups __P((DBC *,
20 PAGE **, u_int32_t, int, DB_COMPACT *, int *));
21 static int __bam_compact_int __P((DBC *,
22 DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
23 static int __bam_compact_isdone __P((DBC *, DBT *, PAGE *, int *));
24 static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
25 static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
26 static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
27 static int __bam_merge __P((DBC *,
28 DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
29 static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
30 static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
31 static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
32 static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
33 static int __bam_truncate_overflow __P((DBC *,
34 db_pgno_t, PAGE **, DB_COMPACT *));
35 static int __bam_truncate_page __P((DBC *, PAGE **, PAGE *, int));
36 static int __bam_truncate_root_page __P((DBC *,
37 PAGE *, u_int32_t, DB_COMPACT *));
40 static int __bam_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
41 static int __bam_savekey __P((DBC *, int, DBT *));
42 static int __bam_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
43 static int __bam_truncate_internal __P((DB *,
44 DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
49 save_data = *c_data; \
50 ret = __db_retcopy(env, \
51 &save_start, current.data, current.size, \
52 &save_start.data, &save_start.ulen); \
56 * Only restore those things that are negated by aborting the
57 * transaction. We don't restore the number of deadlocks, for example.
60 #define RESTORE_START \
62 c_data->compact_pages_free = \
63 save_data.compact_pages_free; \
64 c_data->compact_levels = save_data.compact_levels; \
65 c_data->compact_truncate = save_data.compact_truncate; \
66 	ret = __db_retcopy(env, &current,		\
67 save_start.data, save_start.size, \
68 	    &current.data, &current.ulen);		\
72  * __bam_compact -- compact a btree.
74 * PUBLIC: int __bam_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
75 * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
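 *
 * A caller normally reaches this routine through the public DB->compact
 * method.  A minimal sketch of such a call (hypothetical handle name,
 * error handling omitted):
 *
 *	DB_COMPACT cdata;
 *	memset(&cdata, 0, sizeof(cdata));
 *	cdata.compact_fillpercent = 90;
 *	ret = dbp->compact(dbp, NULL, NULL, NULL, &cdata, DB_FREE_SPACE, NULL);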
78 __bam_compact(dbp, ip, txn, start, stop, c_data, flags, end)
88 DBT current, save_start;
91 u_int32_t factor, retry;
92 int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
97 u_int32_t nelems, truncated;
102 	memset(&current, 0, sizeof(current));
103 memset(&save_start, 0, sizeof(save_start));
106 have_freelist = deadlock = isdone = ret = span = 0;
109 #ifdef HAVE_FTRUNCATE
112 nelems = truncated = 0;
116 * We pass "current" to the internal routine, indicating where that
117 * routine should begin its work and expecting that it will return to
118 * us the last key that it processed.
120 if (start != NULL && (ret = __db_retcopy(env,
121 	    &current, start->data, start->size,
122 	    &current.data, &current.ulen)) != 0)
125 if (IS_DB_AUTO_COMMIT(dbp, txn))
129 if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
131 if (LF_ISSET(DB_FREELIST_ONLY))
132 LF_SET(DB_FREE_SPACE);
134 #ifdef HAVE_FTRUNCATE
135 /* Sort the freelist and set up the in-memory list representation. */
136 if (txn_local && (ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
139 if ((ret = __db_free_truncate(dbp, ip,
140 txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
141 LF_CLR(DB_FREE_SPACE);
145 /* If the freelist is empty and we are not filling, get out. */
146 if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
148 LF_CLR(DB_FREE_SPACE);
151 if ((ret = __bam_setup_freelist(dbp, list, nelems)) != 0) {
152 /* Someone else owns the free list. */
159 /* Commit the txn and release the meta page lock. */
160 terr: if (txn_local) {
161 if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
168 	/* Save the number truncated so far; we will add what we get below. */
169 truncated = c_data->compact_pages_truncated;
170 if (LF_ISSET(DB_FREELIST_ONLY))
175 * We want factor to be the target number of free bytes on each page,
176 * so we know when to stop adding items to a page. Make sure to
177 * subtract the page overhead when computing this target. This can
178 * result in a 1-2% error on the smallest page.
179 * First figure out how many bytes we should use:
182 factor = dbp->pgsize - SIZEOF_PAGE;
183 if (c_data->compact_fillpercent != 0) {
184 factor *= c_data->compact_fillpercent;
187 /* Now convert to the number of free bytes to target. */
188 factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
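	/*
	 * Worked example (illustrative only): with a 4096-byte page and the
	 * usual 26-byte page header, the usable space is 4070 bytes.  If
	 * compact_fillpercent is 90, the fill target is 3663 bytes, so the
	 * factor -- the number of free bytes we will tolerate per page --
	 * comes out to 4070 - 3663 = 407.
	 */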
190 if (c_data->compact_pages == 0)
191 c_data->compact_pages = DB_MAX_PAGES;
201 if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
204 if (c_data->compact_timeout != 0 &&
205 (ret = __txn_set_timeout(txn,
206 c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
210 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
213 if ((ret = __bam_compact_int(dbc, ¤t, stop, factor,
214 &span, c_data, &isdone)) ==
215 DB_LOCK_DEADLOCK && txn_local) {
217 * We retry on deadlock. Cancel the statistics
218 * and reset the start point to before this
222 c_data->compact_deadlock++;
226 * If we could not get a lock while holding an internal
227 * node latched, commit the current local transaction otherwise
230 if (ret == DB_LOCK_NOTGRANTED) {
231 if (txn_local || retry++ < 5)
234 ret = DB_LOCK_DEADLOCK;
238 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
241 err: if (txn_local && txn != NULL) {
242 if (ret == 0 && deadlock == 0)
243 ret = __txn_commit(txn, DB_TXN_NOSYNC);
244 else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
248 } while (ret == 0 && !isdone);
250 if (ret == 0 && end != NULL)
251 ret = __db_retcopy(env, end, current.data, current.size,
252 &end->data, &end->ulen);
253 if (current.data != NULL)
254 __os_free(env, current.data);
255 if (save_start.data != NULL)
256 __os_free(env, save_start.data);
258 #ifdef HAVE_FTRUNCATE
260 * Finish up truncation work. If there are pages left in the free
261 * list then search the internal nodes of the tree as we may have
262 * missed some while walking the leaf nodes. Then calculate how
263 * many pages we have truncated and release the in-memory free list.
265 done: if (LF_ISSET(DB_FREE_SPACE)) {
271 if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) && (t_ret =
272 __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
273 isdone = meta->free == PGNO_INVALID;
274 ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
278 ret = __bam_truncate_internal(dbp, ip, txn, c_data);
280 /* Clean up the free list. */
282 __os_free(env, list);
285 __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
286 c_data->compact_pages_truncated =
287 truncated + last_pgno - meta->last_pgno;
288 if ((t_ret = __memp_fput(dbp->mpf, ip,
289 meta, dbp->priority)) != 0 && ret == 0)
294 if (have_freelist && (t_ret =
295 __bam_free_freelist(dbp, ip, txn)) != 0 && ret == 0)
304 * __bam_csearch -- isolate search code for bam_compact.
305 * This routine hides the differences between searching
306 * a BTREE and a RECNO from the rest of the code.
308 #define CS_READ 0 /* We are just reading. */
309 #define CS_PARENT 1 /* We want the parent too, write lock. */
310 #define CS_NEXT 2 /* Get the next page. */
311 #define CS_NEXT_WRITE 3 /* Get the next page and write lock. */
312 #define CS_DEL 4 /* Get a stack to delete a page. */
313 #define CS_START 5 /* Starting level for stack, write lock. */
314 #define CS_NEXT_BOTH 6 /* Get this page and the next, write lock. */
315 #define CS_GETRECNO 0x80 /* Extract record number from start. */
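/*
 * Callers combine one of the CS_* modes above with CS_GETRECNO in a single
 * sflag argument.  For example, the initial descent in __bam_compact_int
 * below uses CS_READ | CS_GETRECNO at LEAFLEVEL to read-lock the starting
 * leaf page while extracting the starting record number for RECNO trees.
 */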
318 __bam_csearch(dbc, start, sflag, level)
327 cp = (BTREE_CURSOR *)dbc->internal;
329 if (dbc->dbtype == DB_RECNO) {
330 	/* If GETRECNO is not set, cp->recno is what we want. */
331 if (FLD_ISSET(sflag, CS_GETRECNO)) {
332 if (start == NULL || start->size == 0)
335 __ram_getno(dbc, start, &cp->recno, 0)) != 0)
337 FLD_CLR(sflag, CS_GETRECNO);
344 sflag = SR_PARENT | SR_READ;
354 sflag = SR_BOTH | SR_NEXT | SR_WRITE;
357 sflag = SR_PARENT | SR_WRITE;
360 return (__env_panic(dbc->env, EINVAL));
362 if ((ret = __bam_rsearch(dbc,
363 	    &cp->recno, sflag, level, &not_used)) != 0)
365 /* Reset the cursor's recno to the beginning of the page. */
366 cp->recno -= cp->csp->indx;
368 FLD_CLR(sflag, CS_GETRECNO);
371 sflag = SR_READ | SR_DUPFIRST;
380 sflag = SR_NEXT | SR_WRITE;
383 sflag = SR_BOTH | SR_NEXT | SR_WRITE;
386 sflag = SR_START | SR_WRITE;
389 sflag = SR_PARENT | SR_WRITE;
392 return (__env_panic(dbc->env, EINVAL));
394 if (start == NULL || start->size == 0)
395 FLD_SET(sflag, SR_MIN);
397 if ((ret = __bam_search(dbc,
398 	    cp->root, start, sflag, level, NULL, &not_used)) != 0)
406 * __bam_compact_int -- internal compaction routine.
407 * Called either with a cursor on the main database
408 * or a cursor initialized to the root of an off page duplicate
412 __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
420 BTREE_CURSOR *cp, *ncp;
423 DB_LOCK metalock, next_lock, nnext_lock, prev_lock, saved_lock;
427 PAGE *pg, *ppg, *npg;
428 db_pgno_t metapgno, npgno, nnext_pgno;
429 db_pgno_t pgno, prev_pgno, ppgno, saved_pgno;
430 db_recno_t next_recno;
431 u_int32_t sflag, pgs_free;
432 int check_dups, check_trunc, clear_root, isdone;
433 int merged, nentry, next_p, pgs_done, ret, t_ret, tdone;
436 #define CTRACE(dbc, location, t, start, f) do { \
438 DB_SET_DBT(__trace, t, strlen(t)); \
440 dbc, (dbc)->txn, location, &__trace, start, f) \
442 #define PTRACE(dbc, location, p, start, f) do { \
444 (void)snprintf(__buf, \
445 sizeof(__buf), "pgno: %lu", (u_long)p); \
446 CTRACE(dbc, location, __buf, start, f); \
449 #define CTRACE(dbc, location, t, start, f)
450 #define PTRACE(dbc, location, p, start, f)
463 metapgno = PGNO_BASE_MD;
464 LOCK_INIT(next_lock);
465 LOCK_INIT(nnext_lock);
466 LOCK_INIT(saved_lock);
468 LOCK_INIT(prev_lock);
469 check_trunc = c_data->compact_truncate != PGNO_INVALID;
470 check_dups = (!F_ISSET(dbc, DBC_OPD) &&
471 F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc;
476 cp = (BTREE_CURSOR *)dbc->internal;
477 pgs_free = c_data->compact_pages_free;
479 /* Search down the tree for the starting point. */
480 if ((ret = __bam_csearch(dbc,
481 start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) {
482 		/* It's not an error to compact an empty db. */
483 if (ret == DB_NOTFOUND)
490 * Get the first leaf page. The loop below will change pg so
491 	 * we clear the stack reference so we don't put a page twice.
494 cp->csp->page = NULL;
495 next_recno = cp->recno;
497 * This is the start of the main compaction loop. There are 3
498 * parts to the process:
499 * 1) Walk the leaf pages of the tree looking for a page to
500 * process. We do this with read locks. Save the
501 * key from the page and release it.
502 * 2) Set up a cursor stack which will write lock the page
503 * and enough of its ancestors to get the job done.
504 * This could go to the root if we might delete a subtree
505 * or we have record numbers to update.
506 * 3) Loop fetching pages after the above page and move enough
508 * We exit the loop if we are at the end of the leaf pages, are
509 * about to lock a new subtree (we span) or on error.
512 /* Walk the pages looking for something to fill up. */
513 while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
514 c_data->compact_pages_examine++;
515 PTRACE(dbc, "Next", PGNO(pg), start, 0);
517 /* If we have fetched the next page, get the new key. */
519 dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) {
520 if ((ret = __db_ret(dbc, pg, 0, start,
521 &start->data, &start->ulen)) != 0)
524 next_recno += NUM_ENT(pg);
525 if (P_FREESPACE(dbp, pg) > factor ||
526 (check_trunc && PGNO(pg) > c_data->compact_truncate))
528 if (stop != NULL && stop->size > 0) {
529 if ((ret = __bam_compact_isdone(dbc,
530 stop, pg, &isdone)) != 0)
537 * The page does not need more data or to be swapped,
538 * check to see if we want to look at possible duplicate
539 		 * trees or overflow records and then move on to the next page.
541 cp->recno += NUM_ENT(pg);
544 PTRACE(dbc, "Dups", PGNO(pg), start, 0);
545 if (check_dups && (ret = __bam_compact_dups(
546 dbc, &pg, factor, 0, c_data, &pgs_done)) != 0)
548 npgno = NEXT_PGNO(pg);
549 if ((ret = __memp_fput(dbmp,
550 dbc->thread_info, pg, dbc->priority)) != 0)
554 * If we don't do anything we don't need to hold
555 * the lock on the previous page, so couple always.
557 if ((ret = __db_lget(dbc,
558 tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE,
559 npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0)
561 if ((ret = __memp_fget(dbmp, &npgno,
562 dbc->thread_info, dbc->txn, 0, &pg)) != 0)
567 * When we get here we have 3 cases:
568 * 1) We've reached the end of the leaf linked list and are done.
569 * 2) A page whose freespace exceeds our target and therefore needs
570 * to have data added to it.
571 * 3) A page that doesn't have too much free space but needs to be
572 * checked for truncation.
573 * In both cases 2 and 3, we need that page's first key or record
574 * number. We may already have it, if not get it here.
576 if ((nentry = NUM_ENT(pg)) != 0) {
578 /* Get a copy of the first recno on the page. */
579 if (dbc->dbtype == DB_RECNO) {
580 if ((ret = __db_retcopy(dbp->env, start,
581 &cp->recno, sizeof(cp->recno),
582 &start->data, &start->ulen)) != 0)
584 } else if (start->size == 0 && (ret = __db_ret(dbc,
585 pg, 0, start, &start->data, &start->ulen)) != 0)
588 if (npgno == PGNO_INVALID) {
589 /* End of the tree, check its duplicates and exit. */
590 PTRACE(dbc, "GoDone", PGNO(pg), start, 0);
591 if (check_dups && (ret = __bam_compact_dups(dbc,
592 &pg, factor, 0, c_data, &pgs_done)) != 0)
594 c_data->compact_pages_examine++;
600 /* Release the page so we don't deadlock getting its parent. */
601 if ((ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority)) != 0)
603 if ((ret = __LPUT(dbc, cp->csp->lock)) != 0)
607 saved_pgno = PGNO_INVALID;
608 prev_pgno = PGNO_INVALID;
609 nnext_pgno = PGNO_INVALID;
612 * We must lock the metadata page first because we cannot block
613 * while holding interior nodes of the tree pinned.
616 if (!LOCK_ISSET(metalock) && pgs_free == c_data->compact_pages_free &&
617 (ret = __db_lget(dbc,
618 LCK_ALWAYS, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
622 * Setup the cursor stack. There are 3 cases:
623 * 1) the page is empty and will be deleted: nentry == 0.
624 * 2) the next page has the same parent: *spanp == 0.
625 * 3) the next page has a different parent: *spanp == 1.
627 * We now need to search the tree again, getting a write lock
628 * on the page we are going to merge or delete. We do this by
629 * searching down the tree and locking as much of the subtree
630 * above the page as needed. In the case of a delete we will
631 * find the maximal subtree that can be deleted. In the case
632 * of merge if the current page and the next page are siblings
633 * with the same parent then we only need to lock the parent.
634 * Otherwise *span will be set and we need to search to find the
635 * lowest common ancestor. Dbc will be set to contain the subtree
636 * containing the page to be merged or deleted. Ndbc will contain
637 * the minimal subtree containing that page and its next sibling.
638 * In all cases for DB_RECNO we simplify things and get the whole
639 * tree if we need more than a single parent.
640 * The tree can collapse while we don't have it locked, so the
641 * page we are looking for may be gone. If so we are at
642 * the right most end of the leaf pages and are done.
646 if (npg != NULL && (ret = __memp_fput(dbmp,
647 dbc->thread_info, npg, dbc->priority)) != 0)
651 ncp = (BTREE_CURSOR *)ndbc->internal;
652 if (clear_root == 1) {
653 ncp->sp->page = NULL;
654 LOCK_INIT(ncp->sp->lock);
656 if ((ret = __bam_stkrel(ndbc, 0)) != 0)
660 /* Case 1 -- page is empty. */
662 CTRACE(dbc, "Empty", "", start, 0);
664 sflag = CS_NEXT_WRITE;
667 if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0) {
669 if (ret == DB_NOTFOUND)
675 /* Check to see if the page is still empty. */
676 if (NUM_ENT(pg) != 0)
679 npgno = NEXT_PGNO(pg);
680 /* If this is now the root, we are very done. */
681 if (PGNO(pg) == cp->root)
684 if (npgno != PGNO_INVALID) {
685 TRY_LOCK(dbc, npgno, saved_pgno,
686 next_lock, DB_LOCK_WRITE, retry);
690 if (PREV_PGNO(pg) != PGNO_INVALID) {
691 TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
692 prev_lock, DB_LOCK_WRITE, retry);
697 __bam_dpages(dbc, 0, BTD_RELINK)) != 0)
699 c_data->compact_pages_free++;
700 if ((ret = __TLPUT(dbc, prev_lock)) != 0)
702 LOCK_INIT(prev_lock);
703 if ((ret = __TLPUT(dbc, next_lock)) != 0)
705 LOCK_INIT(next_lock);
706 goto next_no_release;
712 /* case 3 -- different parents. */
714 CTRACE(dbc, "Span", "", start, 0);
716 		 * Search the tree looking for the page containing the
717 		 * current key and the next page after it.
718 * The stack will be rooted at the page that spans
719 * the current and next pages. The two subtrees
720 * are returned below that. For BTREE the current
721 		 * page subtree will be first, while for RECNO the
722 		 * next page subtree will be first.
724 if (ndbc == NULL && (ret = __dbc_dup(dbc, &ndbc, 0)) != 0)
726 ncp = (BTREE_CURSOR *)ndbc->internal;
728 ncp->recno = cp->recno;
729 cp->recno = next_recno;
731 if ((ret = __bam_csearch(dbc, start, CS_NEXT_BOTH, 0)) != 0) {
732 if (ret == DB_NOTFOUND) {
740 * Find the top of the stack for the second subtree.
742 for (epg = cp->csp - 1; epg > cp->sp; epg--)
743 if (LEVEL(epg->page) == LEAFLEVEL)
745 DB_ASSERT(env, epg != cp->sp);
748 * Copy the root. We will have two instances of the
749 * same page, be careful not to free both.
751 BT_STK_PUSH(env, ncp, cp->sp->page, cp->sp->indx,
752 cp->sp->lock, cp->sp->lock_mode, ret);
757 /* Copy the stack containing the next page. */
758 for (epg++; epg <= cp->csp; epg++) {
759 BT_STK_PUSH(env, ncp, epg->page, epg->indx,
760 epg->lock, epg->lock_mode, ret);
764 /* adjust the stack pointer to remove these items. */
766 cp->csp -= ncp->csp - ncp->sp;
769 * If this is RECNO then we want to swap the stacks.
771 if (dbp->type == DB_RECNO) {
772 ndbc->internal = (DBC_INTERNAL *)cp;
773 dbc->internal = (DBC_INTERNAL *)ncp;
775 ncp = (BTREE_CURSOR *)ndbc->internal;
781 NEXT_PGNO(cp->csp->page) == PGNO(ncp->csp->page));
785 * The page may have emptied while we waited for the
786 * lock or the record we are looking for may have
788 * Reset npgno so we re-get this page when we go back
791 if (NUM_ENT(pg) == 0 ||
792 (dbc->dbtype == DB_RECNO &&
793 NEXT_PGNO(cp->csp->page) != PGNO(ncp->csp->page))) {
799 if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
800 if (PREV_PGNO(pg) != PGNO_INVALID) {
801 TRY_LOCK2(dbc, ndbc, PREV_PGNO(pg), prev_pgno,
802 prev_lock, DB_LOCK_WRITE, retry);
807 /* Get a fresh low numbered page. */
808 if ((ret = __bam_truncate_page(dbc,
809 &pg, ncp->csp->page, 1)) != 0)
811 if ((ret = __TLPUT(dbc, prev_lock)) != 0)
813 LOCK_INIT(prev_lock);
816 PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0);
817 if (check_dups && (ret = __bam_compact_dups(ndbc,
818 &ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
821 /* Check to see if the tree collapsed. */
822 if (PGNO(ncp->csp->page) == ncp->root)
826 npgno = NEXT_PGNO(pg);
827 PTRACE(dbc, "SDups", PGNO(pg), start, 0);
828 if (check_dups && (ret =
829 __bam_compact_dups(dbc, &cp->csp->page,
830 factor, 1, c_data, &pgs_done)) != 0)
834 * We may have dropped our locks, check again
835 * to see if we still need to fill this page and
836 * we are in a spanning situation.
839 if (P_FREESPACE(dbp, pg) <= factor ||
840 cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1)
844 * Try to move things into a single parent.
847 for (epg = cp->sp; epg != cp->csp; epg++) {
848 PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
849 if ((ret = __bam_merge_internal(dbc,
850 ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
856 if (ret != 0 && ret != DB_LOCK_NOTGRANTED)
859 * If we merged the parent, then we no longer span.
860 * Otherwise if we tried to merge the parent but would
861 			 * block on one of the other leaf pages, try again.
862 * If we did not merge any records of the parent,
863 * exit to commit any local transactions and try again.
865 if (merged || ret == DB_LOCK_NOTGRANTED) {
870 if (cp->csp->page == NULL)
873 next_recno = cp->recno;
876 PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
878 		/* If we remove the next page, then we need its next page locked. */
879 npgno = NEXT_PGNO(ncp->csp->page);
880 if (npgno != PGNO_INVALID) {
881 TRY_LOCK2(dbc, ndbc, npgno,
882 nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
886 if ((ret = __bam_merge(dbc,
887 ndbc, factor, stop, c_data, &isdone)) != 0)
891 * __bam_merge could have freed our stack if it
892 * deleted a page possibly collapsing the tree.
894 if (cp->csp->page == NULL)
896 cp->recno += NUM_ENT(pg);
898 if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
900 LOCK_INIT(nnext_lock);
902 /* If we did not bump to the next page something did not fit. */
903 if (npgno != NEXT_PGNO(pg)) {
904 npgno = NEXT_PGNO(pg);
908 /* Case 2 -- same parents. */
909 CTRACE(dbc, "Sib", "", start, 0);
911 __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0) {
912 if (ret == DB_NOTFOUND) {
920 DB_ASSERT(env, IS_DIRTY(pg));
922 PGNO(pg) == cp->root || IS_DIRTY(cp->csp[-1].page));
924 /* We now have a write lock, recheck the page. */
925 if ((nentry = NUM_ENT(pg)) == 0) {
930 /* Check duplicate trees, we have a write lock on the page. */
931 PTRACE(dbc, "SibDup", PGNO(pg), start, 0);
932 if (check_dups && (ret =
933 __bam_compact_dups(dbc, &cp->csp->page,
934 factor, 1, c_data, &pgs_done)) != 0)
937 npgno = NEXT_PGNO(pg);
939 /* Check to see if the tree collapsed. */
940 if (PGNO(pg) == cp->root)
942 DB_ASSERT(env, cp->csp - cp->sp == 1);
944 /* After re-locking check to see if we still need to fill. */
945 if (P_FREESPACE(dbp, pg) <= factor) {
947 PGNO(pg) > c_data->compact_truncate) {
948 if (PREV_PGNO(pg) != PGNO_INVALID) {
949 TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
950 prev_lock, DB_LOCK_WRITE, retry);
954 if (npgno != PGNO_INVALID) {
955 TRY_LOCK(dbc, npgno, saved_pgno,
956 next_lock, DB_LOCK_WRITE, retry);
961 /* Get a fresh low numbered page. */
962 if ((ret = __bam_truncate_page(dbc,
965 if ((ret = __TLPUT(dbc, prev_lock)) != 0)
967 LOCK_INIT(prev_lock);
968 if ((ret = __TLPUT(dbc, next_lock)) != 0)
970 LOCK_INIT(next_lock);
975 /* If they have the same parent, just dup the cursor */
976 if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
978 if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
980 ncp = (BTREE_CURSOR *)ndbc->internal;
983 * ncp->recno needs to have the recno of the next page.
984 * Bump it by the number of records on the current page.
986 ncp->recno += NUM_ENT(pg);
989 pgno = PGNO(cp->csp->page);
990 ppgno = PGNO(cp->csp[-1].page);
991 /* Fetch pages until we fill this one. */
992 while (!isdone && npgno != PGNO_INVALID &&
993 P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) {
995 			 * Merging may have to free the parent page; if it does,
996 			 * refetch it, but do it descending the tree.
999 if ((ppg = epg->page) == NULL) {
1000 if ((ret = __memp_fput(dbmp, dbc->thread_info,
1001 cp->csp->page, dbc->priority)) != 0)
1004 if ((ret = __memp_fget(dbmp, &ppgno, dbc->thread_info,
1005 dbc->txn, DB_MPOOL_DIRTY, &ppg)) != 0)
1007 if ((ret = __memp_fget(dbmp, &pgno, dbc->thread_info,
1008 dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
1015 * If our current position is the last one on a parent
1016 * page, then we are about to merge across different
1017 * internal nodes. Thus, we need to lock higher up
1018 * in the tree. We will exit the routine and commit
1019 * what we have done so far. Set spanp so we know
1020 * we are in this case when we come back.
1022 if (epg->indx == NUM_ENT(ppg) - 1) {
1025 next_recno = cp->recno;
1030 /* Lock and get the next page. */
1031 TRY_LOCK(dbc, npgno,
1032 saved_pgno, saved_lock, DB_LOCK_WRITE, retry);
1035 if ((ret = __LPUT(dbc, ncp->lock)) != 0)
1037 ncp->lock = saved_lock;
1038 LOCK_INIT(saved_lock);
1039 saved_pgno = PGNO_INVALID;
1041 if ((ret = __memp_fget(dbmp, &npgno,
1042 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &npg)) != 0)
1046 PGNO(pg) > c_data->compact_truncate) {
1047 if (PREV_PGNO(pg) != PGNO_INVALID) {
1048 TRY_LOCK(dbc, PREV_PGNO(pg),
1049 prev_pgno, prev_lock, DB_LOCK_WRITE, retry);
1054 /* Get a fresh low numbered page. */
1055 if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0)
1057 if ((ret = __TLPUT(dbc, prev_lock)) != 0)
1059 LOCK_INIT(prev_lock);
1062 c_data->compact_pages_examine++;
1064 PTRACE(dbc, "MDups", PGNO(npg), start, 0);
1065 if (check_dups && (ret = __bam_compact_dups(ndbc,
1066 &npg, factor, 1, c_data, &pgs_done)) != 0)
1069 npgno = NEXT_PGNO(npg);
1070 if (npgno != PGNO_INVALID) {
1071 TRY_LOCK(dbc, npgno,
1072 nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
1077 /* copy the common parent to the stack. */
1078 BT_STK_PUSH(env, ncp, ppg,
1079 epg->indx + 1, epg->lock, epg->lock_mode, ret);
1083 /* Put the page on the stack. */
1084 BT_STK_ENTER(env, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret);
1086 LOCK_INIT(ncp->lock);
1090 * Merge the pages. This will either free the next
1091 * page or just update its parent pointer.
1093 PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
1094 if ((ret = __bam_merge(dbc,
1095 ndbc, factor, stop, c_data, &isdone)) != 0)
1100 if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
1102 LOCK_INIT(nnext_lock);
1105 * __bam_merge could have freed our stack if it
1106 * deleted a page possibly collapsing the tree.
1108 if (cp->csp->page == NULL)
1110 /* If we did not bump to the next page something did not fit. */
1111 if (npgno != NEXT_PGNO(pg))
1115 /* Bottom of the main loop. Move to the next page. */
1116 npgno = NEXT_PGNO(pg);
1117 cp->recno += NUM_ENT(pg);
1118 next_recno = cp->recno;
1122 ncp = (BTREE_CURSOR *)ndbc->internal;
1123 if (ncp->sp->page == cp->sp->page) {
1124 ncp->sp->page = NULL;
1125 LOCK_INIT(ncp->sp->lock);
1127 if ((ret = __bam_stkrel(ndbc,
1128 pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
1132 * Unlatch the tree before trying to lock the next page. We must
1133 * unlatch to avoid a latch deadlock but we want to hold the
1134 * lock on the parent node so this leaf cannot be unlinked.
1137 if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
1139 if ((ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
1141 if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
1143 if ((ret = __TLPUT(dbc, saved_lock)) != 0)
1145 if ((ret = __TLPUT(dbc, prev_lock)) != 0)
1151 if (npgno == PGNO_INVALID || c_data->compact_pages == 0)
1155 		 * If we are at the end of this parent, commit the
1156 * transaction so we don't tie things up.
1158 if (pgs_done != 0 && *spanp) {
1159 deleted: if (((ret = __bam_stkrel(ndbc, 0)) != 0 ||
1160 (ret = __dbc_close(ndbc)) != 0))
1166 /* Reget the next page to look at. */
1167 cp->recno = next_recno;
1168 if ((ret = __memp_fget(dbmp, &npgno,
1169 dbc->thread_info, dbc->txn, 0, &pg)) != 0)
1171 cp->csp->lock = next_lock;
1172 LOCK_INIT(next_lock);
1174 /* If we did not do anything we can drop the metalock. */
1175 if (pgs_done == 0 && (ret = __LPUT(dbc, metalock)) != 0)
1183 * We come here if pg came from cp->csp->page and could
1184 * have already been fput.
1189 	 * Don't release locks (STK_PGONLY) if we had an error; we could reveal
1190 	 * a bad tree to a dirty reader.  Wait till the abort to free the locks.
1193 if (dbc->txn != NULL && ret != 0)
1194 sflag |= STK_PGONLY;
1196 ncp = (BTREE_CURSOR *)ndbc->internal;
1197 if (npg == ncp->csp->page)
1199 if (ncp->sp->page == cp->sp->page) {
1200 ncp->sp->page = NULL;
1201 LOCK_INIT(ncp->sp->lock);
1203 if ((t_ret = __bam_stkrel(ndbc, sflag)) != 0 && ret == 0)
1205 else if ((t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
1208 if (pg == cp->csp->page)
1210 if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
1213 if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
1216 if (pg != NULL && (t_ret =
1218 dbc->thread_info, pg, dbc->priority) != 0) && ret == 0)
1220 if (npg != NULL && (t_ret =
1222 dbc->thread_info, npg, dbc->priority) != 0) && ret == 0)
1231 * __bam_merge -- do actual merging of leaf pages.
1234 __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
1241 BTREE_CURSOR *cp, *ncp;
1247 DB_ASSERT(NULL, dbc != NULL);
1248 DB_ASSERT(NULL, ndbc != NULL);
1250 cp = (BTREE_CURSOR *)dbc->internal;
1251 ncp = (BTREE_CURSOR *)ndbc->internal;
1253 npg = ncp->csp->page;
1255 nent = NUM_ENT(npg);
1257 /* If the page is empty just throw it away. */
1261 /* Find if the stopping point is on this page. */
1262 if (stop != NULL && stop->size != 0) {
1263 if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
1270 * If there is too much data then just move records one at a time.
1271 * Otherwise copy the data space over and fix up the index table.
1272 	 * If we are on the leftmost child we will affect our parent's
1273 	 * index entry, so we call merge_records to figure out key sizes.
1275 if ((dbc->dbtype == DB_BTREE &&
1276 ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
1277 (int)(P_FREESPACE(dbp, pg) -
1278 ((dbp->pgsize - P_OVERHEAD(dbp)) -
1279 P_FREESPACE(dbp, npg))) < (int)factor)
1280 ret = __bam_merge_records(dbc, ndbc, factor, c_data);
1282 free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
1288 __bam_merge_records(dbc, ndbc, factor, c_data)
1294 BKEYDATA *bk, *tmp_bk;
1296 BTREE_CURSOR *cp, *ncp;
1298 DBT a, b, data, hdr;
1302 db_indx_t adj, indx, nent, *ninp, pind;
1304 u_int32_t freespace, nksize, pfree, size;
1305 int first_dup, is_dup, next_dup, n_ok, ret;
1306 size_t (*func) __P((DB *, const DBT *, const DBT *));
1310 t = dbp->bt_internal;
1311 cp = (BTREE_CURSOR *)dbc->internal;
1312 ncp = (BTREE_CURSOR *)ndbc->internal;
1314 npg = ncp->csp->page;
1315 memset(&hdr, 0, sizeof(hdr));
1320 nent = NUM_ENT(npg);
1322 DB_ASSERT(env, nent != 0);
1324 /* See if we want to swap out this page. */
1325 if (c_data->compact_truncate != PGNO_INVALID &&
1326 PGNO(npg) > c_data->compact_truncate) {
1327 /* Get a fresh low numbered page. */
1328 if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0)
1332 ninp = P_INP(dbp, npg);
1335 * pg is the page that is being filled, it is in the stack in cp.
1336 * npg is the next page, it is in the stack in ncp.
1338 freespace = P_FREESPACE(dbp, pg);
1340 adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
1342 * Loop through the records and find the stopping point.
1344 for (indx = 0; indx < nent; indx += adj) {
1345 bk = GET_BKEYDATA(dbp, npg, indx);
1347 /* Size of the key. */
1348 size = BITEM_PSIZE(bk);
1350 /* Size of the data. */
1351 if (TYPE(pg) == P_LBTREE)
1352 size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1));
1354 		 * If we are at a duplicate set, skip ahead and
1355 		 * get the total size for the group.
1358 if (TYPE(pg) == P_LBTREE &&
1359 indx < nent - adj &&
1360 ninp[indx] == ninp[indx + adj]) {
1362 /* Size of index for key reference. */
1363 size += sizeof(db_indx_t);
1365 /* Size of data item. */
1366 size += BITEM_PSIZE(
1367 GET_BKEYDATA(dbp, npg, indx + n_ok));
1369 } while (indx + n_ok < nent &&
1370 ninp[indx] == ninp[indx + n_ok]);
1372 /* if the next set will not fit on the page we are done. */
1373 if (freespace < size)
1377 * Otherwise figure out if we are past the goal and if
1378 * adding this set will put us closer to the goal than
1381 if ((freespace - size) < factor) {
1382 if (freespace - factor > factor - (freespace - size))
1390 /* If we have hit the first record then there is nothing we can move. */
1393 if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
1395 return (__bam_merge_pages(dbc, ndbc, c_data));
1399 * We need to update npg's parent key. Avoid creating a new key
1400 * that will be too big. Get what space will be available on the
1401 * parents. Then if there will not be room for this key, see if
1402 	 * prefix compression will make it work; if not, back up until we
1403 	 * find something that will.  (Needless to say, this is a very
1404 	 * unlikely event.)  If we are deleting this page then we will
1405 	 * need to propagate the next key to our grandparents, so we
1406 * see if that will fit.
1408 pfree = dbp->pgsize;
1409 for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--)
1410 if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
1411 bi = GET_BINTERNAL(dbp, epg->page, epg->indx);
1412 /* Add back in the key we will be deleting. */
1413 freespace += BINTERNAL_PSIZE(bi->len);
1414 if (freespace < pfree)
1421 * If we are at the end, we will delete this page. We need to
1422 * check the next parent key only if we are the leftmost page and
1423 * will therefore have to propagate the key up the tree.
1426 if (ncp->csp[-1].indx != 0 || ncp->csp[-1].entries == 1 ||
1427 BINTERNAL_PSIZE(GET_BINTERNAL(dbp,
1428 ncp->csp[-1].page, 1)->len) <= pfree)
1429 return (__bam_merge_pages(dbc, ndbc, c_data));
1432 bk = GET_BKEYDATA(dbp, npg, indx);
1433 if (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
1434 if (F_ISSET(dbc, DBC_OPD)) {
1435 if (dbp->dup_compare == __bam_defcmp)
1436 func = __bam_defpfx;
1440 func = t->bt_prefix;
1444 /* Skip to the beginning of a duplicate set. */
1445 while (indx != 0 && ninp[indx] == ninp[indx - adj])
1448 while (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
1449 if (B_TYPE(bk->type) != B_KEYDATA)
1452 * Figure out if we can truncate this key.
1453 * Code borrowed from bt_split.c
1457 tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj);
1458 if (B_TYPE(tmp_bk->type) != B_KEYDATA)
1460 memset(&a, 0, sizeof(a));
1461 a.size = tmp_bk->len;
1462 a.data = tmp_bk->data;
1463 memset(&b, 0, sizeof(b));
1466 nksize = (u_int32_t)func(dbp, &a, &b);
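				/*
				 * Illustrative example: if the previous key
				 * is "apple" and this key is "apricot", the
				 * prefix function returns 3, since the 3-byte
				 * prefix "apr" is enough to separate the two
				 * keys, so only BINTERNAL_PSIZE(3) needs to
				 * fit in the parent.
				 */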
1467 if (BINTERNAL_PSIZE(nksize) < pfree)
1470 /* Skip to the beginning of a duplicate set. */
1473 } while (indx != 0 && ninp[indx] == ninp[indx - adj]);
1475 bk = GET_BKEYDATA(dbp, npg, indx);
1479 * indx references the first record that will not move to the previous
1480 * page. If it is 0 then we could not find a key that would fit in
1481 * the parent that would permit us to move any records.
1485 DB_ASSERT(env, indx <= nent);
1487 /* Loop through the records and move them from npg to pg. */
1488 no_check: is_dup = first_dup = next_dup = 0;
1490 npg = ncp->csp->page;
1491 DB_ASSERT(env, IS_DIRTY(pg));
1492 DB_ASSERT(env, IS_DIRTY(npg));
1493 ninp = P_INP(dbp, npg);
1495 bk = GET_BKEYDATA(dbp, npg, 0);
1496 /* Figure out if we are in a duplicate group or not. */
1497 if ((NUM_ENT(npg) % 2) == 0) {
1498 if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) {
1506 } else if (next_dup) {
1514 if (is_dup && !first_dup && (pind % 2) == 0) {
1515 /* Duplicate key. */
1516 if ((ret = __bam_adjindx(dbc,
1517 pg, pind, pind - P_INDX, 1)) != 0)
1521 } else switch (B_TYPE(bk->type)) {
1524 hdr.size = SSZA(BKEYDATA, data);
1525 data.size = bk->len;
1526 data.data = bk->data;
1527 if ((ret = __db_pitem(dbc, pg, pind,
1528 BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0)
1533 data.size = BOVERFLOW_SIZE;
1535 if ((ret = __db_pitem(dbc, pg, pind,
1536 BOVERFLOW_SIZE, &data, NULL)) != 0)
1541 "Unknown record format, page %lu, indx 0",
1547 if (next_dup && (NUM_ENT(npg) % 2) == 0) {
1548 if ((ret = __bam_adjindx(ndbc,
1549 npg, 0, O_INDX, 0)) != 0)
1552 if ((ret = __db_ditem(ndbc,
1553 npg, 0, BITEM_SIZE(bk))) != 0)
1557 } while (--indx != 0);
1559 DB_ASSERT(env, NUM_ENT(npg) != 0);
1562 (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) {
1563 if (TYPE(pg) == P_LBTREE)
1565 if ((ret = __bam_adjust(ndbc, -adjust)) != 0)
1568 if ((ret = __bam_adjust(dbc, adjust)) != 0)
1572 /* Update parent with new key. */
1573 if (ndbc->dbtype == DB_BTREE &&
1574 (ret = __bam_pupdate(ndbc, pg)) != 0)
1577 done: if (cp->sp->page == ncp->sp->page) {
1578 cp->sp->page = NULL;
1579 LOCK_INIT(cp->sp->lock);
1581 ret = __bam_stkrel(ndbc, STK_CLRDBC);
1587 __bam_merge_pages(dbc, ndbc, c_data)
1591 BTREE_CURSOR *cp, *ncp;
1596 db_indx_t nent, *ninp, *pinp;
1602 COMPQUIET(ppgno, PGNO_INVALID);
1605 cp = (BTREE_CURSOR *)dbc->internal;
1606 ncp = (BTREE_CURSOR *)ndbc->internal;
1608 npg = ncp->csp->page;
1609 memset(&hdr, 0, sizeof(hdr));
1610 nent = NUM_ENT(npg);
1612 /* If the page is empty just throw it away. */
1617 npg = ncp->csp->page;
1618 DB_ASSERT(dbp->env, IS_DIRTY(pg));
1619 DB_ASSERT(dbp->env, IS_DIRTY(npg));
1620 DB_ASSERT(dbp->env, nent == NUM_ENT(npg));
1622 /* Bulk copy the data to the new page. */
1623 len = dbp->pgsize - HOFFSET(npg);
1624 if (DBC_LOGGING(dbc)) {
1625 memset(&hdr, 0, sizeof(hdr));
1627 hdr.size = LOFFSET(dbp, npg);
1628 memset(&data, 0, sizeof(data));
1629 data.data = (u_int8_t *)npg + HOFFSET(npg);
1631 if ((ret = __bam_merge_log(dbp,
1632 dbc->txn, &LSN(pg), 0, PGNO(pg),
1633 &LSN(pg), PGNO(npg), &LSN(npg), &hdr, &data, 0)) != 0)
1636 LSN_NOT_LOGGED(LSN(pg));
1638 bp = (u_int8_t *)pg + HOFFSET(pg) - len;
1639 memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len);
1641 /* Copy index table offset by what was there already. */
1642 pinp = P_INP(dbp, pg) + NUM_ENT(pg);
1643 ninp = P_INP(dbp, npg);
1644 for (i = 0; i < NUM_ENT(npg); i++)
1645 *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg));
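	/*
	 * Worked example (illustrative only): with a 4096-byte page, if
	 * HOFFSET(pg) is 3500 and HOFFSET(npg) is 3800, then len is 296 and
	 * the copied data starts at offset 3204 on pg.  An item that lived
	 * at offset 3900 on npg is 100 bytes into the copied region, so its
	 * new offset is 3900 - (4096 - 3500) = 3304.
	 */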
1650 HOFFSET(npg) += len;
1652 if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) {
1654 * There are two cases here regarding the stack.
1655 		 * Either we have two two-level stacks but only ndbc
1656 * references the parent page or we have a multilevel
1657 * stack and only ndbc has an entry for the spanning
1660 if (TYPE(pg) == P_LBTREE)
1662 if ((ret = __bam_adjust(ndbc, -i)) != 0)
1665 if ((ret = __bam_adjust(dbc, i)) != 0)
1671 * __bam_dpages may decide to collapse the tree.
1672 * This can happen if we have the root and there
1673 * are exactly 2 pointers left in it.
1674 * If it can collapse the tree we must free the other
1675 		 * stack since it will no longer be valid.  This
1676 * must be done before hand because we cannot
1677 * hold a page pinned if it might be truncated.
1679 if ((ret = __bam_relink(dbc,
1680 ncp->csp->page, cp->csp->page, PGNO_INVALID)) != 0)
1682 /* Drop the duplicate reference to the sub tree root. */
1683 cp->sp->page = NULL;
1684 LOCK_INIT(cp->sp->lock);
1685 if (PGNO(ncp->sp->page) == ncp->root &&
1686 NUM_ENT(ncp->sp->page) == 2) {
1687 if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0)
1689 level = LEVEL(ncp->sp->page);
1690 ppgno = PGNO(ncp->csp[-1].page);
1693 if (c_data->compact_truncate > PGNO(npg))
1694 c_data->compact_truncate--;
1695 if ((ret = __bam_dpages(ndbc,
1696 0, ndbc->dbtype == DB_RECNO ? 0 : BTD_UPDATE)) != 0)
1699 c_data->compact_pages_free++;
1700 c_data->compact_pages--;
1702 if ((ret = __memp_fget(dbmp, &ncp->root,
1703 dbc->thread_info, dbc->txn, 0, &npg)) != 0)
1705 if (level == LEVEL(npg))
1707 if ((ret = __memp_fput(dbmp,
1708 dbc->thread_info, npg, dbc->priority)) != 0)
1712 c_data->compact_levels++;
1713 c_data->compact_pages_free++;
1714 if (c_data->compact_truncate > ppgno)
1715 c_data->compact_truncate--;
1716 if (c_data->compact_pages != 0)
1717 c_data->compact_pages--;
1725 * __bam_merge_internal --
1726 * Merge internal nodes of the tree.
1729 __bam_merge_internal(dbc, ndbc, level, c_data, merged)
1735 BINTERNAL bi, *bip, *fip;
1736 BTREE_CURSOR *cp, *ncp;
1740 EPG *epg, *save_csp, *nsave_csp;
1743 db_indx_t first, indx, pind;
1745 int32_t nrecs, trecs;
1747 u_int32_t freespace, pfree;
1750 COMPQUIET(bip, NULL);
1751 COMPQUIET(ppgno, PGNO_INVALID);
1752 DB_ASSERT(NULL, dbc != NULL);
1753 DB_ASSERT(NULL, ndbc != NULL);
1756 	 * ndbc will contain the dominating parent of the subtree.
1757 * dbc will have the tree containing the left child.
1759 * The stacks descend to the leaf level.
1760 * If this is a recno tree then both stacks will start at the root.
1764 cp = (BTREE_CURSOR *)dbc->internal;
1765 ncp = (BTREE_CURSOR *)ndbc->internal;
1770 * Set the stacks to the level requested.
1771 * Save the old value to restore when we exit.
1774 cp->csp = &cp->csp[-level + 1];
1778 nsave_csp = ncp->csp;
1779 ncp->csp = &ncp->csp[-level + 1];
1780 npg = ncp->csp->page;
1781 indx = NUM_ENT(npg);
1784 * The caller may have two stacks that include common ancestors, we
1785 * check here for convenience.
1790 if (TYPE(pg) == P_IBTREE) {
1792 * Check for overflow keys on both pages while we have
1796 __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
1799 __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
1804 * If we are about to move data off the left most page of an
1805 * internal node we will need to update its parents, make sure there
1806 * will be room for the new key on all the parents in the stack.
1807 * If not, move less data.
1810 if (TYPE(pg) == P_IBTREE) {
1811 /* See where we run out of space. */
1812 freespace = P_FREESPACE(dbp, pg);
1814 * The leftmost key of an internal page is not accurate.
1815 * Go up the tree to find a non-leftmost parent.
1818 while (--epg >= ncp->sp && epg->indx == 0)
1820 fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
1824 size = BINTERNAL_PSIZE(bip->len);
1825 if (size > freespace)
1828 if (++indx >= NUM_ENT(npg))
1830 bip = GET_BINTERNAL(dbp, npg, indx);
1833 /* See if we are deleting the page and we are not left most. */
1834 if (indx == NUM_ENT(npg) && epg[-1].indx != 0)
1837 pfree = dbp->pgsize;
1838 for (epg--; epg >= ncp->sp; epg--)
1839 if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
1840 bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
1841 /* Add back in the key we will be deleting. */
1842 freespace += BINTERNAL_PSIZE(bip->len);
1843 if (freespace < pfree)
1850 /* If we are at the end of the page we will delete it. */
1851 if (indx == NUM_ENT(npg)) {
1852 if (NUM_ENT(epg[-1].page) == 1)
1855 GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1);
1857 bip = GET_BINTERNAL(dbp, npg, indx);
1859 /* Back up until we have a key that fits. */
1860 while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) {
1862 bip = GET_BINTERNAL(dbp, npg, indx);
1868 fits: memset(&bi, 0, sizeof(bi));
1869 memset(&hdr, 0, sizeof(hdr));
1870 memset(&data, 0, sizeof(data));
1874 * Copy data between internal nodes till one is full
1875 * or the other is empty.
1880 if (dbc->dbtype == DB_BTREE) {
1881 bip = GET_BINTERNAL(dbp, npg, 0);
1882 size = fip == NULL ?
1883 BINTERNAL_SIZE(bip->len) :
1884 BINTERNAL_SIZE(fip->len);
1885 if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
1889 data.size = bip->len;
1890 data.data = bip->data;
1892 data.size = fip->len;
1893 data.data = fip->data;
1896 B_TSET(bi.type, bip->type);
1897 bi.pgno = bip->pgno;
1898 bi.nrecs = bip->nrecs;
1900 hdr.size = SSZA(BINTERNAL, data);
1901 if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))
1902 nrecs = (int32_t)bip->nrecs;
1904 rk = GET_RINTERNAL(dbp, npg, 0);
1905 size = RINTERNAL_SIZE;
1906 if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
1911 nrecs = (int32_t)rk->nrecs;
1914 * Try to lock the subtree leaf records without waiting.
1915 * We must lock the subtree below the record we are merging
1916 		 * and the one after it since that is where a search will wind
1917 * up if it has already looked at our parent. After the first
1918 * move we have the current subtree already locked.
1919 * If we merged any records then we will revisit this
1920 * node when we merge its leaves. If not we will return
1921 * NOTGRANTED and our caller will do a retry. We only
1922 		 * need to do this if we are in a transaction.  If not then
1923 * we cannot abort and things will be hosed up on error
1926 if (dbc->txn != NULL && (ret = __bam_lock_tree(ndbc,
1927 ncp->csp, nsave_csp, first,
1928 NUM_ENT(ncp->csp->page) == 1 ? 1 : 2)) != 0) {
1929 if (ret != DB_LOCK_NOTGRANTED)
1934 if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
1938 /* reset size to be for the record being deleted. */
1939 size = BINTERNAL_SIZE(bip->len);
1942 if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0)
1946 } while (--indx != 0);
1953 ret = __bam_adjust(dbc, trecs);
1958 if ((ret = __bam_adjust(ndbc, -trecs)) != 0)
1964 * Either we emptied the page or we need to update its
1965 * parent to reflect the first page we now point to.
1966 * First get rid of the bottom of the stack,
1967 * bam_dpages will clear the stack. Maintain transactional
1968 * locks on the leaf pages to protect changes at this level.
1971 if ((ret = __memp_fput(dbmp, dbc->thread_info,
1972 nsave_csp->page, dbc->priority)) != 0)
1974 nsave_csp->page = NULL;
1975 if ((ret = __TLPUT(dbc, nsave_csp->lock)) != 0)
1977 LOCK_INIT(nsave_csp->lock);
1979 } while (nsave_csp != ncp->csp);
1981 if (NUM_ENT(npg) == 0) {
1983 * __bam_dpages may decide to collapse the tree
1984 * so we need to free our other stack. The tree
1985 		 * will change in height and our stack will no longer
1989 cp->sp->page = NULL;
1990 LOCK_INIT(cp->sp->lock);
1991 if (PGNO(ncp->sp->page) == ncp->root &&
1992 NUM_ENT(ncp->sp->page) == 2) {
1993 if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
1995 level = LEVEL(ncp->sp->page);
1996 ppgno = PGNO(ncp->csp[-1].page);
2000 if (c_data->compact_truncate > PGNO(npg))
2001 c_data->compact_truncate--;
2002 ret = __bam_dpages(ndbc,
2003 0, ndbc->dbtype == DB_RECNO ?
2004 BTD_RELINK : BTD_UPDATE | BTD_RELINK);
2005 c_data->compact_pages_free++;
2006 if (ret == 0 && level != 0) {
2007 if ((ret = __memp_fget(dbmp, &ncp->root,
2008 dbc->thread_info, dbc->txn, 0, &npg)) != 0)
2010 if (level == LEVEL(npg))
2012 if ((ret = __memp_fput(dbmp,
2013 dbc->thread_info, npg, dbc->priority)) != 0)
2017 c_data->compact_levels++;
2018 c_data->compact_pages_free++;
2019 if (c_data->compact_truncate > ppgno)
2020 c_data->compact_truncate--;
2021 if (c_data->compact_pages != 0)
2022 c_data->compact_pages--;
2026 ret = __bam_pupdate(ndbc, npg);
2028 if (NUM_ENT(npg) != 0 &&
2029 c_data->compact_truncate != PGNO_INVALID &&
2030 PGNO(npg) > c_data->compact_truncate &&
2031 ncp->csp != ncp->sp) {
2032 if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0)
2035 if (c_data->compact_truncate != PGNO_INVALID &&
2036 PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
2037 if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0)
2046 err: cp->csp = save_csp;
2047 ncp->csp = nsave_csp;
2053 * __bam_compact_dups -- try to compress off page dup trees.
2054 * We may or may not have a write lock on this page.
2057 __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
2075 int isdone, level, ret, span, t_ret;
2081 DB_ASSERT(NULL, dbc != NULL);
2085 cp = (BTREE_CURSOR *)dbc->internal;
2088 for (i = 0; i < NUM_ENT(pg); i++) {
2089 bo = GET_BOVERFLOW(dbp, pg, i);
2090 if (B_TYPE(bo->type) == B_KEYDATA)
2092 c_data->compact_pages_examine++;
2093 if (bo->pgno > c_data->compact_truncate) {
2097 * The caller should have the page at
2098 * least read locked. Drop the buffer
2099 * and get the write lock.
2102 if ((ret = __memp_fput(dbmp, dbc->thread_info,
2103 pg, dbc->priority)) != 0)
2106 if ((ret = __db_lget(dbc, 0, pgno,
2107 DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
2110 if ((ret = __memp_fget(dbmp, &pgno,
2112 dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
2117 __bam_truncate_root_page(dbc, pg, i, c_data)) != 0)
2119 /* Just in case it should move. Could it? */
2120 bo = GET_BOVERFLOW(dbp, pg, i);
2123 if (B_TYPE(bo->type) == B_OVERFLOW) {
2124 if ((ret = __bam_truncate_overflow(dbc,
2125 bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
2131 * Take a peek at the root. If it's a leaf then
2132 * there is no tree here, avoid all the trouble.
2134 if ((ret = __memp_fget(dbmp, &bo->pgno,
2135 dbc->thread_info, dbc->txn, 0, &dpg)) != 0)
2139 if ((ret = __memp_fput(dbmp,
2140 dbc->thread_info, dpg, dbc->priority)) != 0)
2142 if (level == LEAFLEVEL)
2144 if ((ret = __dbc_newopd(dbc, bo->pgno, NULL, &opd)) != 0)
2148 * The caller should have the page at
2149 * least read locked. Drop the buffer
2150 * and get the write lock.
2153 if ((ret = __memp_fput(dbmp, dbc->thread_info,
2154 pg, dbc->priority)) != 0)
2157 if ((ret = __db_lget(dbc, 0, pgno,
2158 DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
2161 if ((ret = __memp_fget(dbmp, &pgno,
2163 dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
2168 memset(&start, 0, sizeof(start));
2170 if ((ret = __bam_compact_int(opd, &start,
2171 NULL, factor, &span, c_data, &isdone)) != 0)
2175 if (start.data != NULL)
2176 __os_free(env, start.data);
2181 ret = __dbc_close(opd);
2187 err: if (opd != NULL && (t_ret = __dbc_close(opd)) != 0 && ret == 0)
2193 * __bam_truncate_page -- swap a page with a lower numbered page.
2194  *	The cursor has a stack which includes at least the
2195 * immediate parent of this page.
2198 __bam_truncate_page(dbc, pgp, opg, update_parent)
2210 db_pgno_t newpgno, oldpgno, *pgnop;
2213 DB_ASSERT(NULL, dbc != NULL);
2218 * We want to free a page that lives in the part of the file that
2219 * can be truncated, so we're going to move it onto a free page
2220 * that is in the part of the file that need not be truncated.
2221 * Since the freelist is ordered now, we can simply call __db_new
2222 * which will grab the first element off the freelist; we know this
2223 * is the lowest numbered free page.
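 * For example, if the sorted freelist currently holds pages 17, 342 and
 * 981, __db_new hands back page 17, and freeing the high-numbered page we
 * are vacating leaves it available for truncation later.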
2225 if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
2226 TYPE(*pgp) == P_LBTREE ? &lock : NULL, &newpage)) != 0)
2230 * If newpage is null then __db_new would have had to allocate
2231 * a new page from the filesystem, so there is no reason
2232 * to continue this action.
2234 if (newpage == NULL)
2238 * It is possible that a higher page is allocated if other threads
2239 	 * are allocating at the same time; if so, just put it back.
2241 if (PGNO(newpage) > PGNO(*pgp)) {
2242 		/* It's unfortunate, but you can't just free a new overflow. */
2243 if (TYPE(newpage) == P_OVERFLOW)
2244 OV_LEN(newpage) = 0;
2245 if ((ret = __LPUT(dbc, lock)) != 0)
2247 return (__db_free(dbc, newpage));
2250 /* Log if necessary. */
2251 if (DBC_LOGGING(dbc)) {
2252 memset(&hdr, 0, sizeof(hdr));
2254 hdr.size = P_OVERHEAD(dbp);
2255 memset(&data, 0, sizeof(data));
2256 if (TYPE(*pgp) == P_OVERFLOW) {
2257 data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
2258 data.size = OV_LEN(*pgp);
2260 data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
2261 data.size = dbp->pgsize - HOFFSET(*pgp);
2262 hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
2264 if ((ret = __bam_merge_log(dbp, dbc->txn,
2265 &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
2266 PGNO(*pgp), &LSN(*pgp), &hdr, &data, 1)) != 0)
2269 LSN_NOT_LOGGED(LSN(newpage));
2271 oldpgno = PGNO(*pgp);
2272 newpgno = PGNO(newpage);
2274 memcpy(newpage, *pgp, dbp->pgsize);
2275 PGNO(newpage) = newpgno;
2278 /* Empty the old page. */
2279 if ((ret = __memp_dirty(dbp->mpf,
2280 pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
2282 if (TYPE(*pgp) == P_OVERFLOW)
2285 HOFFSET(*pgp) = dbp->pgsize;
2290 /* Update siblings. */
2291 switch (TYPE(newpage)) {
2296 if (NEXT_PGNO(newpage) == PGNO_INVALID &&
2297 PREV_PGNO(newpage) == PGNO_INVALID)
2299 if ((ret = __bam_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
2305 cp = (BTREE_CURSOR *)dbc->internal;
2308 	 * Now, if we free this page, it will get truncated when we free
2309 * all the pages after it in the file.
2311 ret = __db_free(dbc, *pgp);
2312 /* db_free always puts the page. */
2321 /* Update the parent. */
2324 switch (TYPE(epg->page)) {
2326 pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
2329 pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
2332 pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
2335 DB_ASSERT(dbp->env, oldpgno == *pgnop);
2336 if (DBC_LOGGING(dbc)) {
2337 if ((ret = __bam_pgno_log(dbp, dbc->txn, &LSN(epg->page),
2338 0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
2339 *pgnop, PGNO(newpage))) != 0)
2342 LSN_NOT_LOGGED(LSN(epg->page));
2344 *pgnop = PGNO(newpage);
2345 cp->csp->page = newpage;
2346 if ((ret = __TLPUT(dbc, lock)) != 0)
2351 err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
2352 (void)__TLPUT(dbc, lock);
2357 * __bam_truncate_overflow -- find overflow pages to truncate.
2358 * Walk the pages of an overflow chain and swap out
2359 * high numbered pages. We are passed the first page
2360 * but only deal with the second and subsequent pages.
2364 __bam_truncate_overflow(dbc, pgno, ppg, c_data)
2374 int have_lock, ret, t_ret;
2379 have_lock = ppg == NULL;
2381 if ((ret = __memp_fget(dbp->mpf, &pgno,
2382 dbc->thread_info, dbc->txn, 0, &page)) != 0)
2385 while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
2386 if ((ret = __memp_fput(dbp->mpf,
2387 dbc->thread_info, page, dbc->priority)) != 0)
2389 if ((ret = __memp_fget(dbp->mpf, &pgno,
2390 dbc->thread_info, dbc->txn, 0, &page)) != 0)
2392 if (pgno <= c_data->compact_truncate)
2394 if (have_lock == 0) {
2396 if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
2397 *ppg, dbc->priority)) != 0)
2400 if ((ret = __db_lget(dbc, 0, ppgno,
2401 DB_LOCK_WRITE, 0, &lock)) != 0)
2403 if ((ret = __memp_fget(dbp->mpf, &ppgno,
2405 dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
2409 if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
2413 err: if (page != NULL &&
2414 (t_ret = __memp_fput( dbp->mpf,
2415 dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
2417 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
2423 * __bam_truncate_root_page -- swap a page which is
2424 * the root of an off page dup tree or the head of an overflow.
2425  *	The page is referenced by the pg/indx passed in.
2428 __bam_truncate_root_page(dbc, pg, indx, c_data)
2439 db_pgno_t newpgno, *pgnop;
2442 COMPQUIET(c_data, NULL);
2443 COMPQUIET(bo, NULL);
2444 COMPQUIET(newpgno, PGNO_INVALID);
2447 if (TYPE(pg) == P_IBTREE) {
2448 bi = GET_BINTERNAL(dbp, pg, indx);
2449 if (B_TYPE(bi->type) == B_OVERFLOW) {
2450 bo = (BOVERFLOW *)(bi->data);
2455 bo = GET_BOVERFLOW(dbp, pg, indx);
2459 DB_ASSERT(dbp->env, IS_DIRTY(pg));
2461 if ((ret = __memp_fget(dbp->mpf, pgnop,
2462 dbc->thread_info, dbc->txn, 0, &page)) != 0)
2466 	 * If this is a multiply referenced overflow key, then we will just
2467 * copy it and decrement the reference count. This is part of a
2468 * fix to get rid of multiple references.
2470 if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
2471 if ((ret = __db_ovref(dbc, bo->pgno)) != 0)
2473 memset(&orig, 0, sizeof(orig));
2474 if ((ret = __db_goff(dbc, &orig, bo->tlen, bo->pgno,
2475 &orig.data, &orig.size)) == 0)
2476 ret = __db_poff(dbc, &orig, &newpgno);
2477 if (orig.data != NULL)
2478 __os_free(dbp->env, orig.data);
2482 if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
2484 newpgno = PGNO(page);
2485 	/* If we could not allocate from the free list, give up. */
2486 if (newpgno == *pgnop)
2490 /* Update the reference. */
2491 if (DBC_LOGGING(dbc)) {
2492 if ((ret = __bam_pgno_log(dbp,
2493 dbc->txn, &LSN(pg), 0, PGNO(pg),
2494 &LSN(pg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
2497 LSN_NOT_LOGGED(LSN(pg));
2501 err: if (page != NULL && (t_ret =
2502 __memp_fput(dbp->mpf, dbc->thread_info,
2503 page, dbc->priority)) != 0 && ret == 0)
2509  * __bam_truncate_internal_overflow -- find overflow keys
2510  *	on internal pages and, if they have high page
2511  *	numbers, swap them with lower pages and truncate them.
2512 * Note that if there are overflow keys in the internal
2513 * nodes they will get copied adding pages to the database.
2516 __bam_truncate_internal_overflow(dbc, page, c_data)
2526 COMPQUIET(bo, NULL);
2528 for (indx = 0; indx < NUM_ENT(page); indx++) {
2529 bi = GET_BINTERNAL(dbc->dbp, page, indx);
2530 if (B_TYPE(bi->type) != B_OVERFLOW)
2532 bo = (BOVERFLOW *)(bi->data);
2533 if (bo->pgno > c_data->compact_truncate && (ret =
2534 __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
2536 if ((ret = __bam_truncate_overflow(
2537 dbc, bo->pgno, NULL, c_data)) != 0)
2544 * __bam_compact_isdone ---
2546 * Check to see if the stop key specified by the caller is on the
2547 * current page, in which case we are done compacting.
2550 __bam_compact_isdone(dbc, stop, pg, isdone)
2562 cp = (BTREE_CURSOR *)dbc->internal;
2563 t = dbc->dbp->bt_internal;
2565 if (dbc->dbtype == DB_RECNO) {
2566 if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
2568 *isdone = cp->recno > recno;
2570 DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
2571 if ((ret = __bam_cmp(dbc, stop, pg, 0,
2572 t->bt_compare, &cmp)) != 0)
2581 * Lock the subtrees from the top of the stack.
2582  * The 0'th child may be in the stack and locked; otherwise iterate
2583 * through the records by calling __bam_lock_subtree.
2586 __bam_lock_tree(dbc, sp, csp, start, stop)
2589 u_int32_t start, stop;
2595 if (dbc->dbtype == DB_RECNO)
2596 pgno = GET_RINTERNAL(dbc->dbp, sp->page, 0)->pgno;
2598 pgno = GET_BINTERNAL(dbc->dbp, sp->page, 0)->pgno;
2599 cpage = (sp + 1)->page;
2601 * First recurse down the left most sub tree if it is in the cursor
2602 	 * stack.  We already have these pages latched and locked if it's a
2605 if (start == 0 && sp + 1 != csp && pgno == PGNO(cpage) &&
2606 (ret = __bam_lock_tree(dbc, sp + 1, csp, 0, NUM_ENT(cpage))) != 0)
2610 * Then recurse on the other records on the page if needed.
2611 	 * If the page is in the stack then it's already locked or
2612 * was processed above.
2614 if (start == 0 && pgno == PGNO(cpage))
2619 return (__bam_lock_subtree(dbc, sp->page, start, stop));
2624 * Lock the subtree from the current node.
2627 __bam_lock_subtree(dbc, page, indx, stop)
2630 u_int32_t indx, stop;
2640 for (; indx < stop; indx++) {
2641 if (dbc->dbtype == DB_RECNO)
2642 pgno = GET_RINTERNAL(dbc->dbp, page, indx)->pgno;
2644 pgno = GET_BINTERNAL(dbc->dbp, page, indx)->pgno;
2645 if (LEVEL(page) - 1 == LEAFLEVEL) {
2646 if ((ret = __db_lget(dbc, 0, pgno,
2647 DB_LOCK_WRITE, DB_LOCK_NOWAIT, &lock)) != 0) {
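/*
 * With DB_LOCK_NOWAIT a busy lock can be reported as a deadlock;
 * normalize it to DB_LOCK_NOTGRANTED so the caller sees a single
 * "try again" error.
 */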
2648 if (ret == DB_LOCK_DEADLOCK)
2649 return (DB_LOCK_NOTGRANTED);
2653 if ((ret = __memp_fget(dbp->mpf, &pgno,
2654 dbc->thread_info, dbc->txn, 0, &cpage)) != 0)
2656 ret = __bam_lock_subtree(dbc, cpage, 0, NUM_ENT(cpage));
2657 if ((t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
2658 cpage, dbc->priority)) != 0 && ret == 0)
2667 #ifdef HAVE_FTRUNCATE
2669 * __bam_savekey -- save the key from an internal page.
2670 * We need to save information so that we can
2671 * fetch the next internal node of the tree. This means
2672 * we need the btree key on this current page, or the
2673 * next record number.
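 *	For a RECNO tree the saved "key" is the running record count on
 *	the page; for a BTREE it is the last key on the page, descending
 *	toward the leaf level if that key turns out to be empty.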
2676 __bam_savekey(dbc, next, start)
2690 db_indx_t indx, top;
2691 db_pgno_t pgno, saved_pgno;
2699 cp = (BTREE_CURSOR *)dbc->internal;
2703 if (dbc->dbtype == DB_RECNO) {
2705 for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
2706 ri = GET_RINTERNAL(dbp, pg, indx);
2707 cp->recno += ri->nrecs;
2709 return (__db_retcopy(env, start, &cp->recno,
2710 sizeof(cp->recno), &start->data, &start->ulen));
2714 bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
2718 saved_pgno = PGNO_INVALID;
2719 /* If there is a single record on the page it may have an empty key. */
2722 * We should not have an empty data page, since we just
2723 * compacted things; check anyway and punt.
2725 if (NUM_ENT(pg) == 0)
2729 if (pg != cp->csp->page &&
2730 (ret = __memp_fput(dbp->mpf,
2731 dbc->thread_info, pg, dbc->priority)) != 0) {
2735 if (level - 1 == LEAFLEVEL) {
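/*
 * On a lock conflict TRY_LOCK transfers control to the "retry" label
 * at the end of this function, which returns DB_LOCK_NOTGRANTED so
 * the caller can restart.
 */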
2736 TRY_LOCK(dbc, pgno, saved_pgno,
2737 lock, DB_LOCK_READ, retry);
2741 if ((ret = __memp_fget(dbp->mpf, &pgno,
2742 dbc->thread_info, dbc->txn, 0, &pg)) != 0)
2746 * At the data level use the last key to try and avoid the
2747 * possibility that the user has a zero length key; if they do, we punt.
2750 if (pg->level == LEAFLEVEL) {
2751 bk = GET_BKEYDATA(dbp, pg, NUM_ENT(pg) - 2);
2755 no_key: __db_errx(env,
2756 "Compact cannot handle zero length key");
2761 bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
2766 if (B_TYPE(bi->type) == B_OVERFLOW) {
2767 bo = (BOVERFLOW *)(data);
2768 ret = __db_goff(dbc, start, bo->tlen, bo->pgno,
2769 &start->data, &start->ulen);
2772 ret = __db_retcopy(env,
2773 start, data, len, &start->data, &start->ulen);
2775 err: if (pg != NULL && pg != cp->csp->page &&
2776 (t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
2777 pg, dbc->priority)) != 0 && ret == 0)
2781 retry: return (DB_LOCK_NOTGRANTED);
2785 * __bam_truncate_internal --
2786 *	Find high numbered pages in the internal nodes of a tree and swap them with lower numbered pages.
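 *
 *	A minimal sketch of how this path is reached from the public API,
 *	assuming an open handle dbp and letting compact auto-commit:
 *
 *		DB_COMPACT c_data;
 *		memset(&c_data, 0, sizeof(c_data));
 *		ret = dbp->compact(dbp, NULL, NULL, NULL, &c_data,
 *		    DB_FREE_SPACE, NULL);
 *
 *	DB_FREE_SPACE asks compact to return emptied pages to the
 *	filesystem, the case in which these internal levels are walked.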
2790 __bam_truncate_internal(dbp, ip, txn, c_data)
2803 int level, local_txn, ret, t_ret;
2806 memset(&start, 0, sizeof(start));
2808 if (IS_DB_AUTO_COMMIT(dbp, txn)) {
2814 level = LEAFLEVEL + 1;
2815 sflag = CS_READ | CS_GETRECNO;
2816 LOCK_INIT(meta_lock);
2820 (ret = __txn_begin(dbp->env, ip, NULL, &txn, 0)) != 0)
2823 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
2825 cp = (BTREE_CURSOR *)dbc->internal;
2828 * If the root is a leaf we have nothing to do.
2829 * Searching an empty RECNO tree will return NOTFOUND below and loop.
2831 if ((ret = __memp_fget(dbp->mpf, &cp->root, ip, txn, 0, &pg)) != 0)
2833 if (LEVEL(pg) == LEAFLEVEL) {
2834 ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority);
2837 if ((ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority)) != 0)
2840 pgno = PGNO_INVALID;
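/*
 * Walk each internal level starting just above the leaves: search to
 * the next page at this level, remember its last key, and if its page
 * number is above the truncation point swap it for a lower numbered
 * page.  When a level is exhausted, move up one.
 */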
2842 if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
2843 /* No more at this level, go up one. */
2844 if (ret == DB_NOTFOUND) {
2846 if (start.data != NULL)
2847 __os_free(dbp->env, start.data);
2848 memset(&start, 0, sizeof(start));
2849 sflag = CS_READ | CS_GETRECNO;
2854 c_data->compact_pages_examine++;
2859 sflag = CS_NEXT | CS_GETRECNO;
2860 /* Grab info about the page and drop the stack. */
2861 if (pgno != cp->root && (ret = __bam_savekey(dbc,
2862 pgno <= c_data->compact_truncate, &start)) != 0) {
2863 if (ret == DB_LOCK_NOTGRANTED)
2868 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
2870 if (pgno == cp->root)
2873 if (pgno <= c_data->compact_truncate)
2876 /* Get the meta page lock before latching interior nodes. */
2877 if (!LOCK_ISSET(meta_lock) && (ret = __db_lget(dbc,
2878 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
2881 /* Reget the page with a write latch, and its parent too. */
2882 if ((ret = __bam_csearch(dbc,
2883 &start, CS_PARENT | CS_GETRECNO, level)) != 0) {
2884 if (ret == DB_NOTFOUND) {
2892 if (pgno > c_data->compact_truncate) {
2893 if ((ret = __bam_truncate_page(dbc, &pg, NULL, 1)) != 0)
2895 if (pgno == PGNO(pg)) {
2896 /* We could not allocate. Give up. */
2901 if ((ret = __bam_stkrel(dbc,
2902 pgno > c_data->compact_truncate ? 0 : STK_NOLOCK)) != 0)
2905 /* We are locking subtrees, so drop the write locks asap. */
2906 if (local_txn && pgno > c_data->compact_truncate)
2908 } while (pgno != cp->root);
2910 if ((ret = __dbc_close(dbc)) != 0)
2914 if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
2917 LOCK_INIT(meta_lock);
2919 if (pgno != ((BTREE *)dbp->bt_internal)->bt_root)
2922 err: if (txn != NULL && ret != 0)
2927 if ((t_ret = __LPUT(dbc, meta_lock)) != 0 && ret == 0)
2929 if (dbc != NULL && (t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
2931 if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
2934 txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
2936 if (start.data != NULL)
2937 __os_free(dbp->env, start.data);
2942 __bam_setup_freelist(dbp, list, nelems)
2953 if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
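/*
 * Copy just the page numbers from the sorted free-list entries into
 * the in-memory list kept by the memory pool.
 */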
2956 while (nelems-- != 0)
2957 *plist++ = list++->pgno;
2963 __bam_free_freelist(dbp, ip, txn)
2970 int auto_commit, ret, t_ret;
2973 auto_commit = ret = 0;
2976 * If we are not in a transaction then we need to get
2977 * a lock on the meta page; otherwise we should already have it locked.
2982 if (IS_DB_AUTO_COMMIT(dbp, txn)) {
2984 * We must not timeout the lock or we will not free the list.
2985 * We ignore errors from txn_begin as there is little that
2986 * the application can do with the error and we want to
2987 * get the lock and free the list if at all possible.
2989 if (__txn_begin(dbp->env, ip, NULL, &txn, 0) == 0) {
2990 (void)__lock_set_timeout(dbp->env,
2991 txn->locker, 0, DB_SET_TXN_TIMEOUT);
2992 (void)__lock_set_timeout(dbp->env,
2993 txn->locker, 0, DB_SET_LOCK_TIMEOUT);
2996 /* Get a cursor so we can call __db_lget. */
2997 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
3000 if ((ret = __db_lget(dbc,
3001 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
3005 ret = __memp_free_freelist(dbp->mpf);
3007 err: if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
3010 if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
3013 if (auto_commit && __txn_abort(txn) != 0 && ret == 0)