2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996-2009 Oracle. All rights reserved.
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 * Keith Bostic. All rights reserved.
11 * Copyright (c) 1990, 1993, 1994, 1995
12 * The Regents of the University of California. All rights reserved.
14 * This code is derived from software contributed to Berkeley by
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 * may be used to endorse or promote products derived from this software
27 * without specific prior written permission.
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 #include "db_config.h"
47 #include "dbinc/db_page.h"
48 #include "dbinc/btree.h"
49 #include "dbinc/lock.h"
54 * Delete one or more entries from a page.
56 * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
/*
 * NOTE(review): this chunk is an incomplete extraction (interior lines,
 * case labels and braces are missing).  Comments below describe only what
 * the visible lines establish; do not treat them as a full specification.
 */
59 __bam_ditem(dbc, h, indx)
74 /* The page should already have been dirtied by our caller. */
75 DB_ASSERT(dbp->env, IS_DIRTY(h));
/* Internal btree page: size the item from its BINTERNAL header. */
79 bi = GET_BINTERNAL(dbp, h, indx);
80 switch (B_TYPE(bi->type)) {
83 nbytes = BINTERNAL_SIZE(bi->len);
86 nbytes = BINTERNAL_SIZE(bi->len);
/*
 * Overflow item: free the off-page chain via __db_doff before the
 * on-page entry is removed.
 */
88 __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
/* Unrecognized item type: report a page-format error. */
92 return (__db_pgfmt(dbp->env, PGNO(h)));
/* Recno internal entries are fixed size. */
96 nbytes = RINTERNAL_SIZE;
100 * If it's a duplicate key, discard the index and don't touch
101 * the actual page item.
104 * This works because no data item can have an index matching
105 * any other index so even if the data item is in a key "slot",
106 * it won't match any other index.
/* Even index == key slot on a leaf page (P_INDX pairing). */
108 if ((indx % 2) == 0) {
110 * Check for a duplicate after us on the page. NOTE:
111 * we have to delete the key item before deleting the
112 * data item, otherwise the "indx + P_INDX" calculation
/*
 * Key shared with the following pair: drop only this index slot
 * (shift it onto the neighbor) instead of deleting the key item.
 */
115 if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
116 inp[indx] == inp[indx + P_INDX])
117 return (__bam_adjindx(dbc,
118 h, indx, indx + O_INDX, 0));
120 * Check for a duplicate before us on the page. It
121 * doesn't matter if we delete the key item before or
122 * after the data item for the purposes of this one.
124 if (indx > 0 && inp[indx] == inp[indx - P_INDX])
125 return (__bam_adjindx(dbc,
126 h, indx, indx - P_INDX, 0));
/* Leaf page item: size from the BKEYDATA header. */
131 bk = GET_BKEYDATA(dbp, h, indx);
132 switch (B_TYPE(bk->type)) {
134 nbytes = BOVERFLOW_SIZE;
137 nbytes = BOVERFLOW_SIZE;
/* Overflow data on a leaf: free the off-page chain first. */
138 if ((ret = __db_doff(
139 dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
143 nbytes = BKEYDATA_SIZE(bk->len);
146 return (__db_pgfmt(dbp->env, PGNO(h)));
150 return (__db_pgfmt(dbp->env, PGNO(h)));
153 /* Delete the item and mark the page dirty. */
154 if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
162 * Adjust an index on the page.
164 * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
/*
 * NOTE(review): incomplete extraction -- declarations, braces and the
 * error/return path are missing from this view.  Visible behavior:
 * optionally log, then shift the page's index array to insert (copying
 * slot indx_copy into the opened hole) or to remove slot indx.
 */
167 __bam_adjindx(dbc, h, indx, indx_copy, is_insert)
170 u_int32_t indx, indx_copy;
174 db_indx_t copy, *inp;
180 /* Log the change. */
181 if (DBC_LOGGING(dbc)) {
182 if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
183 PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
/* Non-transactional path: mark the page LSN as not logged. */
186 LSN_NOT_LOGGED(LSN(h));
188 /* Shuffle the indices and mark the page dirty. */
/* Insert path: open a hole at indx by shifting entries up one slot. */
190 copy = inp[indx_copy];
191 if (indx != NUM_ENT(h))
192 memmove(&inp[indx + O_INDX], &inp[indx],
193 sizeof(db_indx_t) * (NUM_ENT(h) - indx));
/* Delete path: close the hole at indx by shifting entries down. */
198 if (indx != NUM_ENT(h))
199 memmove(&inp[indx], &inp[indx + O_INDX],
200 sizeof(db_indx_t) * (NUM_ENT(h) - indx));
208 * Delete a set of locked pages.
210 * PUBLIC: int __bam_dpages __P((DBC *, int, int));
/*
 * NOTE(review): incomplete extraction -- interior lines (declarations,
 * labels, goto targets, closing braces) are missing.  The visible phases
 * are: trim the cursor stack, relink the leaf chain, delete the parent
 * reference, discard/free stack pages, then collapse the tree toward the
 * root while the root holds a single item.
 */
213 __bam_dpages(dbc, use_top, flags)
222 DB_LOCK c_lock, p_lock;
224 EPG *epg, *save_sp, *stack_epg;
225 PAGE *child, *parent;
227 db_pgno_t pgno, root_pgno;
229 int done, ret, t_ret;
233 cp = (BTREE_CURSOR *)dbc->internal;
238 * We have the entire stack of deletable pages locked.
240 * Btree calls us with the first page in the stack is to have a
241 * single item deleted, and the rest of the pages are to be removed.
243 * Recno always has a stack to the root and __bam_merge operations
244 * may have unneeded items in the sack. We find the lowest page
245 * in the stack that has more than one record in it and start there.
/* Walk down from the stack top to the first page with >1 entry. */
251 for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg)
252 if (NUM_ENT(stack_epg->page) > 1)
257 * There is an interesting deadlock situation here. We have to relink
258 * the leaf page chain around the leaf page being deleted. Consider
259 * a cursor walking through the leaf pages, that has the previous page
260 * read-locked and is waiting on a lock for the page we're deleting.
261 * It will deadlock here. Before we unlink the subtree, we relink the
264 if (LF_ISSET(BTD_RELINK) && LEVEL(cp->csp->page) == 1 &&
265 (ret = __bam_relink(dbc, cp->csp->page, NULL, PGNO_INVALID)) != 0)
269 * Delete the last item that references the underlying pages that are
270 * to be deleted, and adjust cursors that reference that page. Then,
271 * save that page's page number and item count and release it. If
272 * the application isn't retaining locks because it's running without
273 * transactions, this lets the rest of the tree get back to business
/* Dirty the page, delete the referencing item, adjust cursors. */
276 if ((ret = __memp_dirty(mpf,
277 &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
279 if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
281 if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
/* Deleting slot 0: propagate the new smallest key upward. */
284 if (LF_ISSET(BTD_UPDATE) && epg->indx == 0) {
287 ret = __bam_pupdate(dbc, epg->page);
/* Remember this page's number/count before releasing it. */
293 pgno = PGNO(epg->page);
294 nitems = NUM_ENT(epg->page);
296 ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority);
298 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
303 /* Then, discard any pages that we don't care about. */
304 discard: for (epg = cp->sp; epg < stack_epg; ++epg) {
305 if ((t_ret = __memp_fput(mpf, dbc->thread_info,
306 epg->page, dbc->priority)) != 0 && ret == 0)
309 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
315 /* Free the rest of the pages in the stack. */
316 while (++epg <= cp->csp) {
317 if ((ret = __memp_dirty(mpf, &epg->page,
318 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
321 * Delete page entries so they will be restored as part of
322 * recovery. We don't need to do cursor adjustment here as
323 * the pages are being emptied by definition and so cannot
324 * be referenced by a cursor.
326 if (NUM_ENT(epg->page) != 0) {
327 DB_ASSERT(dbp->env, LEVEL(epg->page) != 1);
329 if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
332 * Sheer paranoia: if we find any pages that aren't
333 * emptied by the delete, someone else added an item
334 * while we were walking the tree, and we discontinue
335 * the delete. Shouldn't be possible, but we check
338 if (NUM_ENT(epg->page) != 0)
/* Return the emptied page to the free list. */
342 ret = __db_free(dbc, epg->page);
343 if (cp->page == epg->page)
346 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
/* Error path: release any pages/locks still held in the stack. */
354 err: for (; epg <= cp->csp; ++epg) {
355 if (epg->page != NULL) {
356 (void)__memp_fput(mpf, dbc->thread_info,
357 epg->page, dbc->priority);
360 (void)__TLPUT(dbc, epg->lock);
368 * If we just deleted the next-to-last item from the root page, the
369 * tree can collapse one or more levels. While there remains only a
370 * single item on the root page, write lock the last page referenced
371 * by the root page and copy it over the root page.
373 root_pgno = cp->root;
374 if (pgno != root_pgno || nitems != 1)
/* Collapse loop: repeat while the (new) root has a single child. */
377 for (done = 0; !done;) {
379 parent = child = NULL;
/* Write-lock and fetch the single-item parent page. */
386 __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
388 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
389 DB_MPOOL_DIRTY, &parent)) != 0)
392 if (NUM_ENT(parent) != 1)
395 switch (TYPE(parent)) {
398 * If this is overflow, then try to delete it.
399 * The child may or may not still point at it.
401 bi = GET_BINTERNAL(dbp, parent, 0);
402 if (B_TYPE(bi->type) == B_OVERFLOW)
403 if ((ret = __db_doff(dbc,
404 ((BOVERFLOW *)bi->data)->pgno)) != 0)
/* Recno internal parent: child pgno lives in the RINTERNAL. */
409 pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
415 /* Lock the child page. */
417 __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
419 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
420 DB_MPOOL_DIRTY, &child)) != 0)
423 /* Log the change. */
424 if (DBC_LOGGING(dbc)) {
425 memset(&a, 0, sizeof(a));
427 a.size = dbp->pgsize;
428 memset(&b, 0, sizeof(b));
429 b.data = P_ENTRY(dbp, parent, 0);
430 b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
431 BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
432 if ((ret = __bam_rsplit_log(dbp, dbc->txn,
433 &child->lsn, 0, PGNO(child), &a, PGNO(parent),
434 RE_NREC(parent), &b, &parent->lsn)) != 0)
437 LSN_NOT_LOGGED(child->lsn);
442 * One fixup -- internal pages below the top level do not store
443 * a record count, so we have to preserve it if we're not
444 * converting to a leaf page. Note also that we are about to
445 * overwrite the parent page, including its LSN. This is OK
446 * because the log message we wrote describing this update
447 * stores its LSN on the child page. When the child is copied
448 * onto the parent, the correct LSN is copied into place.
451 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
452 rcnt = RE_NREC(parent);
/* Copy the child over the root, restoring the root's pgno. */
453 memcpy(parent, child, dbp->pgsize);
454 PGNO(parent) = root_pgno;
455 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
456 RE_NREC_SET(parent, rcnt);
458 /* Adjust the cursors. */
459 if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
463 * Free the page copied onto the root page and discard its
464 * lock. (The call to __db_free() discards our reference
467 if ((ret = __db_free(dbc, child)) != 0) {
/* Release parent/child pages and locks; keep first error in ret. */
476 if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0)
478 if (parent != NULL &&
479 (t_ret = __memp_fput(mpf, dbc->thread_info,
480 parent, dbc->priority)) != 0 && ret == 0)
482 if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0)
485 (t_ret = __memp_fput(mpf, dbc->thread_info,
486 child, dbc->priority)) != 0 && ret == 0)
495 * Relink around a deleted page.
497 * PUBLIC: int __bam_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
498 * Otherp can be either the previous or the next page to use if
499 * the caller already holds that page.
/*
 * NOTE(review): incomplete extraction -- declarations, else-branches and
 * goto targets are partially missing.  Visible behavior: fetch the
 * neighbor page(s) of pagep (reusing otherp if it is one of them), log
 * the relink, then splice prev/next pointers either around pagep
 * (new_pgno == PGNO_INVALID, i.e. removal) or to new_pgno.
 */
502 __bam_relink(dbc, pagep, otherp, new_pgno)
504 PAGE *pagep, *otherp;
509 DB_LSN *nlsnp, *plsnp, ret_lsn;
518 nlsnp = plsnp = NULL;
523 * Retrieve the one/two pages. The caller must have them locked
524 * because the parent is latched. For a remove, we may need
525 * two pages (the before and after). For an add, we only need one
526 * because, the split took care of the prev.
/* Next neighbor: reuse otherp when it matches, else fetch dirty. */
528 if (pagep->next_pgno != PGNO_INVALID) {
529 if (((np = otherp) == NULL ||
530 PGNO(otherp) != pagep->next_pgno) &&
531 (ret = __memp_fget(mpf, &pagep->next_pgno,
532 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
533 ret = __db_pgerr(dbp, pagep->next_pgno, ret);
/* Previous neighbor: same pattern as the next neighbor above. */
538 if (pagep->prev_pgno != PGNO_INVALID) {
539 if (((pp = otherp) == NULL ||
540 PGNO(otherp) != pagep->prev_pgno) &&
541 (ret = __memp_fget(mpf, &pagep->prev_pgno,
542 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
543 ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
549 /* Log the change. */
550 if (DBC_LOGGING(dbc)) {
551 if ((ret = __bam_relink_log(dbp, dbc->txn, &ret_lsn, 0,
552 pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
553 pagep->next_pgno, nlsnp)) != 0)
556 LSN_NOT_LOGGED(ret_lsn);
563 * Modify and release the two pages.
/* Point the next page's back-link around (or at) the new page. */
566 if (new_pgno == PGNO_INVALID)
567 np->prev_pgno = pagep->prev_pgno;
569 np->prev_pgno = new_pgno;
571 ret = __memp_fput(mpf,
572 dbc->thread_info, np, dbc->priority);
573 if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
/* Point the previous page's forward-link the same way. */
580 if (new_pgno == PGNO_INVALID)
581 pp->next_pgno = pagep->next_pgno;
583 pp->next_pgno = new_pgno;
585 ret = __memp_fput(mpf,
586 dbc->thread_info, pp, dbc->priority);
587 if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
/* Error path: drop only pages we fetched here, not the caller's. */
594 err: if (np != NULL && np != otherp)
595 (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
596 if (pp != NULL && pp != otherp)
597 (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
603 * Update parent key pointers up the tree.
605 * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
608 __bam_pupdate(dbc, lpg)
618 cp = (BTREE_CURSOR *)dbc->internal;
622 * Update the parents up the tree. __bam_pinsert only looks at the
623 * left child if is a leaf page, so we don't need to change it. We
624 * just do a delete and insert; a replace is possible but reusing
627 for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) {
628 if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page,
629 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
632 if ((ret = __bam_pinsert(dbc, epg, 0,
633 lpg, epg[1].page, BPI_NORECNUM | BPI_REPLACE)) != 0) {
634 if (ret == DB_NEEDSPLIT) {
635 /* This should not happen. */
637 "Not enough room in parent: %s: page %lu",
638 dbc->dbp->fname, (u_long)PGNO(epg->page));
639 ret = __env_panic(env, EINVAL);