2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996-2009 Oracle. All rights reserved.
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 * Keith Bostic. All rights reserved.
11 * Copyright (c) 1990, 1993
12 * The Regents of the University of California. All rights reserved.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 #include "db_config.h"
44 #include "dbinc/db_page.h"
45 #include "dbinc/btree.h"
46 #include "dbinc/lock.h"
51 * Search a btree for a record number.
53 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
56 __bam_rsearch(dbc, recnop, flags, stop, exactp)
70 db_indx_t adjust, deloffset, indx, top;
71 db_lockmode_t lock_mode;
73 db_recno_t recno, t_recno, total;
75 int ret, stack, t_ret;
80 cp = (BTREE_CURSOR *)dbc->internal;
86 * There are several ways we search a btree. The flags argument
87 * specifies if we're acquiring read or write locks and if we are
88 * locking pairs of pages. In addition, if we're adding or deleting
89 * an item, we have to lock the entire tree, regardless. See btree.h
92 * If write-locking pages, we need to know whether or not to acquire a
93 * write lock on a page before getting it. This depends on how deep it
94 * is in the tree, which we don't know until we acquire the root page. So,
95 * if we need to lock the root page we may have to upgrade it later,
96 * because we won't get the correct lock initially.
98 * Retrieve the root page.
101 if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0)
103 lock_mode = cp->csp->lock_mode;
104 get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
105 lock = cp->csp->lock;
110 * If appending to the tree, set the record number now -- we have the
113 * Delete only deletes exact matches, read only returns exact matches.
114 * Note, this is different from __bam_search(), which returns non-exact
117 * The record may not exist. We can only return the correct location
118 * for the record immediately after the last record in the tree, so do
122 if (LF_ISSET(SR_APPEND)) {
124 *recnop = recno = total + 1;
131 if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
133 * Keep the page locked for serializability.
136 * This leaves the root page locked, which will
137 * eliminate any concurrency. A possible fix
138 * would be to lock the last leaf page instead.
140 ret = __memp_fput(mpf,
141 dbc->thread_info, h, dbc->priority);
143 __TLPUT(dbc, lock)) != 0 && ret == 0)
145 return (ret == 0 ? DB_NOTFOUND : ret);
152 * Record numbers in the tree are 0-based, but the recno is
153 * 1-based. All of the calculations below have to take this
159 if (LF_ISSET(SR_MAX)) {
160 indx = NUM_ENT(h) - 2;
165 if (LF_ISSET(SR_MAX)) {
166 indx = NUM_ENT(h) - 1;
171 * There may be logically deleted records on the page.
172 * If there are enough, the record may not exist.
174 if (TYPE(h) == P_LBTREE) {
181 for (t_recno = 0, indx = 0;; indx += adjust) {
182 if (indx >= NUM_ENT(h)) {
184 if (!LF_ISSET(SR_PAST_EOF) ||
185 recno > t_recno + 1) {
186 ret = __memp_fput(mpf,
190 if ((t_ret = __TLPUT(dbc,
191 lock)) != 0 && ret == 0)
198 if (!B_DISSET(GET_BKEYDATA(dbp, h,
199 indx + deloffset)->type) &&
204 BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
207 if (LF_ISSET(SR_BOTH))
211 if (LF_ISSET(SR_MAX)) {
213 bi = GET_BINTERNAL(dbp, h, indx - 1);
214 } else for (indx = 0, top = NUM_ENT(h);;) {
215 bi = GET_BINTERNAL(dbp, h, indx);
216 if (++indx == top || total + bi->nrecs >= recno)
223 if (LF_ISSET(SR_MAX))
228 /* Correct from 1-based to 0-based for a page offset. */
230 enter: BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
233 if (LF_ISSET(SR_BOTH)) {
234 get_prev: DB_ASSERT(env, LF_ISSET(SR_NEXT));
236 * We have a NEXT tree; now add the subtree
237 * that leads to the previous page.
240 indx = cp->sp->indx - 1;
242 if (TYPE(h) == P_IRECNO) {
243 ri = GET_RINTERNAL(dbp, h, indx);
246 DB_ASSERT(env, TYPE(h) == P_IBTREE);
247 bi = GET_BINTERNAL(dbp, h, indx);
250 LF_CLR(SR_NEXT | SR_BOTH);
258 if (LF_ISSET(SR_MAX)) {
260 ri = GET_RINTERNAL(dbp, h, indx - 1);
261 } else for (indx = 0, top = NUM_ENT(h);;) {
262 ri = GET_RINTERNAL(dbp, h, indx);
263 if (++indx == top || total + ri->nrecs >= recno)
270 return (__db_pgfmt(env, h->pgno));
274 /* Return if this is the lowest page wanted. */
275 if (stop == LEVEL(h)) {
276 BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
282 BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
287 lock_mode = DB_LOCK_WRITE;
288 get_mode = DB_MPOOL_DIRTY;
290 __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
292 } else if (LF_ISSET(SR_NEXT)) {
294 * For RECNO if we are doing a NEXT search the
295 * search recno is the one we are looking for
296 * but we want to keep the stack from the spanning
297 * node on down. We only know we have the spanning
298 * node when its child's index is 0, so save
299 * each node and discard the tree when we find out
302 if (indx != 0 && cp->sp->page != NULL) {
304 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
308 BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
312 lock_next: if ((ret =
313 __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
317 * Decide if we want to return a pointer to the next
318 * page in the stack. If we do, write lock it and
321 if ((LF_ISSET(SR_PARENT) &&
322 (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
323 (LEVEL(h) - 1) == LEAFLEVEL)
326 if ((ret = __memp_fput(mpf,
327 dbc->thread_info, h, dbc->priority)) != 0)
332 LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
333 if (lock_mode == DB_LOCK_WRITE)
334 get_mode = DB_MPOOL_DIRTY;
335 if ((ret = __db_lget(dbc,
336 LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
338 * If we fail, discard the lock we held. This
339 * is OK because this only happens when we are
340 * descending the tree holding read-locks.
342 (void)__LPUT(dbc, lock);
347 if ((ret = __memp_fget(mpf, &pg,
348 dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
353 err: if (h != NULL && (t_ret = __memp_fput(mpf,
354 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
358 (void)__bam_stkrel(dbc, 0);
365 * Adjust the tree after adding or deleting a record.
367 * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
370 __bam_adjust(dbc, adjust)
384 cp = (BTREE_CURSOR *)dbc->internal;
385 root_pgno = cp->root;
387 /* Update the record counts for the tree. */
388 for (epg = cp->sp; epg <= cp->csp; ++epg) {
390 if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
391 ret = __memp_dirty(mpf, &h,
392 dbc->thread_info, dbc->txn, dbc->priority, 0);
396 if (DBC_LOGGING(dbc)) {
397 if ((ret = __bam_cadjust_log(dbp, dbc->txn,
398 &LSN(h), 0, PGNO(h), &LSN(h),
399 (u_int32_t)epg->indx, adjust,
400 PGNO(h) == root_pgno ?
401 CAD_UPDATEROOT : 0)) != 0)
404 LSN_NOT_LOGGED(LSN(h));
406 if (TYPE(h) == P_IBTREE)
407 GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
410 GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
413 if (PGNO(h) == root_pgno)
414 RE_NREC_ADJ(h, adjust);
422 * Return the number of records in the tree.
424 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
427 __bam_nrecs(dbc, rep)
441 pgno = dbc->internal->root;
442 if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
444 if ((ret = __memp_fget(mpf, &pgno,
445 dbc->thread_info, dbc->txn, 0, &h)) != 0)
450 ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
451 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
459 * Return the number of records below a page.
461 * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
476 /* Check for logically deleted records. */
477 for (indx = 0; indx < top; indx += P_INDX)
479 GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
483 /* Check for logically deleted records. */
484 for (indx = 0; indx < top; indx += O_INDX)
485 if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
489 for (indx = 0; indx < top; indx += O_INDX)
490 nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
496 for (indx = 0; indx < top; indx += O_INDX)
497 nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;