2 * Copyright (C) 2002, 2016 by the Massachusetts Institute of Technology.
5 * Export of this software from the United States of America may
6 * require a specific license from the United States Government.
7 * It is the responsibility of any person or organization contemplating
8 * export to obtain such a license before exporting.
10 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
11 * distribute this software and its documentation for any purpose and
12 * without fee is hereby granted, provided that the above copyright
13 * notice appear in all copies and that both that copyright notice and
14 * this permission notice appear in supporting documentation, and that
15 * the name of M.I.T. not be used in advertising or publicity pertaining
16 * to distribution of the software without specific, written prior
17 * permission. Furthermore if you modify this software you must label
18 * your software as modified software and not distribute it in such a
19 * fashion that it might be confused with the original M.I.T. software.
20 * M.I.T. makes no representations about the suitability of
21 * this software for any purpose. It is provided "as is" without express
22 * or implied warranty.
26 * Copyright (c) 1990, 1993, 1994
27 * The Regents of the University of California. All rights reserved.
29 * This code is derived from software contributed to Berkeley by
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 #if defined(LIBC_SCCS) && !defined(lint)
62 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95";
63 #endif /* LIBC_SCCS and not lint */
65 #include <sys/types.h>
76 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
77 static int __bt_seqadv __P((BTREE *, EPG *, int));
78 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
80 static int bt_rseq_next(BTREE *, EPG *);
81 static int bt_rseq_prev(BTREE *, EPG *);
84 * Sequential scan support.
86 * The tree can be scanned sequentially, starting from either end of the
87 * tree or from any specific key. A scan request before any scanning is
88 * done is initialized as starting from the least node.
93 * Btree sequential scan interface.
96 * dbp: pointer to access method
97 * key: key for positioning and return value
98 * data: data return value
99 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
102 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
105 __bt_seq(dbp, key, data, flags)
116 /* Toss any page pinned across calls. */
117 if (t->bt_pinned != NULL) {
118 mpool_put(t->bt_mp, t->bt_pinned, 0);
123 * If scan unitialized as yet, or starting at a specific record, set
124 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
125 * the page the cursor references if they're successful.
132 if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
133 status = __bt_seqadv(t, &e, flags);
140 status = __bt_seqset(t, &e, key, flags);
147 if (status == RET_SUCCESS) {
148 __bt_setcur(t, e.page->pgno, e.index);
151 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
154 * If the user is doing concurrent access, we copied the
155 * key/data, toss the page.
157 if (F_ISSET(t, B_DB_LOCK))
158 mpool_put(t->bt_mp, e.page, 0);
160 t->bt_pinned = e.page;
167 * Set the sequential scan to a specific key.
171 * ep: storage for returned key
172 * key: key for initial scan position
173 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
176 * Pins the page the cursor references.
179 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
182 __bt_seqset(t, ep, key, flags)
193 * Find the first, last or specific key in the tree and point the
194 * cursor at it. The cursor may not be moved until a new key has
198 case R_CURSOR: /* Keyed scan. */
200 * Find the first instance of the key or the smallest key
201 * which is greater than or equal to the specified key.
203 if (key->data == NULL || key->size == 0) {
207 return (__bt_first(t, key, ep, &exact));
208 case R_FIRST: /* First record. */
212 /* Walk down the left-hand side of the tree. */
213 for (pg = P_ROOT;;) {
214 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
217 /* Check for an empty tree. */
218 if (NEXTINDEX(h) == 0) {
219 mpool_put(t->bt_mp, h, 0);
220 return (RET_SPECIAL);
223 if (h->flags & (P_BLEAF | P_RLEAF))
225 pg = GETBINTERNAL(h, 0)->pgno;
226 BT_PUSH(t, h->pgno, 0);
227 mpool_put(t->bt_mp, h, 0);
232 case R_LAST: /* Last record. */
236 /* Walk down the right-hand side of the tree. */
237 for (pg = P_ROOT;;) {
238 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
241 /* Check for an empty tree. */
242 if (NEXTINDEX(h) == 0) {
243 mpool_put(t->bt_mp, h, 0);
244 return (RET_SPECIAL);
247 if (h->flags & (P_BLEAF | P_RLEAF))
249 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
250 BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
251 mpool_put(t->bt_mp, h, 0);
255 ep->index = NEXTINDEX(h) - 1;
258 return (RET_SUCCESS);
263 * Advance the sequential scan.
267 * flags: R_NEXT, R_PREV
270 * Pins the page the new key/data record is on.
273 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
276 __bt_seqadv(t, ep, flags)
288 * There are a couple of states that we can be in. The cursor has
289 * been initialized by the time we get here, but that's all we know.
294 * The cursor was deleted and there weren't any duplicate records,
295 * so the cursor's key was saved. Find out where that key would
296 * be in the current tree. If the returned key is an exact match,
297 * it means that a key/data pair was inserted into the tree after
298 * the delete. We could reasonably return the key, but the problem
299 * is that this is the access pattern we'll see if the user is
300 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
301 * the cursor record and then replaces it, so the cursor was saved,
302 * and we'll simply return the same "new" record until the user
303 * notices and doesn't do a put() of it. Since the key is an exact
304 * match, we could as easily put the new record before the cursor,
305 * and we've made no guarantee to return it. So, move forward or
306 * back a record if it's an exact match.
309 * In the current implementation, put's to the cursor are done with
310 * delete/add pairs. This has two consequences. First, it means
311 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
312 * the same behavior as above. Second, you can return the same key
313 * twice if you have duplicate records. The scenario is that the
314 * cursor record is deleted, moving the cursor forward or backward
315 * to a duplicate. The add then inserts the new record at a location
316 * ahead of the cursor because duplicates aren't sorted in any way,
317 * and the new record is later returned. This has to be fixed at some
320 if (F_ISSET(c, CURS_ACQUIRE)) {
321 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
327 * Kluge -- get, release, get the page.
329 c->pg.pgno = ep->page->pgno;
330 c->pg.index = ep->index;
331 mpool_put(t->bt_mp, ep->page, 0);
334 /* Get the page referenced by the cursor. */
335 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
339 * Find the next/previous record in the tree and point the cursor at
340 * it. The cursor may not be moved until a new key has been found.
343 case R_NEXT: /* Next record. */
346 * The cursor was deleted in duplicate records, and moved
347 * forward to a record that has yet to be returned. Clear
348 * that flag, and return the record.
350 if (F_ISSET(c, CURS_AFTER))
353 if (++idx == NEXTINDEX(h)) {
354 if (flags == R_RNEXT) {
357 return (bt_rseq_next(t, ep));
360 mpool_put(t->bt_mp, h, 0);
362 return (RET_SPECIAL);
363 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
368 case R_PREV: /* Previous record. */
371 * The cursor was deleted in duplicate records, and moved
372 * backward to a record that has yet to be returned. Clear
373 * that flag, and return the record.
375 if (F_ISSET(c, CURS_BEFORE)) {
376 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
378 ep->index = c->pg.index;
379 return (RET_SUCCESS);
383 if (flags == R_RPREV) {
386 return (bt_rseq_prev(t, ep));
389 mpool_put(t->bt_mp, h, 0);
391 return (RET_SPECIAL);
392 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
394 idx = NEXTINDEX(h) - 1;
402 return (RET_SUCCESS);
406 * Get the first item on the next page, but by going up and down the tree.
409 bt_rseq_next(BTREE *t, EPG *ep)
419 /* Move up the tree. */
421 mpool_put(t->bt_mp, h, 0);
422 /* Did we hit the right edge of the root? */
424 return (RET_SPECIAL);
425 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
428 } while (++idx == NEXTINDEX(h));
430 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
431 /* Move back down the tree. */
432 BT_PUSH(t, h->pgno, idx);
433 pg = GETBINTERNAL(h, idx)->pgno;
434 mpool_put(t->bt_mp, h, 0);
435 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
441 return (RET_SUCCESS);
445 * Get the last item on the previous page, but by going up and down the tree.
448 bt_rseq_prev(BTREE *t, EPG *ep)
458 /* Move up the tree. */
460 mpool_put(t->bt_mp, h, 0);
461 /* Did we hit the left edge of the root? */
463 return (RET_SPECIAL);
464 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
469 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
470 /* Move back down the tree. */
471 BT_PUSH(t, h->pgno, idx);
472 pg = GETBINTERNAL(h, idx)->pgno;
473 mpool_put(t->bt_mp, h, 0);
474 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
476 idx = NEXTINDEX(h) - 1;
480 return (RET_SUCCESS);
485 * Find the first entry.
491 * exactp: pointer to exact match flag
494 * The first entry in the tree greater than or equal to key,
495 * or RET_SPECIAL if no such key exists.
498 __bt_first(t, key, erval, exactp)
509 * Find any matching record; __bt_search pins the page.
511 * If it's an exact match and duplicates are possible, walk backwards
512 * in the tree until we find the first one. Otherwise, make sure it's
513 * a valid key (__bt_search may return an index just past the end of a
514 * page) and return it.
516 if ((ep = __bt_search(t, key, exactp)) == NULL)
517 return (RET_SPECIAL);
519 if (F_ISSET(t, B_NODUPS)) {
521 return (RET_SUCCESS);
525 * Walk backwards, as long as the entry matches and there are
526 * keys left in the tree. Save a copy of each match in case
532 if (save.page->pgno != ep->page->pgno) {
533 mpool_put(t->bt_mp, save.page, 0);
536 save.index = ep->index;
539 * Don't unpin the page the last (or original) match
540 * was on, but make sure it's unpinned if an error
543 if (ep->index == 0) {
544 if (h->prevpg == P_INVALID)
546 if (h->pgno != save.page->pgno)
547 mpool_put(t->bt_mp, h, 0);
548 if ((hprev = mpool_get(t->bt_mp,
549 h->prevpg, 0)) == NULL) {
550 if (h->pgno == save.page->pgno)
555 ep->page = h = hprev;
556 ep->index = NEXTINDEX(h);
559 } while (__bt_cmp(t, key, ep) == 0);
562 * Reach here with the last page that was looked at pinned,
563 * which may or may not be the same as the last (or original)
564 * match page. If it's not useful, release it.
566 if (h->pgno != save.page->pgno)
567 mpool_put(t->bt_mp, h, 0);
570 return (RET_SUCCESS);
573 /* If at the end of a page, find the next entry. */
574 if (ep->index == NEXTINDEX(ep->page)) {
577 mpool_put(t->bt_mp, h, 0);
579 return (RET_SPECIAL);
580 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
586 return (RET_SUCCESS);
591 * Set the cursor to an entry in the tree.
599 __bt_setcur(t, pgno, idx)
604 /* Lose any already deleted key. */
605 if (t->bt_cursor.key.data != NULL) {
606 free(t->bt_cursor.key.data);
607 t->bt_cursor.key.size = 0;
608 t->bt_cursor.key.data = NULL;
610 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
612 /* Update the cursor. */
613 t->bt_cursor.pg.pgno = pgno;
614 t->bt_cursor.pg.index = idx;
615 F_SET(&t->bt_cursor, CURS_INIT);