Remove definition of builtin function
[platform/upstream/db4.git] / btree / bt_search.c
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5  */
6 /*
7  * Copyright (c) 1990, 1993, 1994, 1995, 1996
8  *      Keith Bostic.  All rights reserved.
9  */
10 /*
11  * Copyright (c) 1990, 1993, 1994, 1995
12  *      The Regents of the University of California.  All rights reserved.
13  *
14  * This code is derived from software contributed to Berkeley by
15  * Mike Olson.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. Neither the name of the University nor the names of its contributors
26  *    may be used to endorse or promote products derived from this software
27  *    without specific prior written permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  *
41  * $Id$
42  */
43
44 #include "db_config.h"
45
46 #include "db_int.h"
47 #include "dbinc/db_page.h"
48 #include "dbinc/btree.h"
49 #include "dbinc/lock.h"
50 #include "dbinc/mp.h"
51
52 /*
53  * __bam_get_root --
54  *      Fetch the root of a tree and see if we want to keep
55  * it in the stack.
56  *
57  * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
58  */
59 int
60 __bam_get_root(dbc, pg, slevel, flags, stack)
61         DBC *dbc;
62         db_pgno_t pg;
63         int slevel;
64         u_int32_t flags;
65         int *stack;
66 {
67         BTREE_CURSOR *cp;
68         DB *dbp;
69         DB_LOCK lock;
70         DB_MPOOLFILE *mpf;
71         PAGE *h;
72         db_lockmode_t lock_mode;
73         u_int32_t get_mode;
74         int ret, t_ret;
75
76         LOCK_INIT(lock);
77         dbp = dbc->dbp;
78         mpf = dbp->mpf;
79         cp = (BTREE_CURSOR *)dbc->internal;
80         /*
81          * If write-locking pages, we need to know whether or not to acquire a
82          * write lock on a page before getting it.  This depends on how deep it
83          * is in tree, which we don't know until we acquire the root page.  So,
84          * if we need to lock the root page we may have to upgrade it later,
85          * because we won't get the correct lock initially.
86          *
87          * Retrieve the root page.
88          */
89 try_again:
90         *stack = LF_ISSET(SR_STACK) &&
91               (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM));
92         lock_mode = DB_LOCK_READ;
93         if (*stack ||
94             LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE)))
95                 lock_mode = DB_LOCK_WRITE;
96         if ((lock_mode == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) ||
97             dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))) {
98 lock_it:        if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
99                         return (ret);
100         }
101
102         /*
103          * Get the root.  If the root happens to be a leaf page then
104          * we are supposed to get a read lock on it before latching
105          * it.  So if we have not locked it do a try get first.
106          * If we can't get the root shared, then get a lock on it and
107          * then wait for the latch.
108          */
109         if (lock_mode == DB_LOCK_WRITE)
110                 get_mode = DB_MPOOL_DIRTY;
111         else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc))
112                 get_mode = 0;
113         else
114                 get_mode = DB_MPOOL_TRY;
115
116         if ((ret = __memp_fget(mpf, &pg,
117             dbc->thread_info, dbc->txn, get_mode, &h)) != 0) {
118                 if (ret == DB_LOCK_NOTGRANTED)
119                         goto lock_it;
120                 /* Did not read it, so we can release the lock */
121                 (void)__LPUT(dbc, lock);
122                 return (ret);
123         }
124
125         /*
126          * Decide if we need to dirty and/or lock this page.
127          * We must not hold the latch while we get the lock.
128          */
129         if (!*stack &&
130             ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
131             LEVEL(h) == LEAFLEVEL ||
132             (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
133                 *stack = 1;
134                 /* If we already have the write lock, we are done. */
135                 if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) {
136                         if (lock_mode == DB_LOCK_WRITE)
137                                 goto done;
138                         if ((ret = __LPUT(dbc, lock)) != 0)
139                                 return (ret);
140                 }
141
142                 /*
143                  * Now that we know what level the root is at, do we need a
144                  * write lock?  If not and we got the lock before latching
145                  * we are done.
146                  */
147                 if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) {
148                         lock_mode = DB_LOCK_WRITE;
149                         /* Drop the read lock if we got it above. */
150                         if ((ret = __LPUT(dbc, lock)) != 0)
151                                 return (ret);
152                 } else if (LOCK_ISSET(lock))
153                         goto done;
154                 if (!STD_LOCKING(dbc)) {
155                         if (lock_mode != DB_LOCK_WRITE)
156                                 goto done;
157                         if ((ret = __memp_dirty(mpf, &h, dbc->thread_info,
158                             dbc->txn, dbc->priority, 0)) != 0) {
159                                 if (h != NULL)
160                                         (void)__memp_fput(mpf,
161                                             dbc->thread_info, h, dbc->priority);
162                                 return (ret);
163                         }
164                 } else {
165                         /* Try to lock the page without waiting first. */
166                         if ((ret = __db_lget(dbc,
167                             0, pg, lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) {
168                                 if (lock_mode == DB_LOCK_WRITE && (ret =
169                                     __memp_dirty(mpf, &h, dbc->thread_info,
170                                     dbc->txn, dbc->priority, 0)) != 0) {
171                                         if (h != NULL)
172                                                 (void)__memp_fput(mpf,
173                                                     dbc->thread_info, h,
174                                                     dbc->priority);
175                                         return (ret);
176                                 }
177                                 goto done;
178                         }
179
180                         t_ret = __memp_fput(mpf,
181                             dbc->thread_info, h, dbc->priority);
182
183                         if (ret == DB_LOCK_DEADLOCK ||
184                             ret == DB_LOCK_NOTGRANTED)
185                                 ret = 0;
186                         if (ret == 0)
187                                 ret = t_ret;
188
189                         if (ret != 0)
190                                 return (ret);
191
192                         if ((ret = __db_lget(dbc,
193                              0, pg, lock_mode, 0, &lock)) != 0)
194                                 return (ret);
195                         if ((ret = __memp_fget(mpf,
196                              &pg, dbc->thread_info, dbc->txn,
197                              lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0,
198                              &h)) != 0) {
199                                 /* Did not read it, release the lock */
200                                 (void)__LPUT(dbc, lock);
201                                 return (ret);
202                         }
203                 }
204                 /*
205                  * While getting dirty or locked we need to drop the mutex
206                  * so someone else could get in and split the root.
207                  */
208                 if (!((LF_ISSET(SR_PARENT) &&
209                     (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
210                     LEVEL(h) == LEAFLEVEL ||
211                     (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
212                         /* Someone else split the root, start over. */
213                         ret = __memp_fput(mpf,
214                             dbc->thread_info, h, dbc->priority);
215                         if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
216                                 ret = t_ret;
217                         if (ret != 0)
218                                 return (ret);
219                         goto try_again;
220                 }
221         }
222
223 done:   BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret);
224
225         return (ret);
226 }
227
228 /*
229  * __bam_search --
230  *      Search a btree for a key.
231  *
232  * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
233  * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
234  */
235 int
236 __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
237         DBC *dbc;
238         db_pgno_t root_pgno;
239         const DBT *key;
240         u_int32_t flags;
241         int slevel, *exactp;
242         db_recno_t *recnop;
243 {
244         BTREE *t;
245         BTREE_CURSOR *cp;
246         DB *dbp;
247         DB_LOCK lock, saved_lock;
248         DB_MPOOLFILE *mpf;
249         ENV *env;
250         PAGE *h, *parent_h;
251         db_indx_t base, i, indx, *inp, lim;
252         db_lockmode_t lock_mode;
253         db_pgno_t pg, saved_pg;
254         db_recno_t recno;
255         int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
256         int getlock, was_next;
257         int (*func) __P((DB *, const DBT *, const DBT *));
258         u_int32_t get_mode, wait;
259         u_int8_t level, saved_level;
260
261         dbp = dbc->dbp;
262         env = dbp->env;
263         mpf = dbp->mpf;
264         cp = (BTREE_CURSOR *)dbc->internal;
265         h = NULL;
266         parent_h = NULL;
267         t = dbp->bt_internal;
268         recno = 0;
269         t_ret = 0;
270
271         BT_STK_CLR(cp);
272         LOCK_INIT(saved_lock);
273         LOCK_INIT(lock);
274         was_next = LF_ISSET(SR_NEXT);
275         wait = DB_LOCK_NOWAIT;
276
277         /*
278          * There are several ways we search a btree tree.  The flags argument
279          * specifies if we're acquiring read or write latches, if we position
280          * to the first or last item in a set of duplicates, if we return
281          * deleted items, and if we are latching pairs of pages.  In addition,
282          * if we're modifying record numbers, we have to latch the entire tree
283          * regardless.  See btree.h for more details.
284          */
285
286         if (root_pgno == PGNO_INVALID)
287                 root_pgno = cp->root;
288         saved_pg = root_pgno;
289         saved_level = MAXBTREELEVEL;
290 retry:  if ((ret = __bam_get_root(dbc, root_pgno, slevel, flags, &stack)) != 0)
291                 goto err;
292         lock_mode = cp->csp->lock_mode;
293         get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
294         h = cp->csp->page;
295         pg = PGNO(h);
296         lock = cp->csp->lock;
297         set_stack = stack;
298         /*
299          * Determine if we need to lock interiror nodes.
300          * If we have record numbers we always lock.  Otherwise we only
301          * need to do this if we are write locking and we are returning
302          * a stack of nodes.  SR_NEXT will eventually get a stack and
303          * release the locks above that level.
304          */
305         if (F_ISSET(dbc, DBC_DOWNREV)) {
306                 getlock = 1;
307                 wait = 0;
308         } else
309                 getlock = F_ISSET(cp, C_RECNUM) ||
310                    (lock_mode == DB_LOCK_WRITE &&
311                    (stack || LF_ISSET(SR_NEXT | SR_DEL)));
312
313         /*
314          * If we are asked a level that is above the root,
315          * just return the root.  This can happen if the tree
316          * collapses while we are trying to lock the root.
317          */
318         if (!LF_ISSET(SR_START) && LEVEL(h) < slevel)
319                 goto done;
320
321         BT_STK_CLR(cp);
322
323         /* Choose a comparison function. */
324         func = F_ISSET(dbc, DBC_OPD) ?
325             (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
326             t->bt_compare;
327
328         for (;;) {
329                 if (TYPE(h) == P_LBTREE)
330                         adjust = P_INDX;
331                 else {
332                         /*
333                          * It is possible to catch an internal page as a change
334                          * is being backed out.  Its leaf pages will be locked
335                          * but we must be sure we get to one.  If the page
336                          * is not populated enough lock it.
337                          */
338                         if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) {
339                                 getlock = 1;
340                                 level = LEVEL(h) + 1;
341                                 if ((ret = __memp_fput(mpf, dbc->thread_info,
342                                      h, dbc->priority)) != 0)
343                                         goto err;
344                                 goto lock_next;
345                         }
346                         adjust = O_INDX;
347                 }
348                 inp = P_INP(dbp, h);
349                 if (LF_ISSET(SR_MIN | SR_MAX)) {
350                         if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0)
351                                 indx = 0;
352                         else if (TYPE(h) == P_LBTREE)
353                                 indx = NUM_ENT(h) - 2;
354                         else
355                                 indx = NUM_ENT(h) - 1;
356
357                         if (LEVEL(h) == LEAFLEVEL ||
358                              (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
359                                 if (LF_ISSET(SR_NEXT))
360                                         goto get_next;
361                                 goto found;
362                         }
363                         goto next;
364                 }
365                 /*
366                  * Do a binary search on the current page.  If we're searching
367                  * a Btree leaf page, we have to walk the indices in groups of
368                  * two.  If we're searching an internal page or a off-page dup
369                  * page, they're an index per page item.  If we find an exact
370                  * match on a leaf page, we're done.
371                  */
372                 DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
373                         DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
374                         if ((ret = __bam_cmp(dbc, key, h, indx,
375                             func, &cmp)) != 0)
376                                 goto err;
377                         if (cmp == 0) {
378                                 if (LEVEL(h) == LEAFLEVEL ||
379                                     (!LF_ISSET(SR_START) &&
380                                     LEVEL(h) == slevel)) {
381                                         if (LF_ISSET(SR_NEXT))
382                                                 goto get_next;
383                                         goto found;
384                                 }
385                                 goto next;
386                         }
387                         if (cmp > 0)
388                                 DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
389                                     lim, adjust);
390                 }
391
392                 /*
393                  * No match found.  Base is the smallest index greater than
394                  * key and may be zero or a last + O_INDX index.
395                  *
396                  * If it's a leaf page or the stopping point,
397                  * return base as the "found" value.
398                  * Delete only deletes exact matches.
399                  */
400                 if (LEVEL(h) == LEAFLEVEL ||
401                     (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
402                         *exactp = 0;
403
404                         if (LF_ISSET(SR_EXACT)) {
405                                 ret = DB_NOTFOUND;
406                                 goto err;
407                         }
408
409                         if (LF_ISSET(SR_STK_ONLY)) {
410                                 BT_STK_NUM(env, cp, h, base, ret);
411                                 if ((t_ret =
412                                     __LPUT(dbc, lock)) != 0 && ret == 0)
413                                         ret = t_ret;
414                                 if ((t_ret = __memp_fput(mpf, dbc->thread_info,
415                                      h, dbc->priority)) != 0 && ret == 0)
416                                         ret = t_ret;
417                                 h = NULL;
418                                 if (ret != 0)
419                                         goto err;
420                                 goto done;
421                         }
422                         if (LF_ISSET(SR_NEXT)) {
423 get_next:                       /*
424                                  * The caller could have asked for a NEXT
425                                  * at the root if the tree recently collapsed.
426                                  */
427                                 if (PGNO(h) == root_pgno) {
428                                         ret = DB_NOTFOUND;
429                                         goto err;
430                                 }
431
432                                 indx = cp->sp->indx + 1;
433                                 if (indx == NUM_ENT(cp->sp->page)) {
434                                         ret = DB_NOTFOUND;
435                                         cp->csp++;
436                                         goto err;
437                                 }
438                                 /*
439                                  * If we want both the key page and the next
440                                  * page, push the key page on the stack
441                                  * otherwise save the root of the subtree
442                                  * and drop the rest of the subtree.
443                                  * Search down again starting at the
444                                  * next child of the root of this subtree.
445                                  */
446                                 LF_SET(SR_MIN);
447                                 LF_CLR(SR_NEXT);
448                                 set_stack = stack = 1;
449                                 if (LF_ISSET(SR_BOTH)) {
450                                         cp->csp++;
451                                         BT_STK_PUSH(env,
452                                             cp, h, indx, lock, lock_mode, ret);
453                                         if (ret != 0)
454                                                 goto err;
455                                         LOCK_INIT(lock);
456                                         h = cp->sp->page;
457                                         pg = GET_BINTERNAL(dbp, h, indx)->pgno;
458                                         level = LEVEL(h);
459                                         h = NULL;
460                                         goto lock_next;
461                                 } else {
462                                         if ((ret = __LPUT(dbc, lock)) != 0)
463                                                 goto err;
464                                         if ((ret = __memp_fput(mpf,
465                                             dbc->thread_info,
466                                             h, dbc->priority)) != 0)
467                                                 goto err;
468                                         h = cp->sp->page;
469                                         cp->sp->page = NULL;
470                                         lock = cp->sp->lock;
471                                         LOCK_INIT(cp->sp->lock);
472                                         if ((ret = __bam_stkrel(dbc,
473                                             STK_NOLOCK)) != 0)
474                                                 goto err;
475                                         goto next;
476                                 }
477                         }
478
479                         /*
480                          * !!!
481                          * Possibly returning a deleted record -- DB_SET_RANGE,
482                          * DB_KEYFIRST and DB_KEYLAST don't require an exact
483                          * match, and we don't want to walk multiple pages here
484                          * to find an undeleted record.  This is handled by the
485                          * calling routine.
486                          */
487                         if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
488                                 cp->csp++;
489                         BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret);
490                         if (ret != 0)
491                                 goto err;
492                         goto done;
493                 }
494
495                 /*
496                  * If it's not a leaf page, record the internal page (which is
497                  * a parent page for the key).  Decrement the base by 1 if it's
498                  * non-zero so that if a split later occurs, the inserted page
499                  * will be to the right of the saved page.
500                  */
501                 indx = base > 0 ? base - O_INDX : base;
502
503                 /*
504                  * If we're trying to calculate the record number, sum up
505                  * all the record numbers on this page up to the indx point.
506                  */
507 next:           if (recnop != NULL)
508                         for (i = 0; i < indx; ++i)
509                                 recno += GET_BINTERNAL(dbp, h, i)->nrecs;
510
511                 pg = GET_BINTERNAL(dbp, h, indx)->pgno;
512                 level = LEVEL(h);
513
514                 /* See if we are at the level to start stacking. */
515                 if (LF_ISSET(SR_START) && slevel == level)
516                         set_stack = stack = 1;
517
518                 if (LF_ISSET(SR_STK_ONLY)) {
519                         if (slevel == LEVEL(h)) {
520                                 BT_STK_NUM(env, cp, h, indx, ret);
521                                 if ((t_ret = __memp_fput(mpf, dbc->thread_info,
522                                     h, dbc->priority)) != 0 && ret == 0)
523                                         ret = t_ret;
524                                 h = NULL;
525                                 if (ret != 0)
526                                         goto err;
527                                 goto done;
528                         }
529                         BT_STK_NUMPUSH(env, cp, h, indx, ret);
530                         (void)__memp_fput(mpf,
531                             dbc->thread_info, h, dbc->priority);
532                         h = NULL;
533                 } else if (stack) {
534                         /* Return if this is the lowest page wanted. */
535                         if (LF_ISSET(SR_PARENT) && slevel == level) {
536                                 BT_STK_ENTER(env,
537                                     cp, h, indx, lock, lock_mode, ret);
538                                 if (ret != 0)
539                                         goto err;
540                                 goto done;
541                         }
542                         if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) {
543                                 /*
544                                  * There was a page with a singleton pointer
545                                  * to a non-empty subtree.
546                                  */
547                                 cp->csp--;
548                                 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
549                                         goto err;
550                                 set_stack = stack = 0;
551                                 goto do_del;
552                         }
553                         BT_STK_PUSH(env,
554                             cp, h, indx, lock, lock_mode, ret);
555                         if (ret != 0)
556                                 goto err;
557
558                         LOCK_INIT(lock);
559                         get_mode = DB_MPOOL_DIRTY;
560                         lock_mode = DB_LOCK_WRITE;
561                         goto lock_next;
562                 } else {
563                         /*
564                          * Decide if we want to return a reference to the next
565                          * page in the return stack.  If so, latch it and don't
566                          * unlatch it.  We will want to stack things on the
567                          * next iteration.  The stack variable cannot be
568                          * set until we leave this clause. If we are locking
569                          * then we must lock this level before getting the page.
570                          */
571                         if ((LF_ISSET(SR_PARENT) &&
572                             (u_int8_t)(slevel + 1) >= (level - 1)) ||
573                             (level - 1) == LEAFLEVEL)
574                                 set_stack = 1;
575
576                         /*
577                          * Check for a normal search.  If so, we need to
578                          * latch couple the parent/chid buffers.
579                          */
580                         if (!LF_ISSET(SR_DEL | SR_NEXT)) {
581                                 parent_h = h;
582                                 goto lock_next;
583                         }
584
585                         /*
586                          * Returning a subtree.  See if we have hit the start
587                          * point if so save the parent and set stack.
588                          * Otherwise free the parent and temporarily
589                          * save this one.
590                          * For SR_DEL we need to find a page with 1 entry.
591                          * For SR_NEXT we want find the minimal subtree
592                          * that contains the key and the next page.
593                          * We save pages as long as we are at the right
594                          * edge of the subtree.  When we leave the right
595                          * edge, then drop the subtree.
596                          */
597
598                         if ((LF_ISSET(SR_DEL) && NUM_ENT(h) == 1)) {
599                                 /*
600                                  * We are pushing the things on the stack,
601                                  * set the stack variable now to indicate this
602                                  * has happened.
603                                  */
604                                 stack = set_stack = 1;
605                                 LF_SET(SR_WRITE);
606                                 /* Push the parent. */
607                                 cp->csp++;
608                                 /* Push this node. */
609                                 BT_STK_PUSH(env, cp, h,
610                                      indx, lock, DB_LOCK_NG, ret);
611                                 if (ret != 0)
612                                         goto err;
613                                 LOCK_INIT(lock);
614                         } else {
615                         /*
616                          * See if we want to save the tree so far.
617                          * If we are looking for the next key,
618                          * then we must save this node if we are
619                          * at the end of the page.  If not then
620                          * discard anything we have saved so far.
621                          * For delete only keep one node until
622                          * we find a singleton.
623                          */
624 do_del:                         if (cp->csp->page != NULL) {
625                                         if (LF_ISSET(SR_NEXT) &&
626                                              indx == NUM_ENT(h) - 1)
627                                                 cp->csp++;
628                                         else if ((ret =
629                                             __bam_stkrel(dbc, STK_NOLOCK)) != 0)
630                                                 goto err;
631                                 }
632                                 /* Save this node. */
633                                 BT_STK_ENTER(env, cp,
634                                     h, indx, lock, lock_mode, ret);
635                                 if (ret != 0)
636                                         goto err;
637                                 LOCK_INIT(lock);
638                         }
639
640 lock_next:              h = NULL;
641
642                         if (set_stack && LF_ISSET(SR_WRITE)) {
643                                 lock_mode = DB_LOCK_WRITE;
644                                 get_mode = DB_MPOOL_DIRTY;
645                                 getlock = 1;
646                         }
647                         /*
648                          * If we are retrying and we are back at the same
649                          * page then we already have it locked.  If we are
650                          * at a different page we want to lock couple and
651                          * release that lock.
652                          */
653                         if (level - 1 == saved_level) {
654                                 if ((ret = __LPUT(dbc, lock)) != 0)
655                                         goto err;
656                                 lock = saved_lock;
657                                 LOCK_INIT(saved_lock);
658                                 saved_level = MAXBTREELEVEL;
659                                 if (pg == saved_pg)
660                                         goto skip_lock;
661                         }
662                         if ((getlock || level - 1 == LEAFLEVEL) &&
663                             (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS,
664                             pg, lock_mode, wait, &lock)) != 0) {
665                                 /*
666                                  * If we are doing DEL or NEXT then we
667                                  * have an extra level saved in the stack,
668                                  * push it so it will get freed.
669                                  */
670                                 if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
671                                         cp->csp++;
672                                 /*
673                                  * If we fail, discard the lock we held.
674                                  * This is ok because we will either search
675                                  * again or exit without actually looking
676                                  * at the data.
677                                  */
678                                 if ((t_ret = __LPUT(dbc, lock)) != 0 &&
679                                     ret == 0)
680                                         ret = t_ret;
681                                 /*
682                                  * If we blocked at a different level release
683                                  * the previous saved lock.
684                                  */
685                                 if ((t_ret = __LPUT(dbc, saved_lock)) != 0 &&
686                                     ret == 0)
687                                         ret = t_ret;
688                                 if (wait == 0 || (ret != DB_LOCK_NOTGRANTED &&
689                                      ret != DB_LOCK_DEADLOCK))
690                                         goto err;
691
692                                 /* Relase the parent if we are holding it. */
693                                 if (parent_h != NULL &&
694                                     (ret = __memp_fput(mpf, dbc->thread_info,
695                                     parent_h, dbc->priority)) != 0)
696                                         goto err;
697                                 parent_h = NULL;
698
699                                 BT_STK_POP(cp);
700                                 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
701                                         goto err;
702                                 if ((ret = __db_lget(dbc,
703                                     0, pg, lock_mode, 0, &saved_lock)) != 0)
704                                         goto err;
705                                 /*
706                                  * A very strange case: if this page was
707                                  * freed while we wait then we cannot hold
708                                  * the lock on it while we reget the root
709                                  * latch because allocation is one place
710                                  * we lock while holding a latch.
711                                  * Noone can have a free page locked, so
712                                  * check for that case.  We do this by
713                                  * checking the level, since it will be 0
714                                  * if free and we might as well see if this
715                                  * page moved and drop the lock in that case.
716                                  */
717                                 if ((ret = __memp_fget(mpf, &pg,
718                                      dbc->thread_info,
719                                      dbc->txn, get_mode, &h)) != 0 &&
720                                      ret != DB_PAGE_NOTFOUND)
721                                         goto err;
722
723                                 if (ret != 0 || LEVEL(h) != level - 1) {
724                                         ret = __LPUT(dbc, saved_lock);
725                                         if (ret != 0)
726                                                 goto err;
727                                         pg = root_pgno;
728                                         saved_level = MAXBTREELEVEL;
729                                 }
730                                 if (h != NULL && (ret = __memp_fput(mpf,
731                                     dbc->thread_info, h, dbc->priority)) != 0)
732                                         goto err;
733                                 h = NULL;
734
735                                 if (was_next) {
736                                         LF_CLR(SR_MIN);
737                                         LF_SET(SR_NEXT);
738                                 }
739                                 /*
740                                  * We have the lock but we dropped the
741                                  * latch so we need to search again. If
742                                  * we get back to the same page then all
743                                  * is good, otherwise we need to try to
744                                  * lock the new page.
745                                  */
746                                 saved_pg = pg;
747                                 saved_level = level - 1;
748                                 goto retry;
749                         }
750 skip_lock:              stack = set_stack;
751                 }
752                 /* Get the child page. */
753                 if ((ret = __memp_fget(mpf, &pg,
754                      dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
755                         goto err;
756                 /* Release the parent. */
757                 if (parent_h != NULL && (ret = __memp_fput(mpf,
758                     dbc->thread_info, parent_h, dbc->priority)) != 0)
759                         goto err;
760                 parent_h = NULL;
761         }
762         /* NOTREACHED */
763
764 found:  *exactp = 1;
765
766         /*
767          * If we got here, we know that we have a Btree leaf or off-page
768          * duplicates page.  If it's a Btree leaf page, we have to handle
769          * on-page duplicates.
770          *
771          * If there are duplicates, go to the first/last one.  This is
772          * safe because we know that we're not going to leave the page,
773          * all duplicate sets that are not on overflow pages exist on a
774          * single leaf page.
775          */
776         if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) {
777                 if (LF_ISSET(SR_DUPLAST))
778                         while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
779                             inp[indx] == inp[indx + P_INDX])
780                                 indx += P_INDX;
781                 else if (LF_ISSET(SR_DUPFIRST))
782                         while (indx > 0 &&
783                             inp[indx] == inp[indx - P_INDX])
784                                 indx -= P_INDX;
785         }
786
787         /*
788          * Now check if we are allowed to return deleted items; if not, then
789          * find the next (or previous) non-deleted duplicate entry.  (We do
790          * not move from the original found key on the basis of the SR_DELNO
791          * flag.)
792          */
793         DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO));
794         if (LF_ISSET(SR_DELNO)) {
795                 deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
796                 if (LF_ISSET(SR_DUPLAST))
797                         while (B_DISSET(GET_BKEYDATA(dbp,
798                             h, indx + deloffset)->type) && indx > 0 &&
799                             inp[indx] == inp[indx - adjust])
800                                 indx -= adjust;
801                 else
802                         while (B_DISSET(GET_BKEYDATA(dbp,
803                             h, indx + deloffset)->type) &&
804                             indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
805                             inp[indx] == inp[indx + adjust])
806                                 indx += adjust;
807
808                 /*
809                  * If we weren't able to find a non-deleted duplicate, return
810                  * DB_NOTFOUND.
811                  */
812                 if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
813                         ret = DB_NOTFOUND;
814                         goto err;
815                 }
816
817                 /*
818                  * Increment the record counter to point to the found element.
819                  * Ignore any deleted key/data pairs.  There doesn't need to
820                  * be any correction for duplicates, as Btree doesn't support
821                  * duplicates and record numbers in the same tree.
822                  */
823                 if (recnop != NULL) {
824                         DB_ASSERT(env, TYPE(h) == P_LBTREE);
825
826                         for (i = 0; i < indx; i += P_INDX)
827                                 if (!B_DISSET(
828                                     GET_BKEYDATA(dbp, h, i + O_INDX)->type))
829                                         ++recno;
830
831                         /* Correct the number for a 0-base. */
832                         *recnop = recno + 1;
833                 }
834         }
835
836         if (LF_ISSET(SR_STK_ONLY)) {
837                 BT_STK_NUM(env, cp, h, indx, ret);
838                 if ((t_ret = __memp_fput(mpf,
839                      dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
840                         ret = t_ret;
841         } else {
842                 if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
843                         cp->csp++;
844                 BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
845         }
846         if (ret != 0)
847                 goto err;
848
849         cp->csp->lock = lock;
850         DB_ASSERT(env, parent_h == NULL);
851
852 done:   if ((ret = __LPUT(dbc, saved_lock)) != 0)
853                 return (ret);
854
855         return (0);
856
857 err:    if (ret == 0)
858                 ret = t_ret;
859         if (h != NULL && (t_ret = __memp_fput(mpf,
860             dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
861                 ret = t_ret;
862         if (parent_h != NULL && (t_ret = __memp_fput(mpf,
863             dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0)
864                 ret = t_ret;
865
866         /* Keep any not-found page locked for serializability. */
867         if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
868                 ret = t_ret;
869
870         (void)__LPUT(dbc, saved_lock);
871
872         BT_STK_POP(cp);
873         (void)__bam_stkrel(dbc, 0);
874
875         return (ret);
876 }
877
878 /*
879  * __bam_stkrel --
880  *      Release all pages currently held in the stack.
881  *
882  * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
883  */
884 int
885 __bam_stkrel(dbc, flags)
886         DBC *dbc;
887         u_int32_t flags;
888 {
889         BTREE_CURSOR *cp;
890         DB *dbp;
891         DB_MPOOLFILE *mpf;
892         EPG *epg;
893         int ret, t_ret;
894
895         DB_ASSERT(NULL, dbc != NULL);
896         dbp = dbc->dbp;
897         mpf = dbp->mpf;
898         cp = (BTREE_CURSOR *)dbc->internal;
899
900         /*
901          * Release inner pages first.
902          *
903          * The caller must be sure that setting STK_NOLOCK will not effect
904          * either serializability or recoverability.
905          */
906         for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
907                 if (epg->page != NULL) {
908                         if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
909                                 cp->page = NULL;
910                                 LOCK_INIT(cp->lock);
911                         }
912                         if ((t_ret = __memp_fput(mpf, dbc->thread_info,
913                              epg->page, dbc->priority)) != 0 && ret == 0)
914                                 ret = t_ret;
915                         epg->page = NULL;
916                 }
917                 /*
918                  * We set this if we need to release our pins,
919                  * but are not logically ready to have the pages
920                  * visible.
921                  */
922                 if (LF_ISSET(STK_PGONLY))
923                         continue;
924                 if (LF_ISSET(STK_NOLOCK)) {
925                         if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0)
926                                 ret = t_ret;
927                 } else
928                         if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
929                                 ret = t_ret;
930         }
931
932         /* Clear the stack, all pages have been released. */
933         if (!LF_ISSET(STK_PGONLY))
934                 BT_STK_CLR(cp);
935
936         return (ret);
937 }
938
939 /*
940  * __bam_stkgrow --
941  *      Grow the stack.
942  *
943  * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
944  */
945 int
946 __bam_stkgrow(env, cp)
947         ENV *env;
948         BTREE_CURSOR *cp;
949 {
950         EPG *p;
951         size_t entries;
952         int ret;
953
954         entries = cp->esp - cp->sp;
955
956         if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0)
957                 return (ret);
958         memcpy(p, cp->sp, entries * sizeof(EPG));
959         if (cp->sp != cp->stack)
960                 __os_free(env, cp->sp);
961         cp->sp = p;
962         cp->csp = p + entries;
963         cp->esp = p + entries * 2;
964         return (0);
965 }