Remove definition of builtin function
[platform/upstream/db4.git] / btree / bt_rsearch.c
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5  */
6 /*
7  * Copyright (c) 1990, 1993, 1994, 1995, 1996
8  *      Keith Bostic.  All rights reserved.
9  */
10 /*
11  * Copyright (c) 1990, 1993
12  *      The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * $Id$
39  */
40
41 #include "db_config.h"
42
43 #include "db_int.h"
44 #include "dbinc/db_page.h"
45 #include "dbinc/btree.h"
46 #include "dbinc/lock.h"
47 #include "dbinc/mp.h"
48
49 /*
50  * __bam_rsearch --
51  *      Search a btree for a record number.
52  *
53  * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
54  */
55 int
56 __bam_rsearch(dbc, recnop, flags, stop, exactp)
57         DBC *dbc;
58         db_recno_t *recnop;
59         u_int32_t flags;
60         int stop, *exactp;
61 {
62         BINTERNAL *bi;
63         BTREE_CURSOR *cp;
64         DB *dbp;
65         DB_LOCK lock;
66         DB_MPOOLFILE *mpf;
67         ENV *env;
68         PAGE *h;
69         RINTERNAL *ri;
70         db_indx_t adjust, deloffset, indx, top;
71         db_lockmode_t lock_mode;
72         db_pgno_t pg;
73         db_recno_t recno, t_recno, total;
74         u_int32_t get_mode;
75         int ret, stack, t_ret;
76
77         dbp = dbc->dbp;
78         env = dbp->env;
79         mpf = dbp->mpf;
80         cp = (BTREE_CURSOR *)dbc->internal;
81         h = NULL;
82
83         BT_STK_CLR(cp);
84
85         /*
86          * There are several ways we search a btree tree.  The flags argument
87          * specifies if we're acquiring read or write locks and if we are
88          * locking pairs of pages.  In addition, if we're adding or deleting
89          * an item, we have to lock the entire tree, regardless.  See btree.h
90          * for more details.
91          *
92          * If write-locking pages, we need to know whether or not to acquire a
93          * write lock on a page before getting it.  This depends on how deep it
94          * is in tree, which we don't know until we acquire the root page.  So,
95          * if we need to lock the root page we may have to upgrade it later,
96          * because we won't get the correct lock initially.
97          *
98          * Retrieve the root page.
99          */
100
101         if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0)
102                 return (ret);
103         lock_mode = cp->csp->lock_mode;
104         get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
105         lock = cp->csp->lock;
106         h = cp->csp->page;
107
108         BT_STK_CLR(cp);
109         /*
110          * If appending to the tree, set the record number now -- we have the
111          * root page locked.
112          *
113          * Delete only deletes exact matches, read only returns exact matches.
114          * Note, this is different from __bam_search(), which returns non-exact
115          * matches for read.
116          *
117          * The record may not exist.  We can only return the correct location
118          * for the record immediately after the last record in the tree, so do
119          * a fast check now.
120          */
121         total = RE_NREC(h);
122         if (LF_ISSET(SR_APPEND)) {
123                 *exactp = 0;
124                 *recnop = recno = total + 1;
125         } else {
126                 recno = *recnop;
127                 if (recno <= total)
128                         *exactp = 1;
129                 else {
130                         *exactp = 0;
131                         if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
132                                 /*
133                                  * Keep the page locked for serializability.
134                                  *
135                                  * XXX
136                                  * This leaves the root page locked, which will
137                                  * eliminate any concurrency.  A possible fix
138                                  * would be to lock the last leaf page instead.
139                                  */
140                                 ret = __memp_fput(mpf,
141                                     dbc->thread_info, h, dbc->priority);
142                                 if ((t_ret =
143                                     __TLPUT(dbc, lock)) != 0 && ret == 0)
144                                         ret = t_ret;
145                                 return (ret == 0 ? DB_NOTFOUND : ret);
146                         }
147                 }
148         }
149
150         /*
151          * !!!
152          * Record numbers in the tree are 0-based, but the recno is
153          * 1-based.  All of the calculations below have to take this
154          * into account.
155          */
156         for (total = 0;;) {
157                 switch (TYPE(h)) {
158                 case P_LBTREE:
159                         if (LF_ISSET(SR_MAX)) {
160                                 indx = NUM_ENT(h) - 2;
161                                 goto enter;
162                         }
163                         /* FALLTHROUGH */
164                 case P_LDUP:
165                         if (LF_ISSET(SR_MAX)) {
166                                 indx = NUM_ENT(h) - 1;
167                                 goto enter;
168                         }
169                         recno -= total;
170                         /*
171                          * There may be logically deleted records on the page.
172                          * If there are enough, the record may not exist.
173                          */
174                         if (TYPE(h) == P_LBTREE) {
175                                 adjust = P_INDX;
176                                 deloffset = O_INDX;
177                         } else {
178                                 adjust = O_INDX;
179                                 deloffset = 0;
180                         }
181                         for (t_recno = 0, indx = 0;; indx += adjust) {
182                                 if (indx >= NUM_ENT(h)) {
183                                         *exactp = 0;
184                                         if (!LF_ISSET(SR_PAST_EOF) ||
185                                             recno > t_recno + 1) {
186                                                 ret = __memp_fput(mpf,
187                                                     dbc->thread_info,
188                                                     h, dbc->priority);
189                                                 h = NULL;
190                                                 if ((t_ret = __TLPUT(dbc,
191                                                     lock)) != 0 && ret == 0)
192                                                         ret = t_ret;
193                                                 if (ret == 0)
194                                                         ret = DB_NOTFOUND;
195                                                 goto err;
196                                         }
197                                 }
198                                 if (!B_DISSET(GET_BKEYDATA(dbp, h,
199                                     indx + deloffset)->type) &&
200                                     ++t_recno == recno)
201                                         break;
202                         }
203
204                         BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
205                         if (ret != 0)
206                                 goto err;
207                         if (LF_ISSET(SR_BOTH))
208                                 goto get_prev;
209                         return (0);
210                 case P_IBTREE:
211                         if (LF_ISSET(SR_MAX)) {
212                                 indx = NUM_ENT(h);
213                                 bi = GET_BINTERNAL(dbp, h, indx - 1);
214                         } else for (indx = 0, top = NUM_ENT(h);;) {
215                                 bi = GET_BINTERNAL(dbp, h, indx);
216                                 if (++indx == top || total + bi->nrecs >= recno)
217                                         break;
218                                 total += bi->nrecs;
219                         }
220                         pg = bi->pgno;
221                         break;
222                 case P_LRECNO:
223                         if (LF_ISSET(SR_MAX))
224                                 recno = NUM_ENT(h);
225                         else
226                                 recno -= total;
227
228                         /* Correct from 1-based to 0-based for a page offset. */
229                         --recno;
230 enter:                  BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
231                         if (ret != 0)
232                                 goto err;
233                         if (LF_ISSET(SR_BOTH)) {
234 get_prev:                       DB_ASSERT(env, LF_ISSET(SR_NEXT));
235                                 /*
236                                  * We have a NEXT tree, now add the sub tree
237                                  * that points gets to the previous page.
238                                  */
239                                 cp->csp++;
240                                 indx = cp->sp->indx - 1;
241                                 h = cp->sp->page;
242                                 if (TYPE(h) == P_IRECNO) {
243                                         ri = GET_RINTERNAL(dbp, h, indx);
244                                         pg = ri->pgno;
245                                 } else {
246                                         DB_ASSERT(env, TYPE(h) == P_IBTREE);
247                                         bi = GET_BINTERNAL(dbp, h, indx);
248                                         pg = bi->pgno;
249                                 }
250                                 LF_CLR(SR_NEXT | SR_BOTH);
251                                 LF_SET(SR_MAX);
252                                 stack = 1;
253                                 h = NULL;
254                                 goto lock_next;
255                         }
256                         return (0);
257                 case P_IRECNO:
258                         if (LF_ISSET(SR_MAX)) {
259                                 indx = NUM_ENT(h);
260                                 ri = GET_RINTERNAL(dbp, h, indx - 1);
261                         } else for (indx = 0, top = NUM_ENT(h);;) {
262                                 ri = GET_RINTERNAL(dbp, h, indx);
263                                 if (++indx == top || total + ri->nrecs >= recno)
264                                         break;
265                                 total += ri->nrecs;
266                         }
267                         pg = ri->pgno;
268                         break;
269                 default:
270                         return (__db_pgfmt(env, h->pgno));
271                 }
272                 --indx;
273
274                 /* Return if this is the lowest page wanted. */
275                 if (stop == LEVEL(h)) {
276                         BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
277                         if (ret != 0)
278                                 goto err;
279                         return (0);
280                 }
281                 if (stack) {
282                         BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
283                         if (ret != 0)
284                                 goto err;
285                         h = NULL;
286
287                         lock_mode = DB_LOCK_WRITE;
288                         get_mode = DB_MPOOL_DIRTY;
289                         if ((ret =
290                             __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
291                                 goto err;
292                 } else if (LF_ISSET(SR_NEXT)) {
293                         /*
294                          * For RECNO if we are doing a NEXT search the
295                          * search recno is the one we are looking for
296                          * but we want to keep the stack from the spanning
297                          * node on down.  We only know we have the spanning
298                          * node when its child's index is 0, so save
299                          * each node and discard the tree when we find out
300                          * its not needed.
301                          */
302                         if (indx != 0 && cp->sp->page != NULL) {
303                                 BT_STK_POP(cp);
304                                 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
305                                         goto err;
306                         }
307
308                         BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
309                         h = NULL;
310                         if (ret != 0)
311                                 goto err;
312 lock_next:              if ((ret =
313                             __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
314                                 goto err;
315                 } else {
316                         /*
317                          * Decide if we want to return a pointer to the next
318                          * page in the stack.  If we do, write lock it and
319                          * never unlock it.
320                          */
321                         if ((LF_ISSET(SR_PARENT) &&
322                             (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
323                             (LEVEL(h) - 1) == LEAFLEVEL)
324                                 stack = 1;
325
326                         if ((ret = __memp_fput(mpf,
327                             dbc->thread_info, h, dbc->priority)) != 0)
328                                 goto err;
329                         h = NULL;
330
331                         lock_mode = stack &&
332                             LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
333                         if (lock_mode == DB_LOCK_WRITE)
334                                 get_mode = DB_MPOOL_DIRTY;
335                         if ((ret = __db_lget(dbc,
336                             LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
337                                 /*
338                                  * If we fail, discard the lock we held.  This
339                                  * is OK because this only happens when we are
340                                  * descending the tree holding read-locks.
341                                  */
342                                 (void)__LPUT(dbc, lock);
343                                 goto err;
344                         }
345                 }
346
347                 if ((ret = __memp_fget(mpf, &pg,
348                      dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
349                         goto err;
350         }
351         /* NOTREACHED */
352
353 err:    if (h != NULL && (t_ret = __memp_fput(mpf,
354             dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
355                 ret = t_ret;
356
357         BT_STK_POP(cp);
358         (void)__bam_stkrel(dbc, 0);
359
360         return (ret);
361 }
362
363 /*
364  * __bam_adjust --
365  *      Adjust the tree after adding or deleting a record.
366  *
367  * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
368  */
369 int
370 __bam_adjust(dbc, adjust)
371         DBC *dbc;
372         int32_t adjust;
373 {
374         BTREE_CURSOR *cp;
375         DB *dbp;
376         DB_MPOOLFILE *mpf;
377         EPG *epg;
378         PAGE *h;
379         db_pgno_t root_pgno;
380         int ret;
381
382         dbp = dbc->dbp;
383         mpf = dbp->mpf;
384         cp = (BTREE_CURSOR *)dbc->internal;
385         root_pgno = cp->root;
386
387         /* Update the record counts for the tree. */
388         for (epg = cp->sp; epg <= cp->csp; ++epg) {
389                 h = epg->page;
390                 if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
391                         ret = __memp_dirty(mpf, &h,
392                             dbc->thread_info, dbc->txn, dbc->priority, 0);
393                         epg->page = h;
394                         if (ret != 0)
395                                 return (ret);
396                         if (DBC_LOGGING(dbc)) {
397                                 if ((ret = __bam_cadjust_log(dbp, dbc->txn,
398                                     &LSN(h), 0, PGNO(h), &LSN(h),
399                                     (u_int32_t)epg->indx, adjust,
400                                     PGNO(h) == root_pgno ?
401                                     CAD_UPDATEROOT : 0)) != 0)
402                                         return (ret);
403                         } else
404                                 LSN_NOT_LOGGED(LSN(h));
405
406                         if (TYPE(h) == P_IBTREE)
407                                 GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
408                                     adjust;
409                         else
410                                 GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
411                                     adjust;
412
413                         if (PGNO(h) == root_pgno)
414                                 RE_NREC_ADJ(h, adjust);
415                 }
416         }
417         return (0);
418 }
419
420 /*
421  * __bam_nrecs --
422  *      Return the number of records in the tree.
423  *
424  * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
425  */
426 int
427 __bam_nrecs(dbc, rep)
428         DBC *dbc;
429         db_recno_t *rep;
430 {
431         DB *dbp;
432         DB_LOCK lock;
433         DB_MPOOLFILE *mpf;
434         PAGE *h;
435         db_pgno_t pgno;
436         int ret, t_ret;
437
438         dbp = dbc->dbp;
439         mpf = dbp->mpf;
440
441         pgno = dbc->internal->root;
442         if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
443                 return (ret);
444         if ((ret = __memp_fget(mpf, &pgno,
445              dbc->thread_info, dbc->txn, 0, &h)) != 0)
446                 return (ret);
447
448         *rep = RE_NREC(h);
449
450         ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
451         if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
452                 ret = t_ret;
453
454         return (ret);
455 }
456
457 /*
458  * __bam_total --
459  *      Return the number of records below a page.
460  *
461  * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
462  */
463 db_recno_t
464 __bam_total(dbp, h)
465         DB *dbp;
466         PAGE *h;
467 {
468         db_recno_t nrecs;
469         db_indx_t indx, top;
470
471         nrecs = 0;
472         top = NUM_ENT(h);
473
474         switch (TYPE(h)) {
475         case P_LBTREE:
476                 /* Check for logically deleted records. */
477                 for (indx = 0; indx < top; indx += P_INDX)
478                         if (!B_DISSET(
479                             GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
480                                 ++nrecs;
481                 break;
482         case P_LDUP:
483                 /* Check for logically deleted records. */
484                 for (indx = 0; indx < top; indx += O_INDX)
485                         if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
486                                 ++nrecs;
487                 break;
488         case P_IBTREE:
489                 for (indx = 0; indx < top; indx += O_INDX)
490                         nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
491                 break;
492         case P_LRECNO:
493                 nrecs = NUM_ENT(h);
494                 break;
495         case P_IRECNO:
496                 for (indx = 0; indx < top; indx += O_INDX)
497                         nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
498                 break;
499         }
500
501         return (nrecs);
502 }