Remove definition of builtin function
[platform/upstream/db4.git] / btree / bt_verify.c
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1999-2009 Oracle.  All rights reserved.
5  *
6  * $Id$
7  */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_verify.h"
14 #include "dbinc/btree.h"
15 #include "dbinc/lock.h"
16 #include "dbinc/mp.h"
17
18 static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
19     PAGE *, u_int32_t, int, DBT *, int *));
20 static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
21     db_indx_t *, u_int32_t));
22 static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
23     BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
24     u_int32_t));
25 static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
26     db_indx_t *, u_int32_t));
27
28 /*
29  * __bam_vrfy_meta --
30  *      Verify the btree-specific part of a metadata page.
31  *
32  * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
33  * PUBLIC:     db_pgno_t, u_int32_t));
34  */
35 int
36 __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
37         DB *dbp;
38         VRFY_DBINFO *vdp;
39         BTMETA *meta;
40         db_pgno_t pgno;
41         u_int32_t flags;
42 {
43         ENV *env;
44         VRFY_PAGEINFO *pip;
45         int isbad, t_ret, ret;
46         db_indx_t ovflsize;
47
48         env = dbp->env;
49         isbad = 0;
50
51         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
52                 return (ret);
53
54         /*
55          * If VRFY_INCOMPLETE is not set, then we didn't come through
56          * __db_vrfy_pagezero and didn't incompletely
57          * check this page--we haven't checked it at all.
58          * Thus we need to call __db_vrfy_meta and check the common fields.
59          *
60          * If VRFY_INCOMPLETE is set, we've already done all the same work
61          * in __db_vrfy_pagezero, so skip the check.
62          */
63         if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
64             (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
65                 if (ret == DB_VERIFY_BAD)
66                         isbad = 1;
67                 else
68                         goto err;
69         }
70
71         /* bt_minkey:  must be >= 2; must produce sensible ovflsize */
72
73         /* avoid division by zero */
74         ovflsize = meta->minkey > 0 ?
75             B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
76
77         if (meta->minkey < 2 ||
78             ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
79                 pip->bt_minkey = 0;
80                 isbad = 1;
81                 EPRINT((env,
82             "Page %lu: nonsensical bt_minkey value %lu on metadata page",
83                     (u_long)pgno, (u_long)meta->minkey));
84         } else
85                 pip->bt_minkey = meta->minkey;
86
87         /* re_len: no constraints on this (may be zero or huge--we make rope) */
88         pip->re_pad = meta->re_pad;
89         pip->re_len = meta->re_len;
90
91         /*
92          * The root must not be current page or 0 and it must be within
93          * database.  If this metadata page is the master meta data page
94          * of the file, then the root page had better be page 1.
95          */
96         pip->root = 0;
97         if (meta->root == PGNO_INVALID ||
98             meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
99             (pgno == PGNO_BASE_MD && meta->root != 1)) {
100                 isbad = 1;
101                 EPRINT((env,
102                     "Page %lu: nonsensical root page %lu on metadata page",
103                     (u_long)pgno, (u_long)meta->root));
104         } else
105                 pip->root = meta->root;
106
107         /* Flags. */
108         if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
109                 F_SET(pip, VRFY_IS_RRECNO);
110
111         if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
112                 /*
113                  * If this is a master db meta page, it had better not have
114                  * duplicates.
115                  */
116                 if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
117                         isbad = 1;
118                         EPRINT((env,
119 "Page %lu: Btree metadata page has both duplicates and multiple databases",
120                             (u_long)pgno));
121                 }
122                 F_SET(pip, VRFY_HAS_SUBDBS);
123         }
124
125         if (F_ISSET(&meta->dbmeta, BTM_DUP))
126                 F_SET(pip, VRFY_HAS_DUPS);
127         if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
128                 F_SET(pip, VRFY_HAS_DUPSORT);
129         if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
130                 F_SET(pip, VRFY_HAS_RECNUMS);
131         if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
132                 EPRINT((env,
133     "Page %lu: Btree metadata page illegally has both recnums and dups",
134                     (u_long)pgno));
135                 isbad = 1;
136         }
137
138         if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
139                 F_SET(pip, VRFY_IS_RECNO);
140                 dbp->type = DB_RECNO;
141         } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
142                 isbad = 1;
143                 EPRINT((env,
144     "Page %lu: metadata page has renumber flag set but is not recno",
145                     (u_long)pgno));
146         }
147
148 #ifdef HAVE_COMPRESSION
149         if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) {
150                 F_SET(pip, VRFY_HAS_COMPRESS);
151                 if (!DB_IS_COMPRESSED(dbp)) {
152                         ((BTREE *)dbp->bt_internal)->bt_compress =
153                             __bam_defcompress;
154                         ((BTREE *)dbp->bt_internal)->bt_decompress =
155                             __bam_defdecompress;
156                 }
157                 /*
158                  * Copy dup_compare to compress_dup_compare, and use the
159                  * compression duplicate compare.
160                  */
161                 if (F_ISSET(pip, VRFY_HAS_DUPSORT)) {
162                         if (dbp->dup_compare == NULL)
163                                 dbp->dup_compare = __bam_defcmp;
164                         if (((BTREE *)dbp->bt_internal)->compress_dup_compare
165                                 == NULL) {
166                                 ((BTREE *)dbp->bt_internal)->
167                                         compress_dup_compare = dbp->dup_compare;
168                                 dbp->dup_compare = __bam_compress_dupcmp;
169                         }
170                 }
171         }
172
173         if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) {
174                 EPRINT((env,
175     "Page %lu: Btree metadata page illegally has both recnums and compression",
176                     (u_long)pgno));
177                 isbad = 1;
178         }
179         if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) &&
180             F_ISSET(pip, VRFY_HAS_COMPRESS)) {
181                 EPRINT((env,
182     "Page %lu: Btree metadata page illegally has both unsorted duplicates%s",
183                     (u_long)pgno,
184                     " and compression"));
185                 isbad = 1;
186         }
187 #endif
188
189         if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
190                 EPRINT((env,
191                     "Page %lu: recno metadata page specifies duplicates",
192                     (u_long)pgno));
193                 isbad = 1;
194         }
195
196         if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
197                 F_SET(pip, VRFY_IS_FIXEDLEN);
198         else if (pip->re_len > 0) {
199                 /*
200                  * It's wrong to have an re_len if it's not a fixed-length
201                  * database
202                  */
203                 isbad = 1;
204                 EPRINT((env,
205                     "Page %lu: re_len of %lu in non-fixed-length database",
206                     (u_long)pgno, (u_long)pip->re_len));
207         }
208
209         /*
210          * We do not check that the rest of the page is 0, because it may
211          * not be and may still be correct.
212          */
213
214 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
215                 ret = t_ret;
216         if (LF_ISSET(DB_SALVAGE) &&
217            (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
218                 ret = t_ret;
219         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
220 }
221
222 /*
223  * __ram_vrfy_leaf --
224  *      Verify a recno leaf page.
225  *
226  * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
227  * PUBLIC:     u_int32_t));
228  */
229 int
230 __ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
231         DB *dbp;
232         VRFY_DBINFO *vdp;
233         PAGE *h;
234         db_pgno_t pgno;
235         u_int32_t flags;
236 {
237         BKEYDATA *bk;
238         ENV *env;
239         VRFY_PAGEINFO *pip;
240         db_indx_t i;
241         int ret, t_ret, isbad;
242         u_int32_t re_len_guess, len;
243
244         env = dbp->env;
245         isbad = 0;
246
247         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
248                 return (ret);
249
250         if (TYPE(h) != P_LRECNO) {
251                 ret = __db_unknown_path(env, "__ram_vrfy_leaf");
252                 goto err;
253         }
254
255         /*
256          * Verify (and, if relevant, save off) page fields common to
257          * all PAGEs.
258          */
259         if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
260                 if (ret == DB_VERIFY_BAD)
261                         isbad = 1;
262                 else
263                         goto err;
264         }
265
266         /*
267          * Verify inp[].  Return immediately if it returns DB_VERIFY_BAD;
268          * further checks are dangerous.
269          */
270         if ((ret = __bam_vrfy_inp(dbp,
271             vdp, h, pgno, &pip->entries, flags)) != 0)
272                 goto err;
273
274         if (F_ISSET(pip, VRFY_HAS_DUPS)) {
275                 EPRINT((env,
276                     "Page %lu: Recno database has dups", (u_long)pgno));
277                 ret = DB_VERIFY_BAD;
278                 goto err;
279         }
280
281         /*
282          * Walk through inp and see if the lengths of all the records are the
283          * same--if so, this may be a fixed-length database, and we want to
284          * save off this value.  We know inp to be safe if we've gotten this
285          * far.
286          */
287         re_len_guess = 0;
288         for (i = 0; i < NUM_ENT(h); i++) {
289                 bk = GET_BKEYDATA(dbp, h, i);
290                 /* KEYEMPTY.  Go on. */
291                 if (B_DISSET(bk->type))
292                         continue;
293                 if (bk->type == B_OVERFLOW)
294                         len = ((BOVERFLOW *)bk)->tlen;
295                 else if (bk->type == B_KEYDATA)
296                         len = bk->len;
297                 else {
298                         isbad = 1;
299                         EPRINT((env,
300                             "Page %lu: nonsensical type for item %lu",
301                             (u_long)pgno, (u_long)i));
302                         continue;
303                 }
304                 if (re_len_guess == 0)
305                         re_len_guess = len;
306
307                 /*
308                  * Is this item's len the same as the last one's?  If not,
309                  * reset to 0 and break--we don't have a single re_len.
310                  * Otherwise, go on to the next item.
311                  */
312                 if (re_len_guess != len) {
313                         re_len_guess = 0;
314                         break;
315                 }
316         }
317         pip->re_len = re_len_guess;
318
319         /* Save off record count. */
320         pip->rec_cnt = NUM_ENT(h);
321
322 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
323                 ret = t_ret;
324         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
325 }
326
327 /*
328  * __bam_vrfy --
329  *      Verify a btree leaf or internal page.
330  *
331  * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
332  * PUBLIC:     u_int32_t));
333  */
334 int
335 __bam_vrfy(dbp, vdp, h, pgno, flags)
336         DB *dbp;
337         VRFY_DBINFO *vdp;
338         PAGE *h;
339         db_pgno_t pgno;
340         u_int32_t flags;
341 {
342         ENV *env;
343         VRFY_PAGEINFO *pip;
344         int ret, t_ret, isbad;
345
346         env = dbp->env;
347         isbad = 0;
348
349         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
350                 return (ret);
351
352         switch (TYPE(h)) {
353         case P_IBTREE:
354         case P_IRECNO:
355         case P_LBTREE:
356         case P_LDUP:
357                 break;
358         default:
359                 ret = __db_unknown_path(env, "__bam_vrfy");
360                 goto err;
361         }
362
363         /*
364          * Verify (and, if relevant, save off) page fields common to
365          * all PAGEs.
366          */
367         if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
368                 if (ret == DB_VERIFY_BAD)
369                         isbad = 1;
370                 else
371                         goto err;
372         }
373
374         /*
375          * The record count is, on internal pages, stored in an overloaded
376          * next_pgno field.  Save it off;  we'll verify it when we check
377          * overall database structure.  We could overload the field
378          * in VRFY_PAGEINFO, too, but this seems gross, and space
379          * is not at such a premium.
380          */
381         pip->rec_cnt = RE_NREC(h);
382
383         /*
384          * Verify inp[].
385          */
386         if (TYPE(h) == P_IRECNO) {
387                 if ((ret = __ram_vrfy_inp(dbp,
388                     vdp, h, pgno, &pip->entries, flags)) != 0)
389                         goto err;
390         } else if ((ret = __bam_vrfy_inp(dbp,
391             vdp, h, pgno, &pip->entries, flags)) != 0) {
392                 if (ret == DB_VERIFY_BAD)
393                         isbad = 1;
394                 else
395                         goto err;
396                 EPRINT((env,
397                     "Page %lu: item order check unsafe: skipping",
398                     (u_long)pgno));
399         } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
400             __bam_vrfy_itemorder(dbp,
401             vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
402                 /*
403                  * We know that the elements of inp are reasonable.
404                  *
405                  * Check that elements fall in the proper order.
406                  */
407                 if (ret == DB_VERIFY_BAD)
408                         isbad = 1;
409                 else
410                         goto err;
411         }
412
413 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
414                 ret = t_ret;
415         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
416 }
417
418 /*
419  * __ram_vrfy_inp --
420  *      Verify that all entries in a P_IRECNO inp[] array are reasonable,
421  *      and count them.  Note that P_LRECNO uses __bam_vrfy_inp;
422  *      P_IRECNOs are a special, and simpler, case, since they have
423  *      RINTERNALs rather than BKEYDATA/BINTERNALs.
424  */
425 static int
426 __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
427         DB *dbp;
428         VRFY_DBINFO *vdp;
429         PAGE *h;
430         db_pgno_t pgno;
431         db_indx_t *nentriesp;
432         u_int32_t flags;
433 {
434         ENV *env;
435         RINTERNAL *ri;
436         VRFY_CHILDINFO child;
437         VRFY_PAGEINFO *pip;
438         int ret, t_ret, isbad;
439         u_int32_t himark, i, offset, nentries;
440         db_indx_t *inp;
441         u_int8_t *pagelayout, *p;
442
443         env = dbp->env;
444         isbad = 0;
445         memset(&child, 0, sizeof(VRFY_CHILDINFO));
446         nentries = 0;
447         pagelayout = NULL;
448
449         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
450                 return (ret);
451
452         if (TYPE(h) != P_IRECNO) {
453                 ret = __db_unknown_path(env, "__ram_vrfy_inp");
454                 goto err;
455         }
456
457         himark = dbp->pgsize;
458         if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
459                 goto err;
460         memset(pagelayout, 0, dbp->pgsize);
461         inp = P_INP(dbp, h);
462         for (i = 0; i < NUM_ENT(h); i++) {
463                 if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
464                         EPRINT((env,
465                             "Page %lu: entries listing %lu overlaps data",
466                             (u_long)pgno, (u_long)i));
467                         ret = DB_VERIFY_BAD;
468                         goto err;
469                 }
470                 offset = inp[i];
471                 /*
472                  * Check that the item offset is reasonable:  it points
473                  * somewhere after the inp array and before the end of the
474                  * page.
475                  */
476                 if (offset <= (u_int32_t)((u_int8_t *)inp + i -
477                     (u_int8_t *)h) ||
478                     offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
479                         isbad = 1;
480                         EPRINT((env,
481                             "Page %lu: bad offset %lu at index %lu",
482                             (u_long)pgno, (u_long)offset, (u_long)i));
483                         continue;
484                 }
485
486                 /* Update the high-water mark (what HOFFSET should be) */
487                 if (offset < himark)
488                         himark = offset;
489
490                 nentries++;
491
492                 /* Make sure this RINTERNAL is not multiply referenced. */
493                 ri = GET_RINTERNAL(dbp, h, i);
494                 if (pagelayout[offset] == 0) {
495                         pagelayout[offset] = 1;
496                         child.pgno = ri->pgno;
497                         child.type = V_RECNO;
498                         child.nrecs = ri->nrecs;
499                         if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
500                                 goto err;
501                 } else {
502                         EPRINT((env,
503                 "Page %lu: RINTERNAL structure at offset %lu referenced twice",
504                             (u_long)pgno, (u_long)offset));
505                         isbad = 1;
506                 }
507         }
508
509         for (p = pagelayout + himark;
510             p < pagelayout + dbp->pgsize;
511             p += RINTERNAL_SIZE)
512                 if (*p != 1) {
513                         EPRINT((env,
514                             "Page %lu: gap between items at offset %lu",
515                             (u_long)pgno, (u_long)(p - pagelayout)));
516                         isbad = 1;
517                 }
518
519         if ((db_indx_t)himark != HOFFSET(h)) {
520                 EPRINT((env,
521                     "Page %lu: bad HOFFSET %lu, appears to be %lu",
522                     (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark));
523                 isbad = 1;
524         }
525
526         *nentriesp = nentries;
527
528 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
529                 ret = t_ret;
530         if (pagelayout != NULL)
531                 __os_free(env, pagelayout);
532         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
533 }
534
535 typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
536
537 /*
538  * __bam_vrfy_inp --
539  *      Verify that all entries in inp[] array are reasonable;
540  *      count them.
541  */
542 static int
543 __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
544         DB *dbp;
545         VRFY_DBINFO *vdp;
546         PAGE *h;
547         db_pgno_t pgno;
548         db_indx_t *nentriesp;
549         u_int32_t flags;
550 {
551         BKEYDATA *bk;
552         BOVERFLOW *bo;
553         ENV *env;
554         VRFY_CHILDINFO child;
555         VRFY_ITEM *pagelayout;
556         VRFY_PAGEINFO *pip;
557         u_int32_t himark, offset;               /*
558                                                  * These would be db_indx_ts
559                                                  * but for alignment.
560                                                  */
561         u_int32_t i, endoff, nentries;
562         int isbad, initem, isdupitem, ret, t_ret;
563
564         env = dbp->env;
565         isbad = isdupitem = 0;
566         nentries = 0;
567         memset(&child, 0, sizeof(VRFY_CHILDINFO));
568         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
569                 return (ret);
570
571         switch (TYPE(h)) {
572         case P_IBTREE:
573         case P_LBTREE:
574         case P_LDUP:
575         case P_LRECNO:
576                 break;
577         default:
578                 /*
579                  * In the salvager, we might call this from a page which
580                  * we merely suspect is a btree page.  Otherwise, it
581                  * shouldn't get called--if it is, that's a verifier bug.
582                  */
583                 if (LF_ISSET(DB_SALVAGE))
584                         break;
585                 ret = __db_unknown_path(env, "__bam_vrfy_inp");
586                 goto err;
587         }
588
589         /*
590          * Loop through inp[], the array of items, until we either
591          * run out of entries or collide with the data.  Keep track
592          * of h_offset in himark.
593          *
594          * For each element in inp[i], make sure it references a region
595          * that starts after the end of the inp array (as defined by
596          * NUM_ENT(h)), ends before the beginning of the page, doesn't
597          * overlap any other regions, and doesn't have a gap between
598          * it and the region immediately after it.
599          */
600         himark = dbp->pgsize;
601         if ((ret = __os_calloc(
602             env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
603                 goto err;
604         for (i = 0; i < NUM_ENT(h); i++) {
605                 switch (ret = __db_vrfy_inpitem(dbp,
606                     h, pgno, i, 1, flags, &himark, &offset)) {
607                 case 0:
608                         break;
609                 case DB_VERIFY_BAD:
610                         isbad = 1;
611                         continue;
612                 case DB_VERIFY_FATAL:
613                         isbad = 1;
614                         goto err;
615                 default:
616                         DB_ASSERT(env, ret != 0);
617                         break;
618                 }
619
620                 /*
621                  * We now have a plausible beginning for the item, and we know
622                  * its length is safe.
623                  *
624                  * Mark the beginning and end in pagelayout so we can make sure
625                  * items have no overlaps or gaps.
626                  */
627                 bk = GET_BKEYDATA(dbp, h, i);
628                 if (pagelayout[offset] == VRFY_ITEM_NOTSET)
629                         pagelayout[offset] = VRFY_ITEM_BEGIN;
630                 else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
631                         /*
632                          * Having two inp entries that point at the same patch
633                          * of page is legal if and only if the page is
634                          * a btree leaf and they're onpage duplicate keys--
635                          * that is, if (i % P_INDX) == 0.
636                          */
637                         if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
638                                 /* Flag for later. */
639                                 F_SET(pip, VRFY_HAS_DUPS);
640
641                                 /* Bump up nentries so we don't undercount. */
642                                 nentries++;
643
644                                 /*
645                                  * We'll check to make sure the end is
646                                  * equal, too.
647                                  */
648                                 isdupitem = 1;
649                         } else {
650                                 isbad = 1;
651                                 EPRINT((env, "Page %lu: duplicated item %lu",
652                                     (u_long)pgno, (u_long)i));
653                         }
654                 }
655
656                 /*
657                  * Mark the end.  Its location varies with the page type
658                  * and the item type.
659                  *
660                  * If the end already has a sign other than 0, do nothing--
661                  * it's an overlap that we'll catch later.
662                  */
663                 switch (B_TYPE(bk->type)) {
664                 case B_KEYDATA:
665                         if (TYPE(h) == P_IBTREE)
666                                 /* It's a BINTERNAL. */
667                                 endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
668                         else
669                                 endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
670                         break;
671                 case B_DUPLICATE:
672                         /*
673                          * Flag that we have dups; we'll check whether
674                          * that's okay during the structure check.
675                          */
676                         F_SET(pip, VRFY_HAS_DUPS);
677                         /* FALLTHROUGH */
678                 case B_OVERFLOW:
679                         /*
680                          * Overflow entries on internal pages are stored
681                          * as the _data_ of a BINTERNAL;  overflow entries
682                          * on leaf pages are stored as the entire entry.
683                          */
684                         endoff = offset +
685                             ((TYPE(h) == P_IBTREE) ?
686                             BINTERNAL_SIZE(BOVERFLOW_SIZE) :
687                             BOVERFLOW_SIZE) - 1;
688                         break;
689                 default:
690                         /*
691                          * We'll complain later;  for now, just mark
692                          * a minimum.
693                          */
694                         endoff = offset + BKEYDATA_SIZE(0) - 1;
695                         break;
696                 }
697
698                 /*
699                  * If this is an onpage duplicate key we've seen before,
700                  * the end had better coincide too.
701                  */
702                 if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
703                         EPRINT((env, "Page %lu: duplicated item %lu",
704                             (u_long)pgno, (u_long)i));
705                         isbad = 1;
706                 } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
707                         pagelayout[endoff] = VRFY_ITEM_END;
708                 isdupitem = 0;
709
710                 /*
711                  * There should be no deleted items in a quiescent tree,
712                  * except in recno.
713                  */
714                 if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
715                         isbad = 1;
716                         EPRINT((env, "Page %lu: item %lu marked deleted",
717                             (u_long)pgno, (u_long)i));
718                 }
719
720                 /*
721                  * Check the type and such of bk--make sure it's reasonable
722                  * for the pagetype.
723                  */
724                 switch (B_TYPE(bk->type)) {
725                 case B_KEYDATA:
726                         /*
727                          * This is a normal, non-overflow BKEYDATA or BINTERNAL.
728                          * The only thing to check is the len, and that's
729                          * already been done.
730                          */
731                         break;
732                 case B_DUPLICATE:
733                         if (TYPE(h) == P_IBTREE) {
734                                 isbad = 1;
735                                 EPRINT((env,
736     "Page %lu: duplicate page referenced by internal btree page at item %lu",
737                                     (u_long)pgno, (u_long)i));
738                                 break;
739                         } else if (TYPE(h) == P_LRECNO) {
740                                 isbad = 1;
741                                 EPRINT((env,
742         "Page %lu: duplicate page referenced by recno page at item %lu",
743                                     (u_long)pgno, (u_long)i));
744                                 break;
745                         }
746                         /* FALLTHROUGH */
747                 case B_OVERFLOW:
748                         bo = (TYPE(h) == P_IBTREE) ?
749                             (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
750                             (BOVERFLOW *)bk;
751
752                         if (B_TYPE(bk->type) == B_OVERFLOW)
753                                 /* Make sure tlen is reasonable. */
754                                 if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
755                                         isbad = 1;
756                                         EPRINT((env,
757                                 "Page %lu: impossible tlen %lu, item %lu",
758                                             (u_long)pgno,
759                                             (u_long)bo->tlen, (u_long)i));
760                                         /* Don't save as a child. */
761                                         break;
762                                 }
763
764                         if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
765                             bo->pgno == PGNO_INVALID) {
766                                 isbad = 1;
767                                 EPRINT((env,
768                             "Page %lu: offpage item %lu has bad pgno %lu",
769                                     (u_long)pgno, (u_long)i, (u_long)bo->pgno));
770                                 /* Don't save as a child. */
771                                 break;
772                         }
773
774                         child.pgno = bo->pgno;
775                         child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
776                             V_OVERFLOW : V_DUPLICATE);
777                         child.tlen = bo->tlen;
778                         if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
779                                 goto err;
780                         break;
781                 default:
782                         isbad = 1;
783                         EPRINT((env, "Page %lu: item %lu of invalid type %lu",
784                             (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type)));
785                         break;
786                 }
787         }
788
789         /*
790          * Now, loop through and make sure the items are contiguous and
791          * non-overlapping.
792          */
793         initem = 0;
794         for (i = himark; i < dbp->pgsize; i++)
795                 if (initem == 0)
796                         switch (pagelayout[i]) {
797                         case VRFY_ITEM_NOTSET:
798                                 /* May be just for alignment. */
799                                 if (i != DB_ALIGN(i, sizeof(u_int32_t)))
800                                         continue;
801
802                                 isbad = 1;
803                                 EPRINT((env,
804                                     "Page %lu: gap between items at offset %lu",
805                                     (u_long)pgno, (u_long)i));
806                                 /* Find the end of the gap */
807                                 for (; pagelayout[i + 1] == VRFY_ITEM_NOTSET &&
808                                     (size_t)(i + 1) < dbp->pgsize; i++)
809                                         ;
810                                 break;
811                         case VRFY_ITEM_BEGIN:
812                                 /* We've found an item. Check its alignment. */
813                                 if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
814                                         isbad = 1;
815                                         EPRINT((env,
816                                             "Page %lu: offset %lu unaligned",
817                                             (u_long)pgno, (u_long)i));
818                                 }
819                                 initem = 1;
820                                 nentries++;
821                                 break;
822                         case VRFY_ITEM_END:
823                                 /*
824                                  * We've hit the end of an item even though
825                                  * we don't think we're in one;  must
826                                  * be an overlap.
827                                  */
828                                 isbad = 1;
829                                 EPRINT((env,
830                                     "Page %lu: overlapping items at offset %lu",
831                                     (u_long)pgno, (u_long)i));
832                                 break;
833                         }
834                 else
835                         switch (pagelayout[i]) {
836                         case VRFY_ITEM_NOTSET:
837                                 /* In the middle of an item somewhere. Okay. */
838                                 break;
839                         case VRFY_ITEM_END:
840                                 /* End of an item; switch to out-of-item mode.*/
841                                 initem = 0;
842                                 break;
843                         case VRFY_ITEM_BEGIN:
844                                 /*
845                                  * Hit a second item beginning without an
846                                  * end.  Overlap.
847                                  */
848                                 isbad = 1;
849                                 EPRINT((env,
850                                     "Page %lu: overlapping items at offset %lu",
851                                     (u_long)pgno, (u_long)i));
852                                 break;
853                         }
854
855         __os_free(env, pagelayout);
856
857         /* Verify HOFFSET. */
858         if ((db_indx_t)himark != HOFFSET(h)) {
859                 EPRINT((env, "Page %lu: bad HOFFSET %lu, appears to be %lu",
860                     (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark));
861                 isbad = 1;
862         }
863
864 err:    if (nentriesp != NULL)
865                 *nentriesp = nentries;
866
867         if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
868                 ret = t_ret;
869
870         return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
871 }
872
873 /*
874  * __bam_vrfy_itemorder --
875  *      Make sure the items on a page sort correctly.
876  *
877  *      Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
878  *      reasonable;  be sure that __bam_vrfy_inp has been called first.
879  *
880  *      If ovflok is set, it also assumes that overflow page chains
881  *      hanging off the current page have been sanity-checked, and so we
882  *      can use __bam_cmp to verify their ordering.  If it is not set,
883  *      and we run into an overflow page, carp and return DB_VERIFY_BAD;
884  *      we shouldn't be called if any exist.
885  *
886  * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
887  * PUBLIC:      PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
888  */
889 int
890 __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
891         DB *dbp;
892         VRFY_DBINFO *vdp;
893         DB_THREAD_INFO *ip;
894         PAGE *h;
895         db_pgno_t pgno;
896         u_int32_t nentries;
897         int ovflok, hasdups;
898         u_int32_t flags;
899 {
900         BINTERNAL *bi;
901         BKEYDATA *bk;
902         BOVERFLOW *bo;
903         BTREE *bt;
904         DBC *dbc;
905         DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
906         ENV *env;
907         VRFY_PAGEINFO *pip;
908         db_indx_t i, *inp;
909         int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
910         int (*dupfunc) __P((DB *, const DBT *, const DBT *));
911         int (*func) __P((DB *, const DBT *, const DBT *));
912         void *buf1, *buf2, *tmpbuf;
913
914         /*
915          * We need to work in the ORDERCHKONLY environment where we might
916          * not have a pip, but we also may need to work in contexts where
917          * NUM_ENT isn't safe.
918          */
919         if (vdp != NULL) {
920                 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
921                         return (ret);
922                 nentries = pip->entries;
923         } else
924                 pip = NULL;
925
926         env = dbp->env;
927         ret = isbad = 0;
928         bo = NULL;                      /* Shut up compiler. */
929
930         memset(&dbta, 0, sizeof(DBT));
931         F_SET(&dbta, DB_DBT_REALLOC);
932
933         memset(&dbtb, 0, sizeof(DBT));
934         F_SET(&dbtb, DB_DBT_REALLOC);
935
936         buf1 = buf2 = NULL;
937
938         DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
939
940         dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
941         if (TYPE(h) == P_LDUP)
942                 func = dupfunc;
943         else {
944                 func = __bam_defcmp;
945                 if (dbp->bt_internal != NULL) {
946                         bt = (BTREE *)dbp->bt_internal;
947                         if (bt->bt_compare != NULL)
948                                 func = bt->bt_compare;
949                 }
950         }
951
952         /*
953          * We alternate our use of dbta and dbtb so that we can walk
954          * through the page key-by-key without copying a dbt twice.
955          * p1 is always the dbt for index i - 1, and p2 for index i.
956          * Reset the data pointers in case we are retrying.
957          */
958 retry:  p1 = &dbta;
959         p1->data = NULL;
960         p2 = &dbtb;
961         p2->data = NULL;
962
963         /*
964          * Loop through the entries.  nentries ought to contain the
965          * actual count, and so is a safe way to terminate the loop;  whether
966          * we inc. by one or two depends on whether we're a leaf page--
967          * on a leaf page, we care only about keys.  On internal pages
968          * and LDUP pages, we want to check the order of all entries.
969          *
970          * Note that on IBTREE pages or the index page of a partitioned
971          * database, we start with item 1, since item 0 doesn't get looked
972          * at by __bam_cmp.
973          */
974         inp = P_INP(dbp, h);
975         adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
976         for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0;
977             i < nentries; i += adj) {
978                 /*
979                  * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
980                  */
981                 tmp = p1;
982                 p1 = p2;
983                 p2 = tmp;
984                 tmpbuf = buf1;
985                 buf1 = buf2;
986                 buf2 = tmpbuf;
987
988                 /*
989                  * Get key i into p2.
990                  */
991                 switch (TYPE(h)) {
992                 case P_IBTREE:
993                         bi = GET_BINTERNAL(dbp, h, i);
994                         if (B_TYPE(bi->type) == B_OVERFLOW) {
995                                 bo = (BOVERFLOW *)(bi->data);
996                                 goto overflow;
997                         } else {
998                                 p2->data = bi->data;
999                                 p2->size = bi->len;
1000                         }
1001
1002                         /*
1003                          * The leftmost key on an internal page must be
1004                          * len 0, since it's just a placeholder and
1005                          * automatically sorts less than all keys.
1006                          *
1007                          * XXX
1008                          * This criterion does not currently hold!
1009                          * See todo list item #1686.  Meanwhile, it's harmless
1010                          * to just not check for it.
1011                          */
1012 #if 0
1013                         if (i == 0 && bi->len != 0) {
1014                                 isbad = 1;
1015                                 EPRINT((env,
1016                 "Page %lu: lowest key on internal page of nonzero length",
1017                                     (u_long)pgno));
1018                         }
1019 #endif
1020                         break;
1021                 case P_LBTREE:
1022                 case P_LDUP:
1023                         bk = GET_BKEYDATA(dbp, h, i);
1024                         if (B_TYPE(bk->type) == B_OVERFLOW) {
1025                                 bo = (BOVERFLOW *)bk;
1026                                 goto overflow;
1027                         } else {
1028                                 p2->data = bk->data;
1029                                 p2->size = bk->len;
1030                         }
1031                         break;
1032                 default:
1033                         /*
1034                          * This means our caller screwed up and sent us
1035                          * an inappropriate page.
1036                          */
1037                         ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
1038                         goto err;
1039                 }
1040
1041                 if (0) {
1042                         /*
1043                          * If ovflok != 1, we can't safely go chasing
1044                          * overflow pages with the normal routines now;
1045                          * they might be unsafe or nonexistent.  Mark this
1046                          * page as incomplete and return.
1047                          *
1048                          * Note that we don't need to worry about freeing
1049                          * buffers, since they can't have been allocated
1050                          * if overflow items are unsafe.
1051                          */
1052 overflow:               if (!ovflok) {
1053                                 F_SET(pip, VRFY_INCOMPLETE);
1054                                 goto err;
1055                         }
1056
1057                         /*
1058                          * Overflow items are safe to chase.  Do so.
1059                          * Fetch the overflow item into p2->data,
1060                          * NULLing it or reallocing it as appropriate.
1061                          *
1062                          * (We set p2->data to buf2 before the call
1063                          * so we're sure to realloc if we can and if p2
1064                          * was just pointing at a non-overflow item.)
1065                          */
1066                         p2->data = buf2;
1067                         if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
1068                             PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
1069                                 goto err;
1070                         if ((ret = __db_goff(dbc,
1071                             p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
1072                                 isbad = 1;
1073                                 EPRINT((env,
1074                             "Page %lu: error %lu in fetching overflow item %lu",
1075                                     (u_long)pgno, (u_long)ret, (u_long)i));
1076                         }
1077                         /* In case it got realloc'ed and thus changed. */
1078                         buf2 = p2->data;
1079                 }
1080
1081                 /* Compare with the last key. */
1082                 if (p1->data != NULL && p2->data != NULL) {
1083                         cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
1084
1085                         /* comparison succeeded */
1086                         if (cmp > 0) {
1087                                 /*
1088                                  * If we are looking at an internal page, we
1089                                  * don't know whether it is part of the main
1090                                  * database or in an off-page-duplicate tree.
1091                                  * If the main comparator fails, retry with
1092                                  * the duplicate comparator.
1093                                  */
1094                                 if (TYPE(h) == P_IBTREE && func != dupfunc) {
1095                                         func = dupfunc;
1096                                         goto retry;
1097                                 }
1098
1099                                 isbad = 1;
1100                                 EPRINT((env,
1101                                     "Page %lu: out-of-order key at entry %lu",
1102                                     (u_long)pgno, (u_long)i));
1103                                 /* proceed */
1104                         } else if (cmp == 0) {
1105                                 if (inp[i] != inp[i - adj]) {
1106                                         /* See above. */
1107                                         if (TYPE(h) == P_IBTREE &&
1108                                             func != dupfunc) {
1109                                                 func = dupfunc;
1110                                                 goto retry;
1111                                         }
1112                                         isbad = 1;
1113                                         EPRINT((env,
1114                                      "Page %lu: non-dup dup key at entry %lu",
1115                                            (u_long)pgno, (u_long)i));
1116                                 }
1117                                 /*
1118                                  * If they compared equally, this
1119                                  * had better be a (sub)database with dups.
1120                                  * Mark it so we can check during the
1121                                  * structure check.
1122                                  */
1123                                 if (pip != NULL)
1124                                         F_SET(pip, VRFY_HAS_DUPS);
1125                                 else if (hasdups == 0) {
1126                                         /* See above. */
1127                                         if (TYPE(h) == P_IBTREE &&
1128                                             func != dupfunc) {
1129                                                 func = dupfunc;
1130                                                 goto retry;
1131                                         }
1132                                         isbad = 1;
1133                                         EPRINT((env,
1134         "Page %lu: database with no duplicates has duplicated keys",
1135                                             (u_long)pgno));
1136                                 }
1137
1138                                 /*
1139                                  * If we're a btree leaf, check to see
1140                                  * if the data items of these on-page dups are
1141                                  * in sorted order.  If not, flag this, so
1142                                  * that we can make sure during the
1143                                  * structure checks that the DUPSORT flag
1144                                  * is unset.
1145                                  *
1146                                  * At this point i points to a duplicate key.
1147                                  * Compare the datum before it (same key)
1148                                  * to the datum after it, i.e. i-1 to i+1.
1149                                  */
1150                                 if (TYPE(h) == P_LBTREE) {
1151                                         /*
1152                                          * Unsafe;  continue and we'll pick
1153                                          * up the bogus nentries later.
1154                                          */
1155                                         if (i + 1 >= (db_indx_t)nentries)
1156                                                 continue;
1157
1158                                         /*
1159                                          * We don't bother with clever memory
1160                                          * management with on-page dups,
1161                                          * as it's only really a big win
1162                                          * in the overflow case, and overflow
1163                                          * dups are probably (?) rare.
1164                                          */
1165                                         if (((ret = __bam_safe_getdata(dbp,
1166                                             ip, h, i - 1, ovflok,
1167                                             &dup_1, &freedup_1)) != 0) ||
1168                                             ((ret = __bam_safe_getdata(dbp,
1169                                             ip, h, i + 1, ovflok,
1170                                             &dup_2, &freedup_2)) != 0))
1171                                                 goto err;
1172
1173                                         /*
1174                                          * If either of the data are NULL,
1175                                          * it's because they're overflows and
1176                                          * it's not safe to chase them now.
1177                                          * Mark an incomplete and return.
1178                                          */
1179                                         if (dup_1.data == NULL ||
1180                                             dup_2.data == NULL) {
1181                                                 DB_ASSERT(env, !ovflok);
1182                                                 F_SET(pip, VRFY_INCOMPLETE);
1183                                                 goto err;
1184                                         }
1185
1186                                         /*
1187                                          * If the dups are out of order,
1188                                          * flag this.  It's not an error
1189                                          * until we do the structure check
1190                                          * and see whether DUPSORT is set.
1191                                          */
1192                                         if (dupfunc(dbp, &dup_1, &dup_2) > 0)
1193                                                 F_SET(pip, VRFY_DUPS_UNSORTED);
1194
1195                                         if (freedup_1)
1196                                                 __os_ufree(env, dup_1.data);
1197                                         if (freedup_2)
1198                                                 __os_ufree(env, dup_2.data);
1199                                 }
1200                         }
1201                 }
1202         }
1203
1204 err:    if (pip != NULL && ((t_ret =
1205             __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
1206                 ret = t_ret;
1207
1208         if (buf1 != NULL)
1209                 __os_ufree(env, buf1);
1210         if (buf2 != NULL)
1211                 __os_ufree(env, buf2);
1212
1213         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
1214 }
1215
1216 /*
1217  * __bam_vrfy_structure --
1218  *      Verify the tree structure of a btree database (including the master
1219  *      database containing subdbs).
1220  *
1221  * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
1222  * PUBLIC:     void *, void *, u_int32_t));
1223  */
1224 int
1225 __bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags)
1226         DB *dbp;
1227         VRFY_DBINFO *vdp;
1228         db_pgno_t meta_pgno;
1229         void *lp, *rp;
1230         u_int32_t flags;
1231 {
1232         DB *pgset;
1233         ENV *env;
1234         VRFY_PAGEINFO *mip, *rip;
1235         db_pgno_t root, p;
1236         int t_ret, ret;
1237         u_int32_t nrecs, level, relen, stflags;
1238
1239         env = dbp->env;
1240         mip = rip = 0;
1241         pgset = vdp->pgset;
1242
1243         if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
1244                 return (ret);
1245
1246         if ((ret = __db_vrfy_pgset_get(pgset,
1247             vdp->thread_info, meta_pgno, (int *)&p)) != 0)
1248                 goto err;
1249         if (p != 0) {
1250                 EPRINT((env,
1251                     "Page %lu: btree metadata page observed twice",
1252                     (u_long)meta_pgno));
1253                 ret = DB_VERIFY_BAD;
1254                 goto err;
1255         }
1256         if ((ret =
1257             __db_vrfy_pgset_inc(pgset, vdp->thread_info, meta_pgno)) != 0)
1258                 goto err;
1259
1260         root = mip->root;
1261
1262         if (root == 0) {
1263                 EPRINT((env,
1264                     "Page %lu: btree metadata page has no root",
1265                     (u_long)meta_pgno));
1266                 ret = DB_VERIFY_BAD;
1267                 goto err;
1268         }
1269
1270         if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
1271                 goto err;
1272
1273         switch (rip->type) {
1274         case P_IBTREE:
1275         case P_LBTREE:
1276                 stflags = flags | DB_ST_TOPLEVEL;
1277                 if (F_ISSET(mip, VRFY_HAS_DUPS))
1278                         stflags |= DB_ST_DUPOK;
1279                 if (F_ISSET(mip, VRFY_HAS_DUPSORT))
1280                         stflags |= DB_ST_DUPSORT;
1281                 if (F_ISSET(mip, VRFY_HAS_RECNUMS))
1282                         stflags |= DB_ST_RECNUM;
1283                 ret = __bam_vrfy_subtree(dbp,
1284                     vdp, root, lp, rp, stflags, NULL, NULL, NULL);
1285                 break;
1286         case P_IRECNO:
1287         case P_LRECNO:
1288                 stflags =
1289                     flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
1290                 if (mip->re_len > 0)
1291                         stflags |= DB_ST_RELEN;
1292                 if ((ret = __bam_vrfy_subtree(dbp, vdp,
1293                     root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
1294                         goto err;
1295                 /*
1296                  * Even if mip->re_len > 0, re_len may come back zero if the
1297                  * tree is empty.  It should be okay to just skip the check in
1298                  * this case, as if there are any non-deleted keys at all,
1299                  * that should never happen.
1300                  */
1301                 if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
1302                         EPRINT((env,
1303                             "Page %lu: recno database has bad re_len %lu",
1304                             (u_long)meta_pgno, (u_long)relen));
1305                         ret = DB_VERIFY_BAD;
1306                         goto err;
1307                 }
1308                 ret = 0;
1309                 break;
1310         case P_LDUP:
1311                 EPRINT((env,
1312                     "Page %lu: duplicate tree referenced from metadata page",
1313                     (u_long)meta_pgno));
1314                 ret = DB_VERIFY_BAD;
1315                 break;
1316         default:
1317                 EPRINT((env,
1318             "Page %lu: btree root of incorrect type %lu on metadata page",
1319                     (u_long)meta_pgno, (u_long)rip->type));
1320                 ret = DB_VERIFY_BAD;
1321                 break;
1322         }
1323
1324 err:    if (mip != NULL && ((t_ret =
1325             __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
1326                 ret = t_ret;
1327         if (rip != NULL && ((t_ret =
1328             __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
1329                 ret = t_ret;
1330         return (ret);
1331 }
1332
1333 /*
1334  * __bam_vrfy_subtree--
1335  *      Verify a subtree (or entire) btree with specified root.
1336  *
1337  *      Note that this is public because it must be called to verify
1338  *      offpage dup trees, including from hash.
1339  *
1340  * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
1341  * PUBLIC:     void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
1342  */
1343 int
1344 __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
1345         DB *dbp;
1346         VRFY_DBINFO *vdp;
1347         db_pgno_t pgno;
1348         void *l, *r;
1349         u_int32_t flags, *levelp, *nrecsp, *relenp;
1350 {
1351         BINTERNAL *li, *ri;
1352         DB *pgset;
1353         DBC *cc;
1354         DB_MPOOLFILE *mpf;
1355         ENV *env;
1356         PAGE *h;
1357         VRFY_CHILDINFO *child;
1358         VRFY_PAGEINFO *pip;
1359         db_indx_t i;
1360         db_pgno_t next_pgno, prev_pgno;
1361         db_recno_t child_nrecs, nrecs;
1362         u_int32_t child_level, child_relen, j, level, relen, stflags;
1363         u_int8_t leaf_type;
1364         int (*func) __P((DB *, const DBT *, const DBT *));
1365         int isbad, p, ret, t_ret, toplevel;
1366
1367         if (levelp != NULL)     /* Don't leave uninitialized on error. */
1368                 *levelp = 0;
1369         if (nrecsp != NULL)
1370                 *nrecsp = 0;
1371
1372         env = dbp->env;
1373         mpf = dbp->mpf;
1374         h = NULL;
1375         next_pgno = prev_pgno = PGNO_INVALID;
1376         nrecs = 0;
1377         relen = 0;
1378         leaf_type = P_INVALID;
1379         isbad = ret = 0;
1380
1381         /* Provide feedback on our progress to the application. */
1382         if (!LF_ISSET(DB_SALVAGE))
1383                 __db_vrfy_struct_feedback(dbp, vdp);
1384
1385         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
1386                 return (ret);
1387
1388         cc = NULL;
1389         level = pip->bt_level;
1390
1391         toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
1392         LF_CLR(DB_ST_TOPLEVEL);
1393
1394         /*
1395          * If this is the root, initialize the vdp's prev- and next-pgno
1396          * accounting.
1397          *
1398          * For each leaf page we hit, we'll want to make sure that
1399          * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
1400          * our page number.  Then, we'll set vdp->next_pgno to pip->next_pgno
1401          * and vdp->prev_pgno to our page number, and the next leaf page in
1402          * line should be able to do the same verification.
1403          */
1404         if (toplevel) {
1405                 /*
1406                  * Cache the values stored in the vdp so that if we're an
1407                  * auxiliary tree such as an off-page duplicate set, our
1408                  * caller's leaf page chain doesn't get lost.
1409                  */
1410                 prev_pgno = vdp->prev_pgno;
1411                 next_pgno = vdp->next_pgno;
1412                 leaf_type = vdp->leaf_type;
1413                 vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
1414                 vdp->leaf_type = P_INVALID;
1415         }
1416
1417         /*
1418          * We are recursively descending a btree, starting from the root
1419          * and working our way out to the leaves.
1420          *
1421          * There are four cases we need to deal with:
1422          *      1. pgno is a recno leaf page.  Any children are overflows.
1423          *      2. pgno is a duplicate leaf page.  Any children
1424          *         are overflow pages;  traverse them, and then return
1425          *         level and nrecs.
1426          *      3. pgno is an ordinary leaf page.  Check whether dups are
1427          *         allowed, and if so, traverse any off-page dups or
1428          *         overflows.  Then return nrecs and level.
1429          *      4. pgno is a recno internal page.  Recursively check any
1430          *         child pages, making sure their levels are one lower
1431          *         and their nrecs sum to ours.
1432          *      5. pgno is a btree internal page.  Same as #4, plus we
1433          *         must verify that for each pair of BINTERNAL entries
1434          *         N and N+1, the leftmost item on N's child sorts
1435          *         greater than N, and the rightmost item on N's child
1436          *         sorts less than N+1.
1437          *
1438          * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
1439          * we need to verify the internal sort order is correct if,
1440          * due to overflow items, we were not able to do so earlier.
1441          */
1442         switch (pip->type) {
1443         case P_LRECNO:
1444         case P_LDUP:
1445         case P_LBTREE:
1446                 /*
1447                  * Cases 1, 2 and 3.
1448                  *
1449                  * We're some sort of leaf page;  verify
1450                  * that our linked list of leaves is consistent.
1451                  */
1452                 if (vdp->leaf_type == P_INVALID) {
1453                         /*
1454                          * First leaf page.  Set the type that all its
1455                          * successors should be, and verify that our prev_pgno
1456                          * is PGNO_INVALID.
1457                          */
1458                         vdp->leaf_type = pip->type;
1459                         if (pip->prev_pgno != PGNO_INVALID)
1460                                 goto bad_prev;
1461                 } else {
1462                         /*
1463                          * Successor leaf page. Check our type, the previous
1464                          * page's next_pgno, and our prev_pgno.
1465                          */
1466                         if (pip->type != vdp->leaf_type) {
1467                                 isbad = 1;
1468                                 EPRINT((env,
1469         "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
1470                                     (u_long)pip->pgno, (u_long)pip->type,
1471                                     (u_long)vdp->leaf_type));
1472                         }
1473
1474                         /*
1475                          * Don't do the prev/next_pgno checks if we've lost
1476                          * leaf pages due to another corruption.
1477                          */
1478                         if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
1479                                 if (pip->pgno != vdp->next_pgno) {
1480                                         isbad = 1;
1481                                         EPRINT((env,
1482         "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
1483                                             (u_long)vdp->prev_pgno,
1484                                             (u_long)vdp->next_pgno,
1485                                             (u_long)pip->pgno));
1486                                 }
1487                                 if (pip->prev_pgno != vdp->prev_pgno) {
1488 bad_prev:                               isbad = 1;
1489                                         EPRINT((env,
1490     "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
1491                                             (u_long)pip->pgno,
1492                                             (u_long)pip->prev_pgno,
1493                                             (u_long)vdp->prev_pgno));
1494                                 }
1495                         }
1496                 }
1497                 vdp->prev_pgno = pip->pgno;
1498                 vdp->next_pgno = pip->next_pgno;
1499                 F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
1500
1501                 /*
1502                  * Overflow pages are common to all three leaf types;
1503                  * traverse the child list, looking for overflows.
1504                  */
1505                 if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1506                         goto err;
1507                 for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1508                     ret = __db_vrfy_ccnext(cc, &child))
1509                         if (child->type == V_OVERFLOW &&
1510                             (ret = __db_vrfy_ovfl_structure(dbp, vdp,
1511                             child->pgno, child->tlen,
1512                             flags | DB_ST_OVFL_LEAF)) != 0) {
1513                                 if (ret == DB_VERIFY_BAD)
1514                                         isbad = 1;
1515                                 else
1516                                         goto done;
1517                         }
1518
1519                 if ((ret = __db_vrfy_ccclose(cc)) != 0)
1520                         goto err;
1521                 cc = NULL;
1522
1523                 /* Case 1 */
1524                 if (pip->type == P_LRECNO) {
1525                         if (!LF_ISSET(DB_ST_IS_RECNO) &&
1526                             !(LF_ISSET(DB_ST_DUPOK) &&
1527                             !LF_ISSET(DB_ST_DUPSORT))) {
1528                                 isbad = 1;
1529                                 EPRINT((env,
1530                                     "Page %lu: recno leaf page non-recno tree",
1531                                     (u_long)pgno));
1532                                 goto done;
1533                         }
1534                         goto leaf;
1535                 } else if (LF_ISSET(DB_ST_IS_RECNO)) {
1536                         /*
1537                          * It's a non-recno leaf.  Had better not be a recno
1538                          * subtree.
1539                          */
1540                         isbad = 1;
1541                         EPRINT((env,
1542                             "Page %lu: non-recno leaf page in recno tree",
1543                             (u_long)pgno));
1544                         goto done;
1545                 }
1546
1547                 /* Case 2--no more work. */
1548                 if (pip->type == P_LDUP)
1549                         goto leaf;
1550
1551                 /* Case 3 */
1552
1553                 /* Check if we have any dups. */
1554                 if (F_ISSET(pip, VRFY_HAS_DUPS)) {
1555                         /* If dups aren't allowed in this btree, trouble. */
1556                         if (!LF_ISSET(DB_ST_DUPOK)) {
1557                                 isbad = 1;
1558                                 EPRINT((env,
1559                                     "Page %lu: duplicates in non-dup btree",
1560                                     (u_long)pgno));
1561                         } else {
1562                                 /*
1563                                  * We correctly have dups.  If any are off-page,
1564                                  * traverse those btrees recursively.
1565                                  */
1566                                 if ((ret =
1567                                     __db_vrfy_childcursor(vdp, &cc)) != 0)
1568                                         goto err;
1569                                 for (ret = __db_vrfy_ccset(cc, pgno, &child);
1570                                     ret == 0;
1571                                     ret = __db_vrfy_ccnext(cc, &child)) {
1572                                         stflags =
1573                                             flags | DB_ST_RECNUM | DB_ST_DUPSET;
1574                                         /* Skip any overflow entries. */
1575                                         if (child->type == V_DUPLICATE) {
1576                                                 if ((ret = __db_vrfy_duptype(
1577                                                     dbp, vdp, child->pgno,
1578                                                     stflags)) != 0) {
1579                                                         isbad = 1;
1580                                                         /* Next child. */
1581                                                         continue;
1582                                                 }
1583                                                 if ((ret = __bam_vrfy_subtree(
1584                                                     dbp, vdp, child->pgno,
1585                                                     NULL, NULL,
1586                                                     stflags | DB_ST_TOPLEVEL,
1587                                                     NULL, NULL, NULL)) != 0) {
1588                                                         if (ret ==
1589                                                             DB_VERIFY_BAD)
1590                                                                 isbad = 1;
1591                                                         else
1592                                                                 goto err;
1593                                                 }
1594                                         }
1595                                 }
1596
1597                                 if ((ret = __db_vrfy_ccclose(cc)) != 0)
1598                                         goto err;
1599                                 cc = NULL;
1600
1601                                 /*
1602                                  * If VRFY_DUPS_UNSORTED is set,
1603                                  * DB_ST_DUPSORT had better not be.
1604                                  */
1605                                 if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
1606                                     LF_ISSET(DB_ST_DUPSORT)) {
1607                                         isbad = 1;
1608                                         EPRINT((env,
1609                     "Page %lu: unsorted duplicate set in sorted-dup database",
1610                                             (u_long)pgno));
1611                                 }
1612                         }
1613                 }
1614                 goto leaf;
1615         case P_IBTREE:
1616         case P_IRECNO:
1617                 /* We handle these below. */
1618                 break;
1619         default:
1620                 /*
1621                  * If a P_IBTREE or P_IRECNO contains a reference to an
1622                  * invalid page, we'll wind up here;  handle it gracefully.
1623                  * Note that the code at the "done" label assumes that the
1624                  * current page is a btree/recno one of some sort;  this
1625                  * is not the case here, so we goto err.
1626                  *
1627                  * If the page is entirely zeroed, its pip->type will be a lie
1628                  * (we assumed it was a hash page, as they're allowed to be
1629                  * zeroed);  handle this case specially.
1630                  */
1631                 if (F_ISSET(pip, VRFY_IS_ALLZEROES))
1632                         ZEROPG_ERR_PRINT(env, pgno, "btree or recno page");
1633                 else
1634                         EPRINT((env,
1635             "Page %lu: btree or recno page is of inappropriate type %lu",
1636                             (u_long)pgno, (u_long)pip->type));
1637
1638                 /*
1639                  * We probably lost a leaf page (or more if this was an
1640                  * internal page) from our prev/next_pgno chain.  Flag
1641                  * that this is expected;  we don't want or need to
1642                  * spew error messages about erroneous prev/next_pgnos,
1643                  * since that's probably not the real problem.
1644                  */
1645                 F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
1646
1647                 ret = DB_VERIFY_BAD;
1648                 goto err;
1649         }
1650
1651         /*
1652          * Cases 4 & 5: This is a btree or recno internal page.  For each child,
1653          * recurse, keeping a running count of nrecs and making sure the level
1654          * is always reasonable.
1655          */
1656         if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1657                 goto err;
1658         for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1659             ret = __db_vrfy_ccnext(cc, &child))
1660                 if (child->type == V_RECNO) {
1661                         if (pip->type != P_IRECNO) {
1662                                 ret = __db_unknown_path(
1663                                     env, "__bam_vrfy_subtree");
1664                                 goto err;
1665                         }
1666                         if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
1667                             NULL, NULL, flags, &child_level, &child_nrecs,
1668                             &child_relen)) != 0) {
1669                                 if (ret == DB_VERIFY_BAD)
1670                                         isbad = 1;
1671                                 else
1672                                         goto done;
1673                         }
1674
1675                         if (LF_ISSET(DB_ST_RELEN)) {
1676                                 if (relen == 0)
1677                                         relen = child_relen;
1678                                 /*
1679                                  * child_relen may be zero if the child subtree
1680                                  * is empty.
1681                                  */
1682                                 else if (child_relen > 0 &&
1683                                     relen != child_relen) {
1684                                         isbad = 1;
1685                                         EPRINT((env,
1686                            "Page %lu: recno page returned bad re_len %lu",
1687                                             (u_long)child->pgno,
1688                                             (u_long)child_relen));
1689                                 }
1690                                 if (relenp)
1691                                         *relenp = relen;
1692                         }
1693                         if (LF_ISSET(DB_ST_RECNUM)) {
1694                                 if (child->nrecs != child_nrecs) {
1695                                         isbad = 1;
1696                                         EPRINT((env,
1697                 "Page %lu: record count incorrect: actual %lu, in record %lu",
1698                                             (u_long)child->pgno,
1699                                             (u_long)child_nrecs,
1700                                             (u_long)child->nrecs));
1701                                 }
1702                                 nrecs += child_nrecs;
1703                         }
1704                         if (isbad == 0 && level != child_level + 1) {
1705                                 isbad = 1;
1706                                 EPRINT((env,
1707                 "Page %lu: recno level incorrect: got %lu, expected %lu",
1708                                     (u_long)child->pgno, (u_long)child_level,
1709                                     (u_long)(level - 1)));
1710                         }
1711                 } else if (child->type == V_OVERFLOW) {
1712                         /*
1713                          * It is possible for one internal page to reference
1714                          * a single overflow page twice, if all the items
1715                          * in the subtree referenced by slot 0 are deleted,
1716                          * then a similar number of items are put back
1717                          * before the key that formerly had been in slot 1.
1718                          *
1719                          * (Btree doesn't look at the key in slot 0, so the
1720                          * fact that the key formerly at slot 1 is the "wrong"
1721                          * parent of the stuff in the slot 0 subtree isn't
1722                          * really incorrect.)
1723                          *
1724                          * __db_vrfy_ovfl_structure is designed to be
1725                          * efficiently called multiple times for multiple
1726                          * references;  call it here as many times as is
1727                          * appropriate.
1728                          */
1729
1730                         /* Otherwise, __db_vrfy_childput would be broken. */
1731                         DB_ASSERT(env, child->refcnt >= 1);
1732
1733                         /*
1734                          * An overflow referenced more than twice here
1735                          * shouldn't happen.
1736                          */
1737                         if (child->refcnt > 2) {
1738                                 isbad = 1;
1739                                 EPRINT((env,
1740     "Page %lu: overflow page %lu referenced more than twice from internal page",
1741                                     (u_long)pgno, (u_long)child->pgno));
1742                         } else
1743                                 for (j = 0; j < child->refcnt; j++)
1744                                         if ((ret = __db_vrfy_ovfl_structure(dbp,
1745                                             vdp, child->pgno, child->tlen,
1746                                             flags)) != 0) {
1747                                                 if (ret == DB_VERIFY_BAD)
1748                                                         isbad = 1;
1749                                                 else
1750                                                         goto done;
1751                                         }
1752                 }
1753
1754         if ((ret = __db_vrfy_ccclose(cc)) != 0)
1755                 goto err;
1756         cc = NULL;
1757
1758         /* We're done with case 4. */
1759         if (pip->type == P_IRECNO)
1760                 goto done;
1761
1762         /*
1763          * Case 5.  Btree internal pages.
1764          * As described above, we need to iterate through all the
1765          * items on the page and make sure that our children sort appropriately
1766          * with respect to them.
1767          *
1768          * For each entry, li will be the "left-hand" key for the entry
1769          * itself, which must sort lower than all entries on its child;
1770          * ri will be the key to its right, which must sort greater.
1771          */
1772         if (h == NULL &&
1773             (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
1774                 goto err;
1775         for (i = 0; i < pip->entries; i += O_INDX) {
1776                 li = GET_BINTERNAL(dbp, h, i);
1777                 ri = (i + O_INDX < pip->entries) ?
1778                     GET_BINTERNAL(dbp, h, i + O_INDX) : r;
1779
1780                 /*
1781                  * The leftmost key is forcibly sorted less than all entries,
1782                  * so don't bother passing it.
1783                  */
1784                 if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
1785                     i == 0 ? NULL : li, ri, flags, &child_level,
1786                     &child_nrecs, NULL)) != 0) {
1787                         if (ret == DB_VERIFY_BAD)
1788                                 isbad = 1;
1789                         else
1790                                 goto done;
1791                 }
1792
1793                 if (LF_ISSET(DB_ST_RECNUM)) {
1794                         /*
1795                          * Keep a running tally on the actual record count so
1796                          * we can return it to our parent (if we have one) or
1797                          * compare it to the NRECS field if we're a root page.
1798                          */
1799                         nrecs += child_nrecs;
1800
1801                         /*
1802                          * Make sure the actual record count of the child
1803                          * is equal to the value in the BINTERNAL structure.
1804                          */
1805                         if (li->nrecs != child_nrecs) {
1806                                 isbad = 1;
1807                                 EPRINT((env,
1808         "Page %lu: item %lu has incorrect record count of %lu, should be %lu",
1809                                     (u_long)pgno, (u_long)i, (u_long)li->nrecs,
1810                                     (u_long)child_nrecs));
1811                         }
1812                 }
1813
1814                 if (level != child_level + 1) {
1815                         isbad = 1;
1816                         EPRINT((env,
1817                 "Page %lu: Btree level incorrect: got %lu, expected %lu",
1818                             (u_long)li->pgno,
1819                             (u_long)child_level, (u_long)(level - 1)));
1820                 }
1821         }
1822
1823         if (0) {
1824 leaf:           level = LEAFLEVEL;
1825                 if (LF_ISSET(DB_ST_RECNUM))
1826                         nrecs = pip->rec_cnt;
1827
1828                 /* XXX
1829                  * We should verify that the record count on a leaf page
1830                  * is the sum of the number of keys and the number of
1831                  * records in its off-page dups.  This requires looking
1832                  * at the page again, however, and it may all be changing
1833                  * soon, so for now we don't bother.
1834                  */
1835
1836                 if (LF_ISSET(DB_ST_RELEN) && relenp)
1837                         *relenp = pip->re_len;
1838         }
1839 done:   if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
1840                 /*
1841                  * During the page-by-page pass, item order verification was
1842                  * not finished due to the presence of overflow items.  If
1843                  * isbad == 0, though, it's now safe to do so, as we've
1844                  * traversed any child overflow pages.  Do it.
1845                  */
1846                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1847                     vdp->thread_info, NULL, 0, &h)) != 0)
1848                         goto err;
1849                 if ((ret = __bam_vrfy_itemorder(dbp,
1850                     vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
1851                         goto err;
1852                 F_CLR(pip, VRFY_INCOMPLETE);
1853         }
1854
1855         /*
1856          * It's possible to get to this point with a page that has no
1857          * items, but without having detected any sort of failure yet.
1858          * Having zero items is legal if it's a leaf--it may be the
1859          * root page in an empty tree, or the tree may have been
1860          * modified with the DB_REVSPLITOFF flag set (there's no way
1861          * to tell from what's on disk).  For an internal page,
1862          * though, having no items is a problem (all internal pages
1863          * must have children).
1864          */
1865         if (isbad == 0 && ret == 0) {
1866                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1867                     vdp->thread_info, NULL, 0, &h)) != 0)
1868                         goto err;
1869
1870                 if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
1871                         isbad = 1;
1872                         EPRINT((env,
1873                     "Page %lu: internal page is empty and should not be",
1874                             (u_long)pgno));
1875                         goto err;
1876                 }
1877         }
1878
1879         /*
1880          * Our parent has sent us BINTERNAL pointers to parent records
1881          * so that we can verify our place with respect to them.  If it's
1882          * appropriate--we have a default sort function--verify this.
1883          */
1884         if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
1885             pip->type != P_IRECNO && pip->type != P_LRECNO) {
1886                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1887                     vdp->thread_info, NULL, 0, &h)) != 0)
1888                         goto err;
1889
1890                 /*
1891                  * __bam_vrfy_treeorder needs to know what comparison function
1892                  * to use.  If DB_ST_DUPSET is set, we're in a duplicate tree
1893                  * and we use the duplicate comparison function;  otherwise,
1894                  * use the btree one.  If unset, use the default, of course.
1895                  */
1896                 func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
1897                     ((BTREE *)dbp->bt_internal)->bt_compare;
1898                 if (func == NULL)
1899                         func = __bam_defcmp;
1900
1901                 if ((ret = __bam_vrfy_treeorder(dbp,
1902                     vdp->thread_info, h, l, r, func, flags)) != 0) {
1903                         if (ret == DB_VERIFY_BAD)
1904                                 isbad = 1;
1905                         else
1906                                 goto err;
1907                 }
1908         }
1909
1910         /*
1911          * This is guaranteed to succeed for leaf pages, but no harm done.
1912          *
1913          * Internal pages below the top level do not store their own
1914          * record numbers, so we skip them.
1915          */
1916         if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
1917                 isbad = 1;
1918                 EPRINT((env,
1919                     "Page %lu: bad record count: has %lu records, claims %lu",
1920                     (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
1921         }
1922
1923         if (levelp)
1924                 *levelp = level;
1925         if (nrecsp)
1926                 *nrecsp = nrecs;
1927
1928         pgset = vdp->pgset;
1929         if ((ret = __db_vrfy_pgset_get(pgset,
1930             vdp->thread_info, pgno, &p)) != 0)
1931                 goto err;
1932         if (p != 0) {
1933                 isbad = 1;
1934                 EPRINT((env, "Page %lu: linked twice", (u_long)pgno));
1935         } else if ((ret =
1936             __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
1937                 goto err;
1938
1939         if (toplevel)
1940                 /*
1941                  * The last page's next_pgno in the leaf chain should have been
1942                  * PGNO_INVALID.
1943                  */
1944                 if (vdp->next_pgno != PGNO_INVALID) {
1945                         isbad = 1;
1946                         EPRINT((env, "Page %lu: unterminated leaf chain",
1947                             (u_long)vdp->prev_pgno));
1948                 }
1949
1950 err:    if (toplevel) {
1951                 /* Restore our caller's settings. */
1952                 vdp->next_pgno = next_pgno;
1953                 vdp->prev_pgno = prev_pgno;
1954                 vdp->leaf_type = leaf_type;
1955         }
1956
1957         if (h != NULL && (t_ret = __memp_fput(mpf,
1958             vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
1959                 ret = t_ret;
1960         if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
1961                 ret = t_ret;
1962         if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
1963                 ret = t_ret;
1964         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
1965 }
1966
1967 /*
1968  * __bam_vrfy_treeorder --
1969  *      Verify that the lowest key on a page sorts greater than the
1970  *      BINTERNAL which points to it (lp), and the highest key
1971  *      sorts less than the BINTERNAL above that (rp).
1972  *
1973  *      If lp is NULL, this means that it was the leftmost key on the
1974  *      parent, which (regardless of sort function) sorts less than
1975  *      all keys.  No need to check it.
1976  *
1977  *      If rp is NULL, lp was the highest key on the parent, so there's
1978  *      no higher key we must sort less than.
1979  */
1980 static int
1981 __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
1982         DB *dbp;
1983         DB_THREAD_INFO *ip;
1984         PAGE *h;
1985         BINTERNAL *lp, *rp;
1986         int (*func) __P((DB *, const DBT *, const DBT *));
1987         u_int32_t flags;
1988 {
1989         BOVERFLOW *bo;
1990         DBC *dbc;
1991         DBT dbt;
1992         ENV *env;
1993         db_indx_t last;
1994         int ret, cmp;
1995
1996         env = dbp->env;
1997         memset(&dbt, 0, sizeof(DBT));
1998         F_SET(&dbt, DB_DBT_MALLOC);
1999         ret = 0;
2000
2001         /*
2002          * Empty pages are sorted correctly by definition.  We check
2003          * to see whether they ought to be empty elsewhere;  leaf
2004          * pages legally may be.
2005          */
2006         if (NUM_ENT(h) == 0)
2007                 return (0);
2008
2009         switch (TYPE(h)) {
2010         case P_IBTREE:
2011         case P_LDUP:
2012                 last = NUM_ENT(h) - O_INDX;
2013                 break;
2014         case P_LBTREE:
2015                 last = NUM_ENT(h) - P_INDX;
2016                 break;
2017         default:
2018                 return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
2019         }
2020
2021         /* Populate a dummy cursor. */
2022         if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2023             PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2024                 return (ret);
2025         /*
2026          * The key on page h, the child page, is more likely to be
2027          * an overflow page, so we pass its offset, rather than lp/rp's,
2028          * into __bam_cmp.  This will take advantage of __db_moff.
2029          */
2030
2031         /*
2032          * Skip first-item check if we're an internal page--the first
2033          * entry on an internal page is treated specially by __bam_cmp,
2034          * so what's on the page shouldn't matter.  (Plus, since we're passing
2035          * our page and item 0 as to __bam_cmp, we'll sort before our
2036          * parent and falsely report a failure.)
2037          */
2038         if (lp != NULL && TYPE(h) != P_IBTREE) {
2039                 if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2040                     PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2041                         return (ret);
2042                 if (lp->type == B_KEYDATA) {
2043                         dbt.data = lp->data;
2044                         dbt.size = lp->len;
2045                 } else if (lp->type == B_OVERFLOW) {
2046                         bo = (BOVERFLOW *)lp->data;
2047                         if ((ret = __db_goff(dbc, &dbt,
2048                             bo->tlen, bo->pgno, NULL, NULL)) != 0)
2049                                 return (ret);
2050                 } else
2051                         return (
2052                             __db_unknown_path(env, "__bam_vrfy_treeorder"));
2053
2054                 /* On error, fall through, free if needed, and return. */
2055                 if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
2056                         if (cmp > 0) {
2057                                 EPRINT((env,
2058             "Page %lu: first item on page sorted greater than parent entry",
2059                                     (u_long)PGNO(h)));
2060                                 ret = DB_VERIFY_BAD;
2061                         }
2062                 } else
2063                         EPRINT((env,
2064                             "Page %lu: first item on page had comparison error",
2065                             (u_long)PGNO(h)));
2066
2067                 if (dbt.data != lp->data)
2068                         __os_ufree(env, dbt.data);
2069                 if (ret != 0)
2070                         return (ret);
2071         }
2072
2073         if (rp != NULL) {
2074                 if (rp->type == B_KEYDATA) {
2075                         dbt.data = rp->data;
2076                         dbt.size = rp->len;
2077                 } else if (rp->type == B_OVERFLOW) {
2078                         bo = (BOVERFLOW *)rp->data;
2079                         if ((ret = __db_goff(dbc, &dbt,
2080                             bo->tlen, bo->pgno, NULL, NULL)) != 0)
2081                                 return (ret);
2082                 } else
2083                         return (
2084                             __db_unknown_path(env, "__bam_vrfy_treeorder"));
2085
2086                 /* On error, fall through, free if needed, and return. */
2087                 if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
2088                         if (cmp < 0) {
2089                                 EPRINT((env,
2090             "Page %lu: last item on page sorted greater than parent entry",
2091                                     (u_long)PGNO(h)));
2092                                 ret = DB_VERIFY_BAD;
2093                         }
2094                 } else
2095                         EPRINT((env,
2096                             "Page %lu: last item on page had comparison error",
2097                             (u_long)PGNO(h)));
2098
2099                 if (dbt.data != rp->data)
2100                         __os_ufree(env, dbt.data);
2101         }
2102
2103         return (ret);
2104 }
2105
2106 /*
2107  * __bam_salvage --
2108  *      Safely dump out anything that looks like a key on an alleged
2109  *      btree leaf page, also mark overflow pages as seen.  For internal btree
2110  *      pages, just mark any overflow pages as seen.
2111  *
2112  * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
2113  * PUBLIC:     db_pgno_t, u_int32_t, PAGE *, void *,
2114  * PUBLIC:     int (*)(void *, const void *), DBT *, u_int32_t));
2115  */
2116 int
2117 __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
2118         DB *dbp;
2119         VRFY_DBINFO *vdp;
2120         db_pgno_t pgno;
2121         u_int32_t pgtype;
2122         PAGE *h;
2123         void *handle;
2124         int (*callback) __P((void *, const void *));
2125         DBT *key;
2126         u_int32_t flags;
2127 {
2128         BKEYDATA *bk;
2129         BOVERFLOW *bo;
2130         DBT dbt, repldbt, unknown_key, unknown_data;
2131         ENV *env;
2132         VRFY_ITEM *pgmap;
2133         db_indx_t i, last, beg, end, *inp;
2134         db_pgno_t ovflpg;
2135         u_int32_t himark, ovfl_bufsz;
2136         void *ovflbuf;
2137         int adj, ret, t_ret, t2_ret;
2138 #ifdef HAVE_COMPRESSION
2139         DBT kcpy, *last_key;
2140         int unknown_dup_key;
2141 #endif
2142
2143         env = dbp->env;
2144         ovflbuf = pgmap = NULL;
2145         inp = P_INP(dbp, h);
2146
2147         memset(&dbt, 0, sizeof(DBT));
2148         dbt.flags = DB_DBT_REALLOC;
2149         memset(&repldbt, 0, sizeof(DBT));
2150
2151 #ifdef HAVE_COMPRESSION
2152         memset(&kcpy, 0, sizeof(DBT));
2153         unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY);
2154         last_key = unknown_dup_key ? NULL : key;
2155 #endif
2156         LF_CLR(DB_SA_UNKNOWNKEY);
2157
2158         DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
2159         DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
2160
2161         /*
2162          * Allocate a buffer for overflow items.  Start at one page;
2163          * __db_safe_goff will realloc as needed.
2164          */
2165         if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
2166                 goto err;
2167         ovfl_bufsz = dbp->pgsize;
2168
2169         if (LF_ISSET(DB_AGGRESSIVE) && (ret =
2170             __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
2171                 goto err;
2172
2173         /*
2174          * Loop through the inp array, spitting out key/data pairs.
2175          *
2176          * If we're salvaging normally, loop from 0 through NUM_ENT(h).  If
2177          * we're being aggressive, loop until we hit the end of the page --
2178          * NUM_ENT() may be bogus.
2179          */
2180         himark = dbp->pgsize;
2181         for (i = 0, last = UINT16_MAX;; i += O_INDX) {
2182                 /*
2183                  * If we're not aggressive, or if we're on an internal page,
2184                  * break when we hit NUM_ENT(h).
2185                  */
2186                 if ((!LF_ISSET(DB_AGGRESSIVE) ||
2187                     pgtype == P_IBTREE) && i >= NUM_ENT(h))
2188                         break;
2189
2190                 /* Verify the current item. */
2191                 t_ret =
2192                     __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
2193
2194                 if (t_ret != 0) {
2195                         /*
2196                          * If this is a btree leaf and we've printed out a key
2197                          * but not its associated data item, fix this imbalance
2198                          * by printing an "UNKNOWN_DATA".
2199                          */
2200                         if (pgtype == P_LBTREE && i % P_INDX == 1 &&
2201                             last == i - 1 && (t2_ret = __db_vrfy_prdbt(
2202                             &unknown_data,
2203                             0, " ", handle, callback, 0, vdp)) != 0) {
2204                                 if (ret == 0)
2205                                         ret = t2_ret;
2206                                 goto err;
2207                         }
2208
2209                         /*
2210                          * Don't return DB_VERIFY_FATAL; it's private and means
2211                          * only that we can't go on with this page, not with
2212                          * the whole database.  It's not even an error if we've
2213                          * run into it after NUM_ENT(h).
2214                          */
2215                         if (t_ret == DB_VERIFY_FATAL) {
2216                                 if (i < NUM_ENT(h) && ret == 0)
2217                                         ret = DB_VERIFY_BAD;
2218                                 break;
2219                         }
2220                         continue;
2221                 }
2222
2223                 /*
2224                  * If this returned 0, it's safe to print or (carefully)
2225                  * try to fetch.
2226                  *
2227                  * We only print deleted items if DB_AGGRESSIVE is set.
2228                  */
2229                 bk = GET_BKEYDATA(dbp, h, i);
2230                 if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
2231                         continue;
2232
2233                 /*
2234                  * If this is a btree leaf and we're about to print out a data
2235                  * item for which we didn't print out a key, fix this imbalance
2236                  * by printing an "UNKNOWN_KEY".
2237                  */
2238                 if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) {
2239 #ifdef HAVE_COMPRESSION
2240                         last_key = NULL;
2241 #endif
2242                         if ((t_ret = __db_vrfy_prdbt(&unknown_key,
2243                             0, " ", handle, callback, 0, vdp)) != 0) {
2244                                 if (ret == 0)
2245                                         ret = t_ret;
2246                                 goto err;
2247                         }
2248                 }
2249                 last = i;
2250
2251                 /*
2252                  * We're going to go try to print the next item.  If key is
2253                  * non-NULL, we're a dup page, so we've got to print the key
2254                  * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
2255                  * first entry.
2256                  */
2257                 if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) {
2258 #ifdef HAVE_COMPRESSION
2259                         last_key = unknown_dup_key ? NULL : key;
2260 #endif
2261                         if ((t_ret = __db_vrfy_prdbt(key,
2262                             0, " ", handle, callback, 0, vdp)) != 0) {
2263                                 if (ret == 0)
2264                                         ret = t_ret;
2265                                 goto err;
2266                         }
2267                 }
2268
2269                 beg = end = inp[i];
2270                 switch (B_TYPE(bk->type)) {
2271                 case B_DUPLICATE:
2272                         if (pgtype == P_IBTREE)
2273                                 break;
2274
2275                         end = beg + BOVERFLOW_SIZE - 1;
2276                         /*
2277                          * If we're not on a normal btree leaf page, there
2278                          * shouldn't be off-page dup sets.  Something's
2279                          * confused; just drop it, and the code to pick up
2280                          * unlinked offpage dup sets will print it out
2281                          * with key "UNKNOWN" later.
2282                          */
2283                         if (pgtype != P_LBTREE)
2284                                 break;
2285
2286                         bo = (BOVERFLOW *)bk;
2287
2288                         /*
2289                          * If the page number is unreasonable, or if this is
2290                          * supposed to be a key item, output "UNKNOWN_KEY" --
2291                          * the best we can do is run into the data items in
2292                          * the unlinked offpage dup pass.
2293                          */
2294                         if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
2295                                 /* Not much to do on failure. */
2296 #ifdef HAVE_COMPRESSION
2297                                 if (key == NULL && i % P_INDX == 0)
2298                                         last_key = NULL;
2299 #endif
2300                                 if ((t_ret = __db_vrfy_prdbt(
2301                                  i % P_INDX == 0 ? &unknown_key : &unknown_data,
2302                                     0, " ", handle, callback, 0, vdp)) != 0) {
2303                                         if (ret == 0)
2304                                                 ret = t_ret;
2305                                         goto err;
2306                                 }
2307                                 break;
2308                         }
2309
2310                         /* Don't stop on error. */
2311                         if ((t_ret = __db_salvage_duptree(dbp,
2312                             vdp, bo->pgno, &dbt, handle, callback,
2313                             flags | DB_SA_SKIPFIRSTKEY
2314 #ifdef HAVE_COMPRESSION
2315                             | (last_key == NULL ? DB_SA_UNKNOWNKEY : 0)
2316 #endif
2317                             )) != 0 && ret == 0)
2318                                 ret = t_ret;
2319
2320                         break;
2321                 case B_KEYDATA:
2322                         if (pgtype == P_IBTREE)
2323                                 break;
2324
2325                         end = (db_indx_t)DB_ALIGN(
2326                             beg + bk->len, sizeof(u_int32_t)) - 1;
2327
2328                         dbt.data = bk->data;
2329                         dbt.size = bk->len;
2330
2331 #ifdef HAVE_COMPRESSION
2332                         if (DB_IS_COMPRESSED(dbp) && last_key != NULL &&
2333                             (key != NULL || (i % P_INDX == 1))) {
2334                                 /* Decompress the key/data pair  - the key
2335                                    is in last_key, and the data is in dbt */
2336                                 if ((t_ret = __bam_compress_salvage(dbp, vdp,
2337                                     handle, callback, last_key, &dbt)) != 0) {
2338                                         if (t_ret == DB_VERIFY_FATAL) {
2339                                                 if (ret == 0)
2340                                                         ret = DB_VERIFY_BAD;
2341                                                 if (!LF_ISSET(DB_AGGRESSIVE))
2342                                                         goto err;
2343                                         } else if (ret == 0) {
2344                                                 ret = t_ret;
2345                                                 goto err;
2346                                         }
2347                                 }
2348                         } else {
2349                                 if (key == NULL && i % P_INDX == 0) {
2350                                         if ((ret = __os_realloc(
2351                                             env, dbt.size, &kcpy.data)) != 0)
2352                                                 goto err;
2353                                         memcpy(kcpy.data, dbt.data, dbt.size);
2354                                         kcpy.size = dbt.size;
2355                                         last_key = &kcpy;
2356                                 }
2357 #endif
2358
2359                                 if ((t_ret = __db_vrfy_prdbt(&dbt,
2360                                     0, " ", handle, callback, 0, vdp)) != 0) {
2361                                         if (ret == 0)
2362                                                 ret = t_ret;
2363                                         goto err;
2364                                 }
2365 #ifdef HAVE_COMPRESSION
2366                         }
2367 #endif
2368                         break;
2369                 case B_OVERFLOW:
2370                         if (pgtype != P_IBTREE)
2371                                 end = beg + BOVERFLOW_SIZE - 1;
2372                         bo = (BOVERFLOW *)bk;
2373
2374                         /*
2375                          * Check for replicated overflow keys, so that we only
2376                          * call __db_safe_goff once per overflow page.  If we
2377                          * get the same offset as the previous key just re-use
2378                          * the previous dbt.
2379                          *
2380                          * P_IBTREE pages will never have replicated overflow
2381                          * keys.
2382                          */
2383                         adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
2384                         if (pgtype == P_IBTREE) {
2385                                 /*
2386                                  * If we're looking at a P_IBTREE, we just want
2387                                  * to mark the overflow page as seen.
2388                                  *
2389                                  * Note that this call to __db_safe_goff differs
2390                                  * from the non-P_IBTREE call.
2391                                  *
2392                                  * Only call __db_safe_goff if the overflow page
2393                                  * hasn't been seen.
2394                                  */
2395                                 ovflpg = ((BOVERFLOW *)
2396                                     ((BINTERNAL *)bk)->data)->pgno;
2397                                 if (__db_salvage_isdone(vdp, ovflpg) == 0 &&
2398                                     (t_ret =__db_safe_goff(dbp, vdp, ovflpg,
2399                                         &dbt, &ovflbuf,
2400                                         &ovfl_bufsz, flags)) != 0 && ret == 0)
2401                                         ret = t_ret;
2402                                 break;
2403                         } else if (i > adj - 1 &&
2404                             i % adj == 0 && inp[i] == inp[i - adj])
2405                                 dbt = repldbt;
2406                         else {
2407                                 /* Don't stop on error. */
2408                                 if ((t_ret = __db_safe_goff(dbp, vdp,
2409                                     bo->pgno, &dbt, &ovflbuf,
2410                                     &ovfl_bufsz, flags)) != 0 && ret == 0)
2411                                         ret = t_ret;
2412
2413                                 /*
2414                                  * If this is a key, save it in case the next
2415                                  * key is a replicated overflow, so we don't
2416                                  * call __db_safe_goff again.  Copy out dbt.data
2417                                  * in case that pointer gets realloc'd when
2418                                  * getting a data item.
2419                                  */
2420                                 if (i % P_INDX == 0) {
2421                                         if (t_ret == 0) {
2422                                                 if ((t_ret = __os_realloc(env,
2423                                                         dbt.size,
2424                                                         &repldbt.data)) != 0) {
2425                                                         if (ret == 0)
2426                                                                 ret = t_ret;
2427                                                         goto err;
2428                                                 }
2429                                                 memcpy(repldbt.data,
2430                                                     dbt.data, dbt.size);
2431                                                 repldbt.size = dbt.size;
2432                                         } else {
2433                                                 if (__os_realloc(env,
2434                                                     unknown_key.size,
2435                                                     &repldbt.data) != 0)
2436                                                         goto err;
2437                                                 memcpy(repldbt.data,
2438                                                     unknown_key.data,
2439                                                     unknown_key.size);
2440                                                 repldbt.size = unknown_key.size;
2441                                         }
2442                                 }
2443
2444                         }
2445
2446 #ifdef HAVE_COMPRESSION
2447                         if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 &&
2448                             (key != NULL || (i % P_INDX == 1))) {
2449                                 /* Decompress the key/data pair  - the key
2450                                    is in last_key, and the data is in dbt */
2451                                 if ((t_ret = __bam_compress_salvage(dbp, vdp,
2452                                     handle, callback, last_key, &dbt)) != 0) {
2453                                         if (t_ret == DB_VERIFY_FATAL) {
2454                                                 if (ret == 0)
2455                                                         ret = DB_VERIFY_BAD;
2456                                                 if (!LF_ISSET(DB_AGGRESSIVE))
2457                                                         goto err;
2458                                         } else if (ret == 0) {
2459                                                 ret = t_ret;
2460                                                 goto err;
2461                                         }
2462                                 }
2463                         } else {
2464                                 if (key == NULL && i % P_INDX == 0) {
2465                                         if (t_ret == 0) {
2466                                                 if ((ret = __os_realloc(env,
2467                                                     dbt.size, &kcpy.data)) != 0)
2468                                                         goto err;
2469                                                 memcpy(kcpy.data, dbt.data,
2470                                                         dbt.size);
2471                                                 kcpy.size = dbt.size;
2472                                                 last_key = &kcpy;
2473                                         } else
2474                                                 last_key = NULL;
2475                                 }
2476 #endif
2477
2478                                 if ((t_ret = __db_vrfy_prdbt(
2479                                              t_ret == 0 ? &dbt : &unknown_key,
2480                                              0, " ", handle, callback, 0, vdp))
2481                                         != 0 && ret == 0)
2482                                         ret = t_ret;
2483 #ifdef HAVE_COMPRESSION
2484                         }
2485 #endif
2486                         break;
2487                 default:
2488                         /*
2489                          * We should never get here; __db_vrfy_inpitem should
2490                          * not be returning 0 if bk->type is unrecognizable.
2491                          */
2492                         t_ret = __db_unknown_path(env, "__bam_salvage");
2493                         if (ret == 0)
2494                                 ret = t_ret;
2495                         goto err;
2496                 }
2497
2498                 /*
2499                  * If we're being aggressive, mark the beginning and end of
2500                  * the item; we'll come back and print whatever "junk" is in
2501                  * the gaps in case we had any bogus inp elements and thereby
2502                  * missed stuff.
2503                  */
2504                 if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
2505                         pgmap[beg] = VRFY_ITEM_BEGIN;
2506                         pgmap[end] = VRFY_ITEM_END;
2507                 }
2508         }
2509
2510 err:    if (pgmap != NULL)
2511                 __os_free(env, pgmap);
2512         if (ovflbuf != NULL)
2513                 __os_free(env, ovflbuf);
2514         if (repldbt.data != NULL)
2515                 __os_free(env, repldbt.data);
2516 #ifdef HAVE_COMPRESSION
2517         if (kcpy.data != NULL)
2518                 __os_free(env, kcpy.data);
2519 #endif
2520
2521         /* Mark this page as done. */
2522         if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
2523                 ret = t_ret;
2524
2525         return (ret);
2526 }
2527
2528 /*
2529  * __bam_salvage_walkdupint --
2530  *      Walk a known-good btree or recno internal page which is part of
2531  *      a dup tree, calling __db_salvage_duptree on each child page.
2532  *
2533  * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
2534  * PUBLIC:     DBT *, void *, int (*)(void *, const void *), u_int32_t));
2535  */
2536 int
2537 __bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
2538         DB *dbp;
2539         VRFY_DBINFO *vdp;
2540         PAGE *h;
2541         DBT *key;
2542         void *handle;
2543         int (*callback) __P((void *, const void *));
2544         u_int32_t flags;
2545 {
2546         BINTERNAL *bi;
2547         ENV *env;
2548         RINTERNAL *ri;
2549         int ret, t_ret;
2550         db_indx_t i;
2551
2552         env = dbp->env;
2553         ret = 0;
2554
2555         for (i = 0; i < NUM_ENT(h); i++) {
2556                 switch (TYPE(h)) {
2557                 case P_IBTREE:
2558                         bi = GET_BINTERNAL(dbp, h, i);
2559                         if ((t_ret = __db_salvage_duptree(dbp,
2560                             vdp, bi->pgno, key, handle, callback, flags)) != 0)
2561                                 ret = t_ret;
2562                         break;
2563                 case P_IRECNO:
2564                         ri = GET_RINTERNAL(dbp, h, i);
2565                         if ((t_ret = __db_salvage_duptree(dbp,
2566                             vdp, ri->pgno, key, handle, callback, flags)) != 0)
2567                                 ret = t_ret;
2568                         break;
2569                 default:
2570                         return (__db_unknown_path(
2571                             env, "__bam_salvage_walkdupint"));
2572                 }
2573                 /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
2574                 flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
2575         }
2576
2577         return (ret);
2578 }
2579
2580 /*
2581  * __bam_meta2pgset --
2582  *      Given a known-good meta page, return in pgsetp a 0-terminated list of
2583  *      db_pgno_t's corresponding to the pages in the btree.
2584  *
2585  *      We do this by a somewhat sleazy method, to avoid having to traverse the
2586  *      btree structure neatly:  we walk down the left side to the very
2587  *      first leaf page, then we mark all the pages in the chain of
2588  *      NEXT_PGNOs (being wary of cycles and invalid ones), then we
2589  *      consolidate our scratch array into a nice list, and return.  This
2590  *      avoids the memory management hassles of recursion and the
2591  *      trouble of walking internal pages--they just don't matter, except
2592  *      for the left branch.
2593  *
2594  * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
2595  * PUBLIC:     u_int32_t, DB *));
2596  */
2597 int
2598 __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
2599         DB *dbp;
2600         VRFY_DBINFO *vdp;
2601         BTMETA *btmeta;
2602         u_int32_t flags;
2603         DB *pgset;
2604 {
2605         BINTERNAL *bi;
2606         DB_MPOOLFILE *mpf;
2607         PAGE *h;
2608         RINTERNAL *ri;
2609         db_pgno_t current, p;
2610         int err_ret, ret;
2611
2612         DB_ASSERT(dbp->env, pgset != NULL);
2613
2614         mpf = dbp->mpf;
2615         h = NULL;
2616         ret = err_ret = 0;
2617
2618         for (current = btmeta->root;;) {
2619                 if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
2620                         err_ret = DB_VERIFY_BAD;
2621                         goto err;
2622                 }
2623                 if ((ret = __memp_fget(mpf, &current,
2624                      vdp->thread_info, NULL, 0, &h)) != 0) {
2625                         err_ret = ret;
2626                         goto err;
2627                 }
2628
2629                 switch (TYPE(h)) {
2630                 case P_IBTREE:
2631                 case P_IRECNO:
2632                         if ((ret = __bam_vrfy(dbp,
2633                             vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
2634                                 err_ret = ret;
2635                                 goto err;
2636                         }
2637                         if (TYPE(h) == P_IBTREE) {
2638                                 bi = GET_BINTERNAL(dbp, h, 0);
2639                                 current = bi->pgno;
2640                         } else {        /* P_IRECNO */
2641                                 ri = GET_RINTERNAL(dbp, h, 0);
2642                                 current = ri->pgno;
2643                         }
2644                         break;
2645                 case P_LBTREE:
2646                 case P_LRECNO:
2647                         goto traverse;
2648                 default:
2649                         err_ret = DB_VERIFY_BAD;
2650                         goto err;
2651                 }
2652
2653                 if ((ret = __memp_fput(mpf,
2654                      vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2655                         err_ret = ret;
2656                 h = NULL;
2657         }
2658
2659         /*
2660          * At this point, current is the pgno of leaf page h, the 0th in the
2661          * tree we're concerned with.
2662          */
2663 traverse:
2664         while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
2665                 if (h == NULL && (ret = __memp_fget(mpf,
2666                     &current, vdp->thread_info, NULL, 0, &h)) != 0) {
2667                         err_ret = ret;
2668                         break;
2669                 }
2670
2671                 if ((ret = __db_vrfy_pgset_get(pgset,
2672                     vdp->thread_info, current, (int *)&p)) != 0)
2673                         goto err;
2674
2675                 if (p != 0) {
2676                         /*
2677                          * We've found a cycle.  Return success anyway--
2678                          * our caller may as well use however much of
2679                          * the pgset we've come up with.
2680                          */
2681                         break;
2682                 }
2683                 if ((ret =
2684                     __db_vrfy_pgset_inc(pgset, vdp->thread_info, current)) != 0)
2685                         goto err;
2686
2687                 current = NEXT_PGNO(h);
2688                 if ((ret = __memp_fput(mpf,
2689                      vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2690                         err_ret = ret;
2691                 h = NULL;
2692         }
2693
2694 err:    if (h != NULL)
2695                 (void)__memp_fput(mpf,
2696                     vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
2697
2698         return (ret == 0 ? err_ret : ret);
2699 }
2700
2701 /*
2702  * __bam_safe_getdata --
2703  *
2704  *      Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
2705  *      index i, page h, and sticks it in DBT dbt.  If ovflok is 1 and i's an
2706  *      overflow item, we do a safe_goff to get the item and signal that we need
2707  *      to free dbt->data;  if ovflok is 0, we leaves the DBT zeroed.
2708  */
2709 static int
2710 __bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
2711         DB *dbp;
2712         DB_THREAD_INFO *ip;
2713         PAGE *h;
2714         u_int32_t i;
2715         int ovflok;
2716         DBT *dbt;
2717         int *freedbtp;
2718 {
2719         BKEYDATA *bk;
2720         BOVERFLOW *bo;
2721         DBC *dbc;
2722         int ret;
2723
2724         memset(dbt, 0, sizeof(DBT));
2725         *freedbtp = 0;
2726
2727         bk = GET_BKEYDATA(dbp, h, i);
2728         if (B_TYPE(bk->type) == B_OVERFLOW) {
2729                 if (!ovflok)
2730                         return (0);
2731
2732                 if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2733                     PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2734                         return (ret);
2735                 bo = (BOVERFLOW *)bk;
2736                 F_SET(dbt, DB_DBT_MALLOC);
2737
2738                 *freedbtp = 1;
2739                 return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL));
2740         } else {
2741                 dbt->data = bk->data;
2742                 dbt->size = bk->len;
2743         }
2744
2745         return (0);
2746 }