Imported Upstream version 5.3.21
[platform/upstream/libdb.git] / src / btree / bt_verify.c
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1999, 2012 Oracle and/or its affiliates.  All rights reserved.
5  *
6  * $Id$
7  */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_verify.h"
14 #include "dbinc/btree.h"
15 #include "dbinc/lock.h"
16 #include "dbinc/mp.h"
17
18 static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
19     PAGE *, u_int32_t, int, DBT *, int *));
20 static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
21     db_indx_t *, u_int32_t));
22 static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
23     BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
24     u_int32_t));
25 static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
26     db_indx_t *, u_int32_t));
27
28 /*
29  * __bam_vrfy_meta --
30  *      Verify the btree-specific part of a metadata page.
31  *
32  * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
33  * PUBLIC:     db_pgno_t, u_int32_t));
34  */
35 int
36 __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
37         DB *dbp;
38         VRFY_DBINFO *vdp;
39         BTMETA *meta;
40         db_pgno_t pgno;
41         u_int32_t flags;
42 {
43         ENV *env;
44         VRFY_PAGEINFO *pip;
45         int isbad, t_ret, ret;
46         db_indx_t ovflsize;
47
48         env = dbp->env;
49         isbad = 0;
50
51         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
52                 return (ret);
53
54         /*
55          * If we came through __db_vrfy_pagezero, we have already checked the
56          * common fields.  However, we used the on-disk metadata page, it may
57          * have been stale.  We now have the page from mpool, so check that.
58          */
59         if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
60                 if (ret == DB_VERIFY_BAD)
61                         isbad = 1;
62                 else
63                         goto err;
64         }
65
66         /* bt_minkey:  must be >= 2; must produce sensible ovflsize */
67
68         /* avoid division by zero */
69         ovflsize = meta->minkey > 0 ?
70             B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
71
72         if (meta->minkey < 2 ||
73             ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
74                 pip->bt_minkey = 0;
75                 isbad = 1;
76                 EPRINT((env, DB_STR_A("1034",
77             "Page %lu: nonsensical bt_minkey value %lu on metadata page",
78                     "%lu %lu"), (u_long)pgno, (u_long)meta->minkey));
79         } else
80                 pip->bt_minkey = meta->minkey;
81
82         /* re_len: no constraints on this (may be zero or huge--we make rope) */
83         pip->re_pad = meta->re_pad;
84         pip->re_len = meta->re_len;
85
86         /*
87          * The root must not be current page or 0 and it must be within
88          * database.  If this metadata page is the master meta data page
89          * of the file, then the root page had better be page 1.
90          */
91         pip->root = 0;
92         if (meta->root == PGNO_INVALID ||
93             meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
94             (pgno == PGNO_BASE_MD && meta->root != 1)) {
95                 isbad = 1;
96                 EPRINT((env, DB_STR_A("1035",
97                     "Page %lu: nonsensical root page %lu on metadata page",
98                     "%lu %lu"), (u_long)pgno, (u_long)meta->root));
99         } else
100                 pip->root = meta->root;
101
102         /* Flags. */
103         if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
104                 F_SET(pip, VRFY_IS_RRECNO);
105
106         if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
107                 /*
108                  * If this is a master db meta page, it had better not have
109                  * duplicates.
110                  */
111                 if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
112                         isbad = 1;
113                         EPRINT((env, DB_STR_A("1036",
114 "Page %lu: Btree metadata page has both duplicates and multiple databases",
115                             "%lu"), (u_long)pgno));
116                 }
117                 F_SET(pip, VRFY_HAS_SUBDBS);
118         }
119
120         if (F_ISSET(&meta->dbmeta, BTM_DUP))
121                 F_SET(pip, VRFY_HAS_DUPS);
122         if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
123                 F_SET(pip, VRFY_HAS_DUPSORT);
124         if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
125                 F_SET(pip, VRFY_HAS_RECNUMS);
126         if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
127                 EPRINT((env, DB_STR_A("1037",
128     "Page %lu: Btree metadata page illegally has both recnums and dups",
129                     "%lu"), (u_long)pgno));
130                 isbad = 1;
131         }
132
133         if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
134                 F_SET(pip, VRFY_IS_RECNO);
135                 dbp->type = DB_RECNO;
136         } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
137                 isbad = 1;
138                 EPRINT((env, DB_STR_A("1038",
139     "Page %lu: metadata page has renumber flag set but is not recno",
140                     "%lu"), (u_long)pgno));
141         }
142
143 #ifdef HAVE_COMPRESSION
144         if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) {
145                 F_SET(pip, VRFY_HAS_COMPRESS);
146                 if (!DB_IS_COMPRESSED(dbp)) {
147                         ((BTREE *)dbp->bt_internal)->bt_compress =
148                             __bam_defcompress;
149                         ((BTREE *)dbp->bt_internal)->bt_decompress =
150                             __bam_defdecompress;
151                 }
152                 /*
153                  * Copy dup_compare to compress_dup_compare, and use the
154                  * compression duplicate compare.
155                  */
156                 if (F_ISSET(pip, VRFY_HAS_DUPSORT)) {
157                         if (dbp->dup_compare == NULL)
158                                 dbp->dup_compare = __bam_defcmp;
159                         if (((BTREE *)dbp->bt_internal)->compress_dup_compare
160                                 == NULL) {
161                                 ((BTREE *)dbp->bt_internal)->
162                                         compress_dup_compare = dbp->dup_compare;
163                                 dbp->dup_compare = __bam_compress_dupcmp;
164                         }
165                 }
166         }
167
168         if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) {
169                 EPRINT((env, DB_STR_A("1039",
170     "Page %lu: Btree metadata page illegally has both recnums and compression",
171                     "%lu"), (u_long)pgno));
172                 isbad = 1;
173         }
174         if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) &&
175             F_ISSET(pip, VRFY_HAS_COMPRESS)) {
176                 EPRINT((env, DB_STR_A("1040",
177                     "Page %lu: Btree metadata page illegally has both "
178                     "unsorted duplicates and compression",
179                     "%lu"), (u_long)pgno));
180                 isbad = 1;
181         }
182 #endif
183
184         if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
185                 EPRINT((env, DB_STR_A("1041",
186                     "Page %lu: recno metadata page specifies duplicates",
187                     "%lu"), (u_long)pgno));
188                 isbad = 1;
189         }
190
191         if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
192                 F_SET(pip, VRFY_IS_FIXEDLEN);
193         else if (pip->re_len > 0) {
194                 /*
195                  * It's wrong to have an re_len if it's not a fixed-length
196                  * database
197                  */
198                 isbad = 1;
199                 EPRINT((env, DB_STR_A("1042",
200                     "Page %lu: re_len of %lu in non-fixed-length database",
201                     "%lu %lu"), (u_long)pgno, (u_long)pip->re_len));
202         }
203
204         /*
205          * We do not check that the rest of the page is 0, because it may
206          * not be and may still be correct.
207          */
208
209 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
210                 ret = t_ret;
211         if (LF_ISSET(DB_SALVAGE) &&
212            (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
213                 ret = t_ret;
214         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
215 }
216
217 /*
218  * __ram_vrfy_leaf --
219  *      Verify a recno leaf page.
220  *
221  * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
222  * PUBLIC:     u_int32_t));
223  */
224 int
225 __ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
226         DB *dbp;
227         VRFY_DBINFO *vdp;
228         PAGE *h;
229         db_pgno_t pgno;
230         u_int32_t flags;
231 {
232         BKEYDATA *bk;
233         ENV *env;
234         VRFY_PAGEINFO *pip;
235         db_indx_t i;
236         int ret, t_ret, isbad;
237         u_int32_t re_len_guess, len;
238
239         env = dbp->env;
240         isbad = 0;
241
242         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
243                 return (ret);
244
245         if (TYPE(h) != P_LRECNO) {
246                 ret = __db_unknown_path(env, "__ram_vrfy_leaf");
247                 goto err;
248         }
249
250         /*
251          * Verify (and, if relevant, save off) page fields common to
252          * all PAGEs.
253          */
254         if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
255                 if (ret == DB_VERIFY_BAD)
256                         isbad = 1;
257                 else
258                         goto err;
259         }
260
261         /*
262          * Verify inp[].  Return immediately if it returns DB_VERIFY_BAD;
263          * further checks are dangerous.
264          */
265         if ((ret = __bam_vrfy_inp(dbp,
266             vdp, h, pgno, &pip->entries, flags)) != 0)
267                 goto err;
268
269         if (F_ISSET(pip, VRFY_HAS_DUPS)) {
270                 EPRINT((env, DB_STR_A("1043",
271                     "Page %lu: Recno database has dups",
272                     "%lu"), (u_long)pgno));
273                 ret = DB_VERIFY_BAD;
274                 goto err;
275         }
276
277         /*
278          * Walk through inp and see if the lengths of all the records are the
279          * same--if so, this may be a fixed-length database, and we want to
280          * save off this value.  We know inp to be safe if we've gotten this
281          * far.
282          */
283         re_len_guess = 0;
284         for (i = 0; i < NUM_ENT(h); i++) {
285                 bk = GET_BKEYDATA(dbp, h, i);
286                 /* KEYEMPTY.  Go on. */
287                 if (B_DISSET(bk->type))
288                         continue;
289                 if (bk->type == B_OVERFLOW)
290                         len = ((BOVERFLOW *)bk)->tlen;
291                 else if (bk->type == B_KEYDATA)
292                         len = bk->len;
293                 else {
294                         isbad = 1;
295                         EPRINT((env, DB_STR_A("1044",
296                             "Page %lu: nonsensical type for item %lu",
297                             "%lu %lu"), (u_long)pgno, (u_long)i));
298                         continue;
299                 }
300                 if (re_len_guess == 0)
301                         re_len_guess = len;
302
303                 /*
304                  * Is this item's len the same as the last one's?  If not,
305                  * reset to 0 and break--we don't have a single re_len.
306                  * Otherwise, go on to the next item.
307                  */
308                 if (re_len_guess != len) {
309                         re_len_guess = 0;
310                         break;
311                 }
312         }
313         pip->re_len = re_len_guess;
314
315         /* Save off record count. */
316         pip->rec_cnt = NUM_ENT(h);
317
318 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
319                 ret = t_ret;
320         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
321 }
322
323 /*
324  * __bam_vrfy --
325  *      Verify a btree leaf or internal page.
326  *
327  * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
328  * PUBLIC:     u_int32_t));
329  */
330 int
331 __bam_vrfy(dbp, vdp, h, pgno, flags)
332         DB *dbp;
333         VRFY_DBINFO *vdp;
334         PAGE *h;
335         db_pgno_t pgno;
336         u_int32_t flags;
337 {
338         ENV *env;
339         VRFY_PAGEINFO *pip;
340         int ret, t_ret, isbad;
341
342         env = dbp->env;
343         isbad = 0;
344
345         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
346                 return (ret);
347
348         switch (TYPE(h)) {
349         case P_IBTREE:
350         case P_IRECNO:
351         case P_LBTREE:
352         case P_LDUP:
353                 break;
354         default:
355                 ret = __db_unknown_path(env, "__bam_vrfy");
356                 goto err;
357         }
358
359         /*
360          * Verify (and, if relevant, save off) page fields common to
361          * all PAGEs.
362          */
363         if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
364                 if (ret == DB_VERIFY_BAD)
365                         isbad = 1;
366                 else
367                         goto err;
368         }
369
370         /*
371          * The record count is, on internal pages, stored in an overloaded
372          * next_pgno field.  Save it off;  we'll verify it when we check
373          * overall database structure.  We could overload the field
374          * in VRFY_PAGEINFO, too, but this seems gross, and space
375          * is not at such a premium.
376          */
377         pip->rec_cnt = RE_NREC(h);
378
379         /*
380          * Verify inp[].
381          */
382         if (TYPE(h) == P_IRECNO) {
383                 if ((ret = __ram_vrfy_inp(dbp,
384                     vdp, h, pgno, &pip->entries, flags)) != 0)
385                         goto err;
386         } else if ((ret = __bam_vrfy_inp(dbp,
387             vdp, h, pgno, &pip->entries, flags)) != 0) {
388                 if (ret == DB_VERIFY_BAD)
389                         isbad = 1;
390                 else
391                         goto err;
392                 EPRINT((env, DB_STR_A("1045",
393                     "Page %lu: item order check unsafe: skipping",
394                     "%lu"), (u_long)pgno));
395         } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
396             __bam_vrfy_itemorder(dbp,
397             vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
398                 /*
399                  * We know that the elements of inp are reasonable.
400                  *
401                  * Check that elements fall in the proper order.
402                  */
403                 if (ret == DB_VERIFY_BAD)
404                         isbad = 1;
405                 else
406                         goto err;
407         }
408
409 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
410                 ret = t_ret;
411         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
412 }
413
414 /*
415  * __ram_vrfy_inp --
416  *      Verify that all entries in a P_IRECNO inp[] array are reasonable,
417  *      and count them.  Note that P_LRECNO uses __bam_vrfy_inp;
418  *      P_IRECNOs are a special, and simpler, case, since they have
419  *      RINTERNALs rather than BKEYDATA/BINTERNALs.
420  */
421 static int
422 __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
423         DB *dbp;
424         VRFY_DBINFO *vdp;
425         PAGE *h;
426         db_pgno_t pgno;
427         db_indx_t *nentriesp;
428         u_int32_t flags;
429 {
430         ENV *env;
431         RINTERNAL *ri;
432         VRFY_CHILDINFO child;
433         VRFY_PAGEINFO *pip;
434         int ret, t_ret, isbad;
435         u_int32_t himark, i, offset, nentries;
436         db_indx_t *inp;
437         u_int8_t *pagelayout, *p;
438
439         env = dbp->env;
440         isbad = 0;
441         memset(&child, 0, sizeof(VRFY_CHILDINFO));
442         nentries = 0;
443         pagelayout = NULL;
444
445         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
446                 return (ret);
447
448         if (TYPE(h) != P_IRECNO) {
449                 ret = __db_unknown_path(env, "__ram_vrfy_inp");
450                 goto err;
451         }
452
453         himark = dbp->pgsize;
454         if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
455                 goto err;
456         memset(pagelayout, 0, dbp->pgsize);
457         inp = P_INP(dbp, h);
458         for (i = 0; i < NUM_ENT(h); i++) {
459                 if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
460                         EPRINT((env, DB_STR_A("1046",
461                             "Page %lu: entries listing %lu overlaps data",
462                             "%lu %lu"), (u_long)pgno, (u_long)i));
463                         ret = DB_VERIFY_BAD;
464                         goto err;
465                 }
466                 offset = inp[i];
467                 /*
468                  * Check that the item offset is reasonable:  it points
469                  * somewhere after the inp array and before the end of the
470                  * page.
471                  */
472                 if (offset <= (u_int32_t)((u_int8_t *)inp + i -
473                     (u_int8_t *)h) ||
474                     offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
475                         isbad = 1;
476                         EPRINT((env, DB_STR_A("1047",
477                             "Page %lu: bad offset %lu at index %lu",
478                             "%lu %lu %lu"), (u_long)pgno, (u_long)offset,
479                             (u_long)i));
480                         continue;
481                 }
482
483                 /* Update the high-water mark (what HOFFSET should be) */
484                 if (offset < himark)
485                         himark = offset;
486
487                 nentries++;
488
489                 /* Make sure this RINTERNAL is not multiply referenced. */
490                 ri = GET_RINTERNAL(dbp, h, i);
491                 if (pagelayout[offset] == 0) {
492                         pagelayout[offset] = 1;
493                         child.pgno = ri->pgno;
494                         child.type = V_RECNO;
495                         child.nrecs = ri->nrecs;
496                         if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
497                                 goto err;
498                 } else {
499                         EPRINT((env, DB_STR_A("1048",
500                 "Page %lu: RINTERNAL structure at offset %lu referenced twice",
501                             "%lu %lu"), (u_long)pgno, (u_long)offset));
502                         isbad = 1;
503                 }
504         }
505
506         for (p = pagelayout + himark;
507             p < pagelayout + dbp->pgsize;
508             p += RINTERNAL_SIZE)
509                 if (*p != 1) {
510                         EPRINT((env, DB_STR_A("1049",
511                             "Page %lu: gap between items at offset %lu",
512                             "%lu %lu"), (u_long)pgno,
513                             (u_long)(p - pagelayout)));
514                         isbad = 1;
515                 }
516
517         if ((db_indx_t)himark != HOFFSET(h)) {
518                 EPRINT((env, DB_STR_A("1050",
519                     "Page %lu: bad HOFFSET %lu, appears to be %lu",
520                     "%lu %lu %lu"), (u_long)pgno, (u_long)(HOFFSET(h)),
521                     (u_long)himark));
522                 isbad = 1;
523         }
524
525         *nentriesp = nentries;
526
527 err:    if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
528                 ret = t_ret;
529         if (pagelayout != NULL)
530                 __os_free(env, pagelayout);
531         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
532 }
533
534 typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
535
536 /*
537  * __bam_vrfy_inp --
538  *      Verify that all entries in inp[] array are reasonable;
539  *      count them.
540  */
541 static int
542 __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
543         DB *dbp;
544         VRFY_DBINFO *vdp;
545         PAGE *h;
546         db_pgno_t pgno;
547         db_indx_t *nentriesp;
548         u_int32_t flags;
549 {
550         BKEYDATA *bk;
551         BOVERFLOW *bo;
552         ENV *env;
553         VRFY_CHILDINFO child;
554         VRFY_ITEM *pagelayout;
555         VRFY_PAGEINFO *pip;
556         u_int32_t himark, offset;               /*
557                                                  * These would be db_indx_ts
558                                                  * but for alignment.
559                                                  */
560         u_int32_t i, endoff, nentries;
561         int isbad, initem, isdupitem, ret, t_ret;
562
563         env = dbp->env;
564         isbad = isdupitem = 0;
565         nentries = 0;
566         memset(&child, 0, sizeof(VRFY_CHILDINFO));
567         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
568                 return (ret);
569
570         switch (TYPE(h)) {
571         case P_IBTREE:
572         case P_LBTREE:
573         case P_LDUP:
574         case P_LRECNO:
575                 break;
576         default:
577                 /*
578                  * In the salvager, we might call this from a page which
579                  * we merely suspect is a btree page.  Otherwise, it
580                  * shouldn't get called--if it is, that's a verifier bug.
581                  */
582                 if (LF_ISSET(DB_SALVAGE))
583                         break;
584                 ret = __db_unknown_path(env, "__bam_vrfy_inp");
585                 goto err;
586         }
587
588         /*
589          * Loop through inp[], the array of items, until we either
590          * run out of entries or collide with the data.  Keep track
591          * of h_offset in himark.
592          *
593          * For each element in inp[i], make sure it references a region
594          * that starts after the end of the inp array (as defined by
595          * NUM_ENT(h)), ends before the beginning of the page, doesn't
596          * overlap any other regions, and doesn't have a gap between
597          * it and the region immediately after it.
598          */
599         himark = dbp->pgsize;
600         if ((ret = __os_calloc(
601             env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
602                 goto err;
603         for (i = 0; i < NUM_ENT(h); i++) {
604                 switch (ret = __db_vrfy_inpitem(dbp,
605                     h, pgno, i, 1, flags, &himark, &offset)) {
606                 case 0:
607                         break;
608                 case DB_VERIFY_BAD:
609                         isbad = 1;
610                         continue;
611                 case DB_VERIFY_FATAL:
612                         isbad = 1;
613                         goto err;
614                 default:
615                         DB_ASSERT(env, ret != 0);
616                         break;
617                 }
618
619                 /*
620                  * We now have a plausible beginning for the item, and we know
621                  * its length is safe.
622                  *
623                  * Mark the beginning and end in pagelayout so we can make sure
624                  * items have no overlaps or gaps.
625                  */
626                 bk = GET_BKEYDATA(dbp, h, i);
627                 if (pagelayout[offset] == VRFY_ITEM_NOTSET)
628                         pagelayout[offset] = VRFY_ITEM_BEGIN;
629                 else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
630                         /*
631                          * Having two inp entries that point at the same patch
632                          * of page is legal if and only if the page is
633                          * a btree leaf and they're onpage duplicate keys--
634                          * that is, if (i % P_INDX) == 0.
635                          */
636                         if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
637                                 /* Flag for later. */
638                                 F_SET(pip, VRFY_HAS_DUPS);
639
640                                 /* Bump up nentries so we don't undercount. */
641                                 nentries++;
642
643                                 /*
644                                  * We'll check to make sure the end is
645                                  * equal, too.
646                                  */
647                                 isdupitem = 1;
648                         } else {
649                                 isbad = 1;
650                                 EPRINT((env, DB_STR_A("1051",
651                                     "Page %lu: duplicated item %lu",
652                                     "%lu %lu"), (u_long)pgno, (u_long)i));
653                         }
654                 }
655
656                 /*
657                  * Mark the end.  Its location varies with the page type
658                  * and the item type.
659                  *
660                  * If the end already has a sign other than 0, do nothing--
661                  * it's an overlap that we'll catch later.
662                  */
663                 switch (B_TYPE(bk->type)) {
664                 case B_KEYDATA:
665                         if (TYPE(h) == P_IBTREE)
666                                 /* It's a BINTERNAL. */
667                                 endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
668                         else
669                                 endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
670                         break;
671                 case B_DUPLICATE:
672                         /*
673                          * Flag that we have dups; we'll check whether
674                          * that's okay during the structure check.
675                          */
676                         F_SET(pip, VRFY_HAS_DUPS);
677                         /* FALLTHROUGH */
678                 case B_OVERFLOW:
679                         /*
680                          * Overflow entries on internal pages are stored
681                          * as the _data_ of a BINTERNAL;  overflow entries
682                          * on leaf pages are stored as the entire entry.
683                          */
684                         endoff = offset +
685                             ((TYPE(h) == P_IBTREE) ?
686                             BINTERNAL_SIZE(BOVERFLOW_SIZE) :
687                             BOVERFLOW_SIZE) - 1;
688                         break;
689                 default:
690                         /*
691                          * We'll complain later;  for now, just mark
692                          * a minimum.
693                          */
694                         endoff = offset + BKEYDATA_SIZE(0) - 1;
695                         break;
696                 }
697
698                 /*
699                  * If this is an onpage duplicate key we've seen before,
700                  * the end had better coincide too.
701                  */
702                 if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
703                         EPRINT((env, DB_STR_A("1052",
704                             "Page %lu: duplicated item %lu",
705                             "%lu %lu"), (u_long)pgno, (u_long)i));
706                         isbad = 1;
707                 } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
708                         pagelayout[endoff] = VRFY_ITEM_END;
709                 isdupitem = 0;
710
711                 /*
712                  * There should be no deleted items in a quiescent tree,
713                  * except in recno.
714                  */
715                 if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
716                         isbad = 1;
717                         EPRINT((env, DB_STR_A("1053",
718                             "Page %lu: item %lu marked deleted", "%lu %lu"),
719                             (u_long)pgno, (u_long)i));
720                 }
721
722                 /*
723                  * Check the type and such of bk--make sure it's reasonable
724                  * for the pagetype.
725                  */
726                 switch (B_TYPE(bk->type)) {
727                 case B_KEYDATA:
728                         /*
729                          * This is a normal, non-overflow BKEYDATA or BINTERNAL.
730                          * The only thing to check is the len, and that's
731                          * already been done.
732                          */
733                         break;
734                 case B_DUPLICATE:
735                         if (TYPE(h) == P_IBTREE) {
736                                 isbad = 1;
737                                 EPRINT((env, DB_STR_A("1054",
738     "Page %lu: duplicate page referenced by internal btree page at item %lu",
739                                     "%lu %lu"), (u_long)pgno, (u_long)i));
740                                 break;
741                         } else if (TYPE(h) == P_LRECNO) {
742                                 isbad = 1;
743                                 EPRINT((env, DB_STR_A("1055",
744         "Page %lu: duplicate page referenced by recno page at item %lu",
745                                     "%lu %lu"), (u_long)pgno, (u_long)i));
746                                 break;
747                         }
748                         /* FALLTHROUGH */
749                 case B_OVERFLOW:
750                         bo = (TYPE(h) == P_IBTREE) ?
751                             (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
752                             (BOVERFLOW *)bk;
753
754                         if (B_TYPE(bk->type) == B_OVERFLOW)
755                                 /* Make sure tlen is reasonable. */
756                                 if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
757                                         isbad = 1;
758                                         EPRINT((env, DB_STR_A("1056",
759                                 "Page %lu: impossible tlen %lu, item %lu",
760                                             "%lu %lu %lu"), (u_long)pgno,
761                                             (u_long)bo->tlen, (u_long)i));
762                                         /* Don't save as a child. */
763                                         break;
764                                 }
765
766                         if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
767                             bo->pgno == PGNO_INVALID) {
768                                 isbad = 1;
769                                 EPRINT((env, DB_STR_A("1057",
770                             "Page %lu: offpage item %lu has bad pgno %lu",
771                                     "%lu %lu %lu"), (u_long)pgno, (u_long)i,
772                                     (u_long)bo->pgno));
773                                 /* Don't save as a child. */
774                                 break;
775                         }
776
777                         child.pgno = bo->pgno;
778                         child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
779                             V_OVERFLOW : V_DUPLICATE);
780                         child.tlen = bo->tlen;
781                         if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
782                                 goto err;
783                         break;
784                 default:
785                         isbad = 1;
786                         EPRINT((env, DB_STR_A("1058",
787                             "Page %lu: item %lu of invalid type %lu",
788                             "%lu %lu %lu"), (u_long)pgno, (u_long)i,
789                             (u_long)B_TYPE(bk->type)));
790                         break;
791                 }
792         }
793
794         /*
795          * Now, loop through and make sure the items are contiguous and
796          * non-overlapping.
797          */
798         initem = 0;
799         for (i = himark; i < dbp->pgsize; i++)
800                 if (initem == 0)
801                         switch (pagelayout[i]) {
802                         case VRFY_ITEM_NOTSET:
803                                 /* May be just for alignment. */
804                                 if (i != DB_ALIGN(i, sizeof(u_int32_t)))
805                                         continue;
806
807                                 isbad = 1;
808                                 EPRINT((env, DB_STR_A("1059",
809                                     "Page %lu: gap between items at offset %lu",
810                                     "%lu %lu"), (u_long)pgno, (u_long)i));
811                                 /* Find the end of the gap */
812                                 for (; pagelayout[i + 1] == VRFY_ITEM_NOTSET &&
813                                     (size_t)(i + 1) < dbp->pgsize; i++)
814                                         ;
815                                 break;
816                         case VRFY_ITEM_BEGIN:
817                                 /* We've found an item. Check its alignment. */
818                                 if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
819                                         isbad = 1;
820                                         EPRINT((env, DB_STR_A("1060",
821                                             "Page %lu: offset %lu unaligned",
822                                             "%lu %lu"), (u_long)pgno,
823                                             (u_long)i));
824                                 }
825                                 initem = 1;
826                                 nentries++;
827                                 break;
828                         case VRFY_ITEM_END:
829                                 /*
830                                  * We've hit the end of an item even though
831                                  * we don't think we're in one;  must
832                                  * be an overlap.
833                                  */
834                                 isbad = 1;
835                                 EPRINT((env, DB_STR_A("1061",
836                                     "Page %lu: overlapping items at offset %lu",
837                                     "%lu %lu"), (u_long)pgno, (u_long)i));
838                                 break;
839                         }
840                 else
841                         switch (pagelayout[i]) {
842                         case VRFY_ITEM_NOTSET:
843                                 /* In the middle of an item somewhere. Okay. */
844                                 break;
845                         case VRFY_ITEM_END:
846                                 /* End of an item; switch to out-of-item mode.*/
847                                 initem = 0;
848                                 break;
849                         case VRFY_ITEM_BEGIN:
850                                 /*
851                                  * Hit a second item beginning without an
852                                  * end.  Overlap.
853                                  */
854                                 isbad = 1;
855                                 EPRINT((env, DB_STR_A("1062",
856                                     "Page %lu: overlapping items at offset %lu",
857                                     "%lu %lu"), (u_long)pgno, (u_long)i));
858                                 break;
859                         }
860
861         __os_free(env, pagelayout);
862
863         /* Verify HOFFSET. */
864         if ((db_indx_t)himark != HOFFSET(h)) {
865                 EPRINT((env, DB_STR_A("1063",
866                     "Page %lu: bad HOFFSET %lu, appears to be %lu",
867                     "%lu %lu %lu"), (u_long)pgno, (u_long)HOFFSET(h),
868                     (u_long)himark));
869                 isbad = 1;
870         }
871
872 err:    if (nentriesp != NULL)
873                 *nentriesp = nentries;
874
875         if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
876                 ret = t_ret;
877
878         return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
879 }
880
881 /*
882  * __bam_vrfy_itemorder --
883  *      Make sure the items on a page sort correctly.
884  *
885  *      Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
886  *      reasonable;  be sure that __bam_vrfy_inp has been called first.
887  *
888  *      If ovflok is set, it also assumes that overflow page chains
889  *      hanging off the current page have been sanity-checked, and so we
890  *      can use __bam_cmp to verify their ordering.  If it is not set,
891  *      and we run into an overflow page, carp and return DB_VERIFY_BAD;
892  *      we shouldn't be called if any exist.
893  *
894  * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
895  * PUBLIC:      PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
896  */
897 int
898 __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
899         DB *dbp;
900         VRFY_DBINFO *vdp;
901         DB_THREAD_INFO *ip;
902         PAGE *h;
903         db_pgno_t pgno;
904         u_int32_t nentries;
905         int ovflok, hasdups;
906         u_int32_t flags;
907 {
908         BINTERNAL *bi;
909         BKEYDATA *bk;
910         BOVERFLOW *bo;
911         BTREE *bt;
912         DB_MPOOLFILE *mpf;
913         DBC *dbc;
914         DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
915         ENV *env;
916         PAGE *child;
917         db_pgno_t cpgno;
918         VRFY_PAGEINFO *pip;
919         db_indx_t i, *inp;
920         int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
921         int (*dupfunc) __P((DB *, const DBT *, const DBT *));
922         int (*func) __P((DB *, const DBT *, const DBT *));
923         void *buf1, *buf2, *tmpbuf;
924
925         /*
926          * We need to work in the ORDERCHKONLY environment where we might
927          * not have a pip, but we also may need to work in contexts where
928          * NUM_ENT isn't safe.
929          */
930         if (vdp != NULL) {
931                 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
932                         return (ret);
933                 nentries = pip->entries;
934         } else
935                 pip = NULL;
936
937         env = dbp->env;
938         ret = isbad = 0;
939         bo = NULL;                      /* Shut up compiler. */
940
941         memset(&dbta, 0, sizeof(DBT));
942         F_SET(&dbta, DB_DBT_REALLOC);
943
944         memset(&dbtb, 0, sizeof(DBT));
945         F_SET(&dbtb, DB_DBT_REALLOC);
946
947         buf1 = buf2 = NULL;
948
949         DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
950
951         dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
952         if (TYPE(h) == P_LDUP)
953                 func = dupfunc;
954         else {
955                 func = __bam_defcmp;
956                 if (dbp->bt_internal != NULL) {
957                         bt = (BTREE *)dbp->bt_internal;
958                         if (TYPE(h) == P_IBTREE && (bt->bt_compare != NULL ||
959                             dupfunc != __bam_defcmp)) {
960                                 /*
961                                  * The problem here is that we cannot
962                                  * tell the difference between an off
963                                  * page duplicate internal page and
964                                  * a main database internal page.
965                                  * Walk down the tree to figure it out.
966                                  */
967                                 mpf = dbp->mpf;
968                                 child = h;
969                                 while (TYPE(child) == P_IBTREE) {
970                                         bi = GET_BINTERNAL(dbp, child, 0);
971                                         cpgno = bi->pgno;
972                                         if (child != h &&
973                                             (ret = __memp_fput(mpf,
974                                             vdp->thread_info, child,
975                                             DB_PRIORITY_UNCHANGED)) != 0)
976                                                 goto err;
977                                         if ((ret = __memp_fget(mpf,
978                                             &cpgno, vdp->thread_info,
979                                             NULL, 0, &child)) != 0)
980                                                 goto err;
981                                 }
982                                 if (TYPE(child) == P_LDUP)
983                                         func = dupfunc;
984                                 else if (bt->bt_compare != NULL)
985                                         func = bt->bt_compare;
986                                 if ((ret = __memp_fput(mpf, vdp->thread_info,
987                                     child, DB_PRIORITY_UNCHANGED)) != 0)
988                                         goto err;
989                         } else if (bt->bt_compare != NULL)
990                                 func = bt->bt_compare;
991                 }
992         }
993
994         /*
995          * We alternate our use of dbta and dbtb so that we can walk
996          * through the page key-by-key without copying a dbt twice.
997          * p1 is always the dbt for index i - 1, and p2 for index i.
998          * Reset the data pointers in case we are retrying.
999          */
1000 retry:  p1 = &dbta;
1001         p1->data = NULL;
1002         p2 = &dbtb;
1003         p2->data = NULL;
1004
1005         /*
1006          * Loop through the entries.  nentries ought to contain the
1007          * actual count, and so is a safe way to terminate the loop;  whether
1008          * we inc. by one or two depends on whether we're a leaf page--
1009          * on a leaf page, we care only about keys.  On internal pages
1010          * and LDUP pages, we want to check the order of all entries.
1011          *
1012          * Note that on IBTREE pages or the index page of a partitioned
1013          * database, we start with item 1, since item 0 doesn't get looked
1014          * at by __bam_cmp.
1015          */
1016         inp = P_INP(dbp, h);
1017         adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
1018         for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0;
1019             i < nentries; i += adj) {
1020                 /*
1021                  * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
1022                  */
1023                 tmp = p1;
1024                 p1 = p2;
1025                 p2 = tmp;
1026                 tmpbuf = buf1;
1027                 buf1 = buf2;
1028                 buf2 = tmpbuf;
1029
1030                 /*
1031                  * Get key i into p2.
1032                  */
1033                 switch (TYPE(h)) {
1034                 case P_IBTREE:
1035                         bi = GET_BINTERNAL(dbp, h, i);
1036                         if (B_TYPE(bi->type) == B_OVERFLOW) {
1037                                 bo = (BOVERFLOW *)(bi->data);
1038                                 goto overflow;
1039                         } else {
1040                                 p2->data = bi->data;
1041                                 p2->size = bi->len;
1042                         }
1043
1044                         /*
1045                          * The leftmost key on an internal page must be
1046                          * len 0, since it's just a placeholder and
1047                          * automatically sorts less than all keys.
1048                          *
1049                          * XXX
1050                          * This criterion does not currently hold!
1051                          * See todo list item #1686.  Meanwhile, it's harmless
1052                          * to just not check for it.
1053                          */
1054 #if 0
1055                         if (i == 0 && bi->len != 0) {
1056                                 isbad = 1;
1057                                 EPRINT((env, DB_STR_A("1064",
1058                 "Page %lu: lowest key on internal page of nonzero length",
1059                                     "%lu"), (u_long)pgno));
1060                         }
1061 #endif
1062                         break;
1063                 case P_LBTREE:
1064                 case P_LDUP:
1065                         bk = GET_BKEYDATA(dbp, h, i);
1066                         if (B_TYPE(bk->type) == B_OVERFLOW) {
1067                                 bo = (BOVERFLOW *)bk;
1068                                 goto overflow;
1069                         } else {
1070                                 p2->data = bk->data;
1071                                 p2->size = bk->len;
1072                         }
1073                         break;
1074                 default:
1075                         /*
1076                          * This means our caller screwed up and sent us
1077                          * an inappropriate page.
1078                          */
1079                         ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
1080                         goto err;
1081                 }
1082
1083                 if (0) {
1084                         /*
1085                          * If ovflok != 1, we can't safely go chasing
1086                          * overflow pages with the normal routines now;
1087                          * they might be unsafe or nonexistent.  Mark this
1088                          * page as incomplete and return.
1089                          *
1090                          * Note that we don't need to worry about freeing
1091                          * buffers, since they can't have been allocated
1092                          * if overflow items are unsafe.
1093                          */
1094 overflow:               if (!ovflok) {
1095                                 if (pip != NULL)
1096                                         F_SET(pip, VRFY_INCOMPLETE);
1097                                 goto err;
1098                         }
1099
1100                         /*
1101                          * Overflow items are safe to chase.  Do so.
1102                          * Fetch the overflow item into p2->data,
1103                          * NULLing it or reallocing it as appropriate.
1104                          *
1105                          * (We set p2->data to buf2 before the call
1106                          * so we're sure to realloc if we can and if p2
1107                          * was just pointing at a non-overflow item.)
1108                          */
1109                         p2->data = buf2;
1110                         if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
1111                             PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
1112                                 goto err;
1113                         if ((ret = __db_goff(dbc,
1114                             p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
1115                                 isbad = 1;
1116                                 EPRINT((env, DB_STR_A("1065",
1117                             "Page %lu: error %lu in fetching overflow item %lu",
1118                                     "%lu %lu %lu"), (u_long)pgno, (u_long)ret,
1119                                     (u_long)i));
1120                         }
1121                         /* In case it got realloc'ed and thus changed. */
1122                         buf2 = p2->data;
1123                 }
1124
1125                 /* Compare with the last key. */
1126                 if (p1->data != NULL && p2->data != NULL) {
1127                         cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
1128
1129                         /* comparison succeeded */
1130                         if (cmp > 0) {
1131                                 /*
1132                                  * If we are looking at an internal page, we
1133                                  * don't know whether it is part of the main
1134                                  * database or in an off-page-duplicate tree.
1135                                  * If the main comparator fails, retry with
1136                                  * the duplicate comparator.
1137                                  */
1138                                 if (TYPE(h) == P_IBTREE && func != dupfunc) {
1139                                         func = dupfunc;
1140                                         goto retry;
1141                                 }
1142
1143                                 isbad = 1;
1144                                 EPRINT((env, DB_STR_A("1066",
1145                                     "Page %lu: out-of-order key at entry %lu",
1146                                     "%lu %lu"), (u_long)pgno, (u_long)i));
1147                                 /* proceed */
1148                         } else if (cmp == 0) {
1149                                 if (inp[i] != inp[i - adj]) {
1150                                         /* See above. */
1151                                         if (TYPE(h) == P_IBTREE &&
1152                                             func != dupfunc) {
1153                                                 func = dupfunc;
1154                                                 goto retry;
1155                                         }
1156                                         isbad = 1;
1157                                         EPRINT((env, DB_STR_A("1067",
1158                                      "Page %lu: non-dup dup key at entry %lu",
1159                                            "%lu %lu"), (u_long)pgno,
1160                                             (u_long)i));
1161                                 }
1162                                 /*
1163                                  * If they compared equally, this
1164                                  * had better be a (sub)database with dups.
1165                                  * Mark it so we can check during the
1166                                  * structure check.
1167                                  */
1168                                 if (pip != NULL)
1169                                         F_SET(pip, VRFY_HAS_DUPS);
1170                                 else if (hasdups == 0) {
1171                                         /* See above. */
1172                                         if (TYPE(h) == P_IBTREE &&
1173                                             func != dupfunc) {
1174                                                 func = dupfunc;
1175                                                 goto retry;
1176                                         }
1177                                         isbad = 1;
1178                                         EPRINT((env, DB_STR_A("1068",
1179         "Page %lu: database with no duplicates has duplicated keys",
1180                                             "%lu"), (u_long)pgno));
1181                                 }
1182
1183                                 /*
1184                                  * If we're a btree leaf, check to see
1185                                  * if the data items of these on-page dups are
1186                                  * in sorted order.  If not, flag this, so
1187                                  * that we can make sure during the
1188                                  * structure checks that the DUPSORT flag
1189                                  * is unset.
1190                                  *
1191                                  * At this point i points to a duplicate key.
1192                                  * Compare the datum before it (same key)
1193                                  * to the datum after it, i.e. i-1 to i+1.
1194                                  */
1195                                 if (TYPE(h) == P_LBTREE) {
1196                                         /*
1197                                          * Unsafe;  continue and we'll pick
1198                                          * up the bogus nentries later.
1199                                          */
1200                                         if (i + 1 >= (db_indx_t)nentries)
1201                                                 continue;
1202
1203                                         /*
1204                                          * We don't bother with clever memory
1205                                          * management with on-page dups,
1206                                          * as it's only really a big win
1207                                          * in the overflow case, and overflow
1208                                          * dups are probably (?) rare.
1209                                          */
1210                                         if (((ret = __bam_safe_getdata(dbp,
1211                                             ip, h, i - 1, ovflok,
1212                                             &dup_1, &freedup_1)) != 0) ||
1213                                             ((ret = __bam_safe_getdata(dbp,
1214                                             ip, h, i + 1, ovflok,
1215                                             &dup_2, &freedup_2)) != 0))
1216                                                 goto err;
1217
1218                                         /*
1219                                          * If either of the data are NULL,
1220                                          * it's because they're overflows and
1221                                          * it's not safe to chase them now.
1222                                          * Mark an incomplete and return.
1223                                          */
1224                                         if (dup_1.data == NULL ||
1225                                             dup_2.data == NULL) {
1226                                                 DB_ASSERT(env, !ovflok);
1227                                                 if (pip != NULL)
1228                                                         F_SET(pip,
1229                                                             VRFY_INCOMPLETE);
1230                                                 goto err;
1231                                         }
1232
1233                                         /*
1234                                          * If the dups are out of order,
1235                                          * flag this.  It's not an error
1236                                          * until we do the structure check
1237                                          * and see whether DUPSORT is set.
1238                                          */
1239                                         if (dupfunc(dbp, &dup_1, &dup_2) > 0 &&
1240                                             pip != NULL)
1241                                                 F_SET(pip, VRFY_DUPS_UNSORTED);
1242
1243                                         if (freedup_1)
1244                                                 __os_ufree(env, dup_1.data);
1245                                         if (freedup_2)
1246                                                 __os_ufree(env, dup_2.data);
1247                                 }
1248                         }
1249                 }
1250         }
1251
1252 err:    if (pip != NULL && ((t_ret =
1253             __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
1254                 ret = t_ret;
1255
1256         if (buf1 != NULL)
1257                 __os_ufree(env, buf1);
1258         if (buf2 != NULL)
1259                 __os_ufree(env, buf2);
1260
1261         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
1262 }
1263
1264 /*
1265  * __bam_vrfy_structure --
1266  *      Verify the tree structure of a btree database (including the master
1267  *      database containing subdbs).
1268  *
1269  * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
1270  * PUBLIC:     void *, void *, u_int32_t));
1271  */
1272 int
1273 __bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags)
1274         DB *dbp;
1275         VRFY_DBINFO *vdp;
1276         db_pgno_t meta_pgno;
1277         void *lp, *rp;
1278         u_int32_t flags;
1279 {
1280         DB *pgset;
1281         ENV *env;
1282         VRFY_PAGEINFO *mip, *rip;
1283         db_pgno_t root, p;
1284         int t_ret, ret;
1285         u_int32_t nrecs, level, relen, stflags;
1286
1287         env = dbp->env;
1288         mip = rip = 0;
1289         pgset = vdp->pgset;
1290
1291         if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
1292                 return (ret);
1293
1294         if ((ret = __db_vrfy_pgset_get(pgset,
1295             vdp->thread_info, vdp->txn, meta_pgno, (int *)&p)) != 0)
1296                 goto err;
1297         if (p != 0) {
1298                 EPRINT((env, DB_STR_A("1069",
1299                     "Page %lu: btree metadata page observed twice",
1300                     "%lu"), (u_long)meta_pgno));
1301                 ret = DB_VERIFY_BAD;
1302                 goto err;
1303         }
1304         if ((ret = __db_vrfy_pgset_inc(
1305             pgset, vdp->thread_info, vdp->txn, meta_pgno)) != 0)
1306                 goto err;
1307
1308         root = mip->root;
1309
1310         if (root == 0) {
1311                 EPRINT((env, DB_STR_A("1070",
1312                     "Page %lu: btree metadata page has no root",
1313                     "%lu"), (u_long)meta_pgno));
1314                 ret = DB_VERIFY_BAD;
1315                 goto err;
1316         }
1317
1318         if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
1319                 goto err;
1320
1321         switch (rip->type) {
1322         case P_IBTREE:
1323         case P_LBTREE:
1324                 stflags = flags | DB_ST_TOPLEVEL;
1325                 if (F_ISSET(mip, VRFY_HAS_DUPS))
1326                         stflags |= DB_ST_DUPOK;
1327                 if (F_ISSET(mip, VRFY_HAS_DUPSORT))
1328                         stflags |= DB_ST_DUPSORT;
1329                 if (F_ISSET(mip, VRFY_HAS_RECNUMS))
1330                         stflags |= DB_ST_RECNUM;
1331                 ret = __bam_vrfy_subtree(dbp,
1332                     vdp, root, lp, rp, stflags, NULL, NULL, NULL);
1333                 break;
1334         case P_IRECNO:
1335         case P_LRECNO:
1336                 stflags =
1337                     flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
1338                 if (mip->re_len > 0)
1339                         stflags |= DB_ST_RELEN;
1340                 if ((ret = __bam_vrfy_subtree(dbp, vdp,
1341                     root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
1342                         goto err;
1343                 /*
1344                  * Even if mip->re_len > 0, re_len may come back zero if the
1345                  * tree is empty.  It should be okay to just skip the check in
1346                  * this case, as if there are any non-deleted keys at all,
1347                  * that should never happen.
1348                  */
1349                 if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
1350                         EPRINT((env, DB_STR_A("1071",
1351                             "Page %lu: recno database has bad re_len %lu",
1352                             "%lu %lu"), (u_long)meta_pgno, (u_long)relen));
1353                         ret = DB_VERIFY_BAD;
1354                         goto err;
1355                 }
1356                 ret = 0;
1357                 break;
1358         case P_LDUP:
1359                 EPRINT((env, DB_STR_A("1072",
1360                     "Page %lu: duplicate tree referenced from metadata page",
1361                     "%lu"), (u_long)meta_pgno));
1362                 ret = DB_VERIFY_BAD;
1363                 break;
1364         default:
1365                 EPRINT((env, DB_STR_A("1073",
1366             "Page %lu: btree root of incorrect type %lu on metadata page",
1367                     "%lu %lu"), (u_long)meta_pgno, (u_long)rip->type));
1368                 ret = DB_VERIFY_BAD;
1369                 break;
1370         }
1371
1372 err:    if (mip != NULL && ((t_ret =
1373             __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
1374                 ret = t_ret;
1375         if (rip != NULL && ((t_ret =
1376             __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
1377                 ret = t_ret;
1378         return (ret);
1379 }
1380
1381 /*
1382  * __bam_vrfy_subtree--
1383  *      Verify a subtree (or entire) btree with specified root.
1384  *
1385  *      Note that this is public because it must be called to verify
1386  *      offpage dup trees, including from hash.
1387  *
1388  * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
1389  * PUBLIC:     void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
1390  */
1391 int
1392 __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
1393         DB *dbp;
1394         VRFY_DBINFO *vdp;
1395         db_pgno_t pgno;
1396         void *l, *r;
1397         u_int32_t flags, *levelp, *nrecsp, *relenp;
1398 {
1399         BINTERNAL *li, *ri;
1400         DB *pgset;
1401         DBC *cc;
1402         DB_MPOOLFILE *mpf;
1403         ENV *env;
1404         PAGE *h;
1405         VRFY_CHILDINFO *child;
1406         VRFY_PAGEINFO *pip;
1407         db_indx_t i;
1408         db_pgno_t next_pgno, prev_pgno;
1409         db_recno_t child_nrecs, nrecs;
1410         u_int32_t child_level, child_relen, j, level, relen, stflags;
1411         u_int8_t leaf_type;
1412         int (*func) __P((DB *, const DBT *, const DBT *));
1413         int isbad, p, ret, t_ret, toplevel;
1414
1415         if (levelp != NULL)     /* Don't leave uninitialized on error. */
1416                 *levelp = 0;
1417         if (nrecsp != NULL)
1418                 *nrecsp = 0;
1419
1420         env = dbp->env;
1421         mpf = dbp->mpf;
1422         h = NULL;
1423         next_pgno = prev_pgno = PGNO_INVALID;
1424         nrecs = 0;
1425         relen = 0;
1426         leaf_type = P_INVALID;
1427         isbad = ret = 0;
1428
1429         /* Provide feedback on our progress to the application. */
1430         if (!LF_ISSET(DB_SALVAGE))
1431                 __db_vrfy_struct_feedback(dbp, vdp);
1432
1433         if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
1434                 return (ret);
1435
1436         cc = NULL;
1437         level = pip->bt_level;
1438
1439         toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
1440         LF_CLR(DB_ST_TOPLEVEL);
1441
1442         /*
1443          * If this is the root, initialize the vdp's prev- and next-pgno
1444          * accounting.
1445          *
1446          * For each leaf page we hit, we'll want to make sure that
1447          * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
1448          * our page number.  Then, we'll set vdp->next_pgno to pip->next_pgno
1449          * and vdp->prev_pgno to our page number, and the next leaf page in
1450          * line should be able to do the same verification.
1451          */
1452         if (toplevel) {
1453                 /*
1454                  * Cache the values stored in the vdp so that if we're an
1455                  * auxiliary tree such as an off-page duplicate set, our
1456                  * caller's leaf page chain doesn't get lost.
1457                  */
1458                 prev_pgno = vdp->prev_pgno;
1459                 next_pgno = vdp->next_pgno;
1460                 leaf_type = vdp->leaf_type;
1461                 vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
1462                 vdp->leaf_type = P_INVALID;
1463         }
1464
1465         /*
1466          * We are recursively descending a btree, starting from the root
1467          * and working our way out to the leaves.
1468          *
1469          * There are four cases we need to deal with:
1470          *      1. pgno is a recno leaf page.  Any children are overflows.
1471          *      2. pgno is a duplicate leaf page.  Any children
1472          *         are overflow pages;  traverse them, and then return
1473          *         level and nrecs.
1474          *      3. pgno is an ordinary leaf page.  Check whether dups are
1475          *         allowed, and if so, traverse any off-page dups or
1476          *         overflows.  Then return nrecs and level.
1477          *      4. pgno is a recno internal page.  Recursively check any
1478          *         child pages, making sure their levels are one lower
1479          *         and their nrecs sum to ours.
1480          *      5. pgno is a btree internal page.  Same as #4, plus we
1481          *         must verify that for each pair of BINTERNAL entries
1482          *         N and N+1, the leftmost item on N's child sorts
1483          *         greater than N, and the rightmost item on N's child
1484          *         sorts less than N+1.
1485          *
1486          * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
1487          * we need to verify the internal sort order is correct if,
1488          * due to overflow items, we were not able to do so earlier.
1489          */
1490         switch (pip->type) {
1491         case P_LRECNO:
1492         case P_LDUP:
1493         case P_LBTREE:
1494                 /*
1495                  * Cases 1, 2 and 3.
1496                  *
1497                  * We're some sort of leaf page;  verify
1498                  * that our linked list of leaves is consistent.
1499                  */
1500                 if (vdp->leaf_type == P_INVALID) {
1501                         /*
1502                          * First leaf page.  Set the type that all its
1503                          * successors should be, and verify that our prev_pgno
1504                          * is PGNO_INVALID.
1505                          */
1506                         vdp->leaf_type = pip->type;
1507                         if (pip->prev_pgno != PGNO_INVALID)
1508                                 goto bad_prev;
1509                 } else {
1510                         /*
1511                          * Successor leaf page. Check our type, the previous
1512                          * page's next_pgno, and our prev_pgno.
1513                          */
1514                         if (pip->type != vdp->leaf_type) {
1515                                 isbad = 1;
1516                                 EPRINT((env, DB_STR_A("1074",
1517         "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
1518                                     "%lu %lu %lu"), (u_long)pip->pgno,
1519                                     (u_long)pip->type,
1520                                     (u_long)vdp->leaf_type));
1521                         }
1522
1523                         /*
1524                          * Don't do the prev/next_pgno checks if we've lost
1525                          * leaf pages due to another corruption.
1526                          */
1527                         if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
1528                                 if (pip->pgno != vdp->next_pgno) {
1529                                         isbad = 1;
1530                                         EPRINT((env, DB_STR_A("1075",
1531         "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
1532                                             "%lu %lu %lu"),
1533                                             (u_long)vdp->prev_pgno,
1534                                             (u_long)vdp->next_pgno,
1535                                             (u_long)pip->pgno));
1536                                 }
1537                                 if (pip->prev_pgno != vdp->prev_pgno) {
1538 bad_prev:                               isbad = 1;
1539                                         EPRINT((env, DB_STR_A("1076",
1540     "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
1541                                             "%lu %lu %lu"),
1542                                             (u_long)pip->pgno,
1543                                             (u_long)pip->prev_pgno,
1544                                             (u_long)vdp->prev_pgno));
1545                                 }
1546                         }
1547                 }
1548                 vdp->prev_pgno = pip->pgno;
1549                 vdp->next_pgno = pip->next_pgno;
1550                 F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
1551
1552                 /*
1553                  * Overflow pages are common to all three leaf types;
1554                  * traverse the child list, looking for overflows.
1555                  */
1556                 if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1557                         goto err;
1558                 for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1559                     ret = __db_vrfy_ccnext(cc, &child))
1560                         if (child->type == V_OVERFLOW &&
1561                             (ret = __db_vrfy_ovfl_structure(dbp, vdp,
1562                             child->pgno, child->tlen,
1563                             flags | DB_ST_OVFL_LEAF)) != 0) {
1564                                 if (ret == DB_VERIFY_BAD)
1565                                         isbad = 1;
1566                                 else
1567                                         goto done;
1568                         }
1569
1570                 if ((ret = __db_vrfy_ccclose(cc)) != 0)
1571                         goto err;
1572                 cc = NULL;
1573
1574                 /* Case 1 */
1575                 if (pip->type == P_LRECNO) {
1576                         if (!LF_ISSET(DB_ST_IS_RECNO) &&
1577                             !(LF_ISSET(DB_ST_DUPOK) &&
1578                             !LF_ISSET(DB_ST_DUPSORT))) {
1579                                 isbad = 1;
1580                                 EPRINT((env, DB_STR_A("1077",
1581                                     "Page %lu: recno leaf page non-recno tree",
1582                                     "%lu"), (u_long)pgno));
1583                                 goto done;
1584                         }
1585                         goto leaf;
1586                 } else if (LF_ISSET(DB_ST_IS_RECNO)) {
1587                         /*
1588                          * It's a non-recno leaf.  Had better not be a recno
1589                          * subtree.
1590                          */
1591                         isbad = 1;
1592                         EPRINT((env, DB_STR_A("1078",
1593                             "Page %lu: non-recno leaf page in recno tree",
1594                             "%lu"), (u_long)pgno));
1595                         goto done;
1596                 }
1597
1598                 /* Case 2--no more work. */
1599                 if (pip->type == P_LDUP)
1600                         goto leaf;
1601
1602                 /* Case 3 */
1603
1604                 /* Check if we have any dups. */
1605                 if (F_ISSET(pip, VRFY_HAS_DUPS)) {
1606                         /* If dups aren't allowed in this btree, trouble. */
1607                         if (!LF_ISSET(DB_ST_DUPOK)) {
1608                                 isbad = 1;
1609                                 EPRINT((env, DB_STR_A("1079",
1610                                     "Page %lu: duplicates in non-dup btree",
1611                                     "%lu"), (u_long)pgno));
1612                         } else {
1613                                 /*
1614                                  * We correctly have dups.  If any are off-page,
1615                                  * traverse those btrees recursively.
1616                                  */
1617                                 if ((ret =
1618                                     __db_vrfy_childcursor(vdp, &cc)) != 0)
1619                                         goto err;
1620                                 for (ret = __db_vrfy_ccset(cc, pgno, &child);
1621                                     ret == 0;
1622                                     ret = __db_vrfy_ccnext(cc, &child)) {
1623                                         stflags =
1624                                             flags | DB_ST_RECNUM | DB_ST_DUPSET;
1625                                         /* Skip any overflow entries. */
1626                                         if (child->type == V_DUPLICATE) {
1627                                                 if ((ret = __db_vrfy_duptype(
1628                                                     dbp, vdp, child->pgno,
1629                                                     stflags)) != 0) {
1630                                                         isbad = 1;
1631                                                         /* Next child. */
1632                                                         continue;
1633                                                 }
1634                                                 if ((ret = __bam_vrfy_subtree(
1635                                                     dbp, vdp, child->pgno,
1636                                                     NULL, NULL,
1637                                                     stflags | DB_ST_TOPLEVEL,
1638                                                     NULL, NULL, NULL)) != 0) {
1639                                                         if (ret ==
1640                                                             DB_VERIFY_BAD)
1641                                                                 isbad = 1;
1642                                                         else
1643                                                                 goto err;
1644                                                 }
1645                                         }
1646                                 }
1647
1648                                 if ((ret = __db_vrfy_ccclose(cc)) != 0)
1649                                         goto err;
1650                                 cc = NULL;
1651
1652                                 /*
1653                                  * If VRFY_DUPS_UNSORTED is set,
1654                                  * DB_ST_DUPSORT had better not be.
1655                                  */
1656                                 if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
1657                                     LF_ISSET(DB_ST_DUPSORT)) {
1658                                         isbad = 1;
1659                                         EPRINT((env, DB_STR_A("1080",
1660                     "Page %lu: unsorted duplicate set in sorted-dup database",
1661                                             "%lu"), (u_long)pgno));
1662                                 }
1663                         }
1664                 }
1665                 goto leaf;
1666         case P_IBTREE:
1667         case P_IRECNO:
1668                 /* We handle these below. */
1669                 break;
1670         default:
1671                 /*
1672                  * If a P_IBTREE or P_IRECNO contains a reference to an
1673                  * invalid page, we'll wind up here;  handle it gracefully.
1674                  * Note that the code at the "done" label assumes that the
1675                  * current page is a btree/recno one of some sort;  this
1676                  * is not the case here, so we goto err.
1677                  *
1678                  * If the page is entirely zeroed, its pip->type will be a lie
1679                  * (we assumed it was a hash page, as they're allowed to be
1680                  * zeroed);  handle this case specially.
1681                  */
1682                 if (F_ISSET(pip, VRFY_IS_ALLZEROES))
1683                         ZEROPG_ERR_PRINT(env, pgno, DB_STR_P(
1684                             "btree or recno page"));
1685                 else
1686                         EPRINT((env, DB_STR_A("1081",
1687             "Page %lu: btree or recno page is of inappropriate type %lu",
1688                             "%lu %lu"), (u_long)pgno, (u_long)pip->type));
1689
1690                 /*
1691                  * We probably lost a leaf page (or more if this was an
1692                  * internal page) from our prev/next_pgno chain.  Flag
1693                  * that this is expected;  we don't want or need to
1694                  * spew error messages about erroneous prev/next_pgnos,
1695                  * since that's probably not the real problem.
1696                  */
1697                 F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
1698
1699                 ret = DB_VERIFY_BAD;
1700                 goto err;
1701         }
1702
1703         /*
1704          * Cases 4 & 5: This is a btree or recno internal page.  For each child,
1705          * recurse, keeping a running count of nrecs and making sure the level
1706          * is always reasonable.
1707          */
1708         if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1709                 goto err;
1710         for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1711             ret = __db_vrfy_ccnext(cc, &child))
1712                 if (child->type == V_RECNO) {
1713                         if (pip->type != P_IRECNO) {
1714                                 ret = __db_unknown_path(
1715                                     env, "__bam_vrfy_subtree");
1716                                 goto err;
1717                         }
1718                         if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
1719                             NULL, NULL, flags, &child_level, &child_nrecs,
1720                             &child_relen)) != 0) {
1721                                 if (ret == DB_VERIFY_BAD)
1722                                         isbad = 1;
1723                                 else
1724                                         goto done;
1725                         }
1726
1727                         if (LF_ISSET(DB_ST_RELEN)) {
1728                                 if (relen == 0)
1729                                         relen = child_relen;
1730                                 /*
1731                                  * child_relen may be zero if the child subtree
1732                                  * is empty.
1733                                  */
1734                                 else if (child_relen > 0 &&
1735                                     relen != child_relen) {
1736                                         isbad = 1;
1737                                         EPRINT((env, DB_STR_A("1082",
1738                            "Page %lu: recno page returned bad re_len %lu",
1739                                             "%lu %lu"), (u_long)child->pgno,
1740                                             (u_long)child_relen));
1741                                 }
1742                                 if (relenp)
1743                                         *relenp = relen;
1744                         }
1745                         if (LF_ISSET(DB_ST_RECNUM)) {
1746                                 if (child->nrecs != child_nrecs) {
1747                                         isbad = 1;
1748                                         EPRINT((env, DB_STR_A("1083",
1749                 "Page %lu: record count incorrect: actual %lu, in record %lu",
1750                                             "%lu %lu %lu"),
1751                                             (u_long)child->pgno,
1752                                             (u_long)child_nrecs,
1753                                             (u_long)child->nrecs));
1754                                 }
1755                                 nrecs += child_nrecs;
1756                         }
1757                         if (isbad == 0 && level != child_level + 1) {
1758                                 isbad = 1;
1759                                 EPRINT((env, DB_STR_A("1084",
1760                 "Page %lu: recno level incorrect: got %lu, expected %lu",
1761                                     "%lu %lu %lu"),
1762                                     (u_long)child->pgno, (u_long)child_level,
1763                                     (u_long)(level - 1)));
1764                         }
1765                 } else if (child->type == V_OVERFLOW) {
1766                         /*
1767                          * It is possible for one internal page to reference
1768                          * a single overflow page twice, if all the items
1769                          * in the subtree referenced by slot 0 are deleted,
1770                          * then a similar number of items are put back
1771                          * before the key that formerly had been in slot 1.
1772                          *
1773                          * (Btree doesn't look at the key in slot 0, so the
1774                          * fact that the key formerly at slot 1 is the "wrong"
1775                          * parent of the stuff in the slot 0 subtree isn't
1776                          * really incorrect.)
1777                          *
1778                          * __db_vrfy_ovfl_structure is designed to be
1779                          * efficiently called multiple times for multiple
1780                          * references;  call it here as many times as is
1781                          * appropriate.
1782                          */
1783
1784                         /* Otherwise, __db_vrfy_childput would be broken. */
1785                         DB_ASSERT(env, child->refcnt >= 1);
1786
1787                         /*
1788                          * An overflow referenced more than twice here
1789                          * shouldn't happen.
1790                          */
1791                         if (child->refcnt > 2) {
1792                                 isbad = 1;
1793                                 EPRINT((env, DB_STR_A("1085",
1794     "Page %lu: overflow page %lu referenced more than twice from internal page",
1795                                     "%lu %lu"), (u_long)pgno,
1796                                     (u_long)child->pgno));
1797                         } else
1798                                 for (j = 0; j < child->refcnt; j++)
1799                                         if ((ret = __db_vrfy_ovfl_structure(dbp,
1800                                             vdp, child->pgno, child->tlen,
1801                                             flags)) != 0) {
1802                                                 if (ret == DB_VERIFY_BAD)
1803                                                         isbad = 1;
1804                                                 else
1805                                                         goto done;
1806                                         }
1807                 }
1808
1809         if ((ret = __db_vrfy_ccclose(cc)) != 0)
1810                 goto err;
1811         cc = NULL;
1812
1813         /* We're done with case 4. */
1814         if (pip->type == P_IRECNO)
1815                 goto done;
1816
1817         /*
1818          * Case 5.  Btree internal pages.
1819          * As described above, we need to iterate through all the
1820          * items on the page and make sure that our children sort appropriately
1821          * with respect to them.
1822          *
1823          * For each entry, li will be the "left-hand" key for the entry
1824          * itself, which must sort lower than all entries on its child;
1825          * ri will be the key to its right, which must sort greater.
1826          */
1827         if (h == NULL &&
1828             (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
1829                 goto err;
1830         for (i = 0; i < pip->entries; i += O_INDX) {
1831                 li = GET_BINTERNAL(dbp, h, i);
1832                 ri = (i + O_INDX < pip->entries) ?
1833                     GET_BINTERNAL(dbp, h, i + O_INDX) : r;
1834
1835                 /*
1836                  * The leftmost key is forcibly sorted less than all entries,
1837                  * so don't bother passing it.
1838                  */
1839                 if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
1840                     i == 0 ? NULL : li, ri, flags, &child_level,
1841                     &child_nrecs, NULL)) != 0) {
1842                         if (ret == DB_VERIFY_BAD)
1843                                 isbad = 1;
1844                         else
1845                                 goto done;
1846                 }
1847
1848                 if (LF_ISSET(DB_ST_RECNUM)) {
1849                         /*
1850                          * Keep a running tally on the actual record count so
1851                          * we can return it to our parent (if we have one) or
1852                          * compare it to the NRECS field if we're a root page.
1853                          */
1854                         nrecs += child_nrecs;
1855
1856                         /*
1857                          * Make sure the actual record count of the child
1858                          * is equal to the value in the BINTERNAL structure.
1859                          */
1860                         if (li->nrecs != child_nrecs) {
1861                                 isbad = 1;
1862                                 EPRINT((env, DB_STR_A("1086",
1863         "Page %lu: item %lu has incorrect record count of %lu, should be %lu",
1864                                     "%lu %lu %lu %lu"), (u_long)pgno,
1865                                     (u_long)i, (u_long)li->nrecs,
1866                                     (u_long)child_nrecs));
1867                         }
1868                 }
1869
1870                 if (level != child_level + 1) {
1871                         isbad = 1;
1872                         EPRINT((env, DB_STR_A("1087",
1873                 "Page %lu: Btree level incorrect: got %lu, expected %lu",
1874                             "%lu %lu %lu"), (u_long)li->pgno,
1875                             (u_long)child_level, (u_long)(level - 1)));
1876                 }
1877         }
1878
1879         if (0) {
1880 leaf:           level = LEAFLEVEL;
1881                 if (LF_ISSET(DB_ST_RECNUM))
1882                         nrecs = pip->rec_cnt;
1883
1884                 /* XXX
1885                  * We should verify that the record count on a leaf page
1886                  * is the sum of the number of keys and the number of
1887                  * records in its off-page dups.  This requires looking
1888                  * at the page again, however, and it may all be changing
1889                  * soon, so for now we don't bother.
1890                  */
1891
1892                 if (LF_ISSET(DB_ST_RELEN) && relenp)
1893                         *relenp = pip->re_len;
1894         }
1895 done:   if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
1896                 /*
1897                  * During the page-by-page pass, item order verification was
1898                  * not finished due to the presence of overflow items.  If
1899                  * isbad == 0, though, it's now safe to do so, as we've
1900                  * traversed any child overflow pages.  Do it.
1901                  */
1902                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1903                     vdp->thread_info, NULL, 0, &h)) != 0)
1904                         goto err;
1905                 if ((ret = __bam_vrfy_itemorder(dbp,
1906                     vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
1907                         goto err;
1908                 F_CLR(pip, VRFY_INCOMPLETE);
1909         }
1910
1911         /*
1912          * It's possible to get to this point with a page that has no
1913          * items, but without having detected any sort of failure yet.
1914          * Having zero items is legal if it's a leaf--it may be the
1915          * root page in an empty tree, or the tree may have been
1916          * modified with the DB_REVSPLITOFF flag set (there's no way
1917          * to tell from what's on disk).  For an internal page,
1918          * though, having no items is a problem (all internal pages
1919          * must have children).
1920          */
1921         if (isbad == 0 && ret == 0) {
1922                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1923                     vdp->thread_info, NULL, 0, &h)) != 0)
1924                         goto err;
1925
1926                 if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
1927                         isbad = 1;
1928                         EPRINT((env, DB_STR_A("1088",
1929                     "Page %lu: internal page is empty and should not be",
1930                             "%lu"), (u_long)pgno));
1931                         goto err;
1932                 }
1933         }
1934
1935         /*
1936          * Our parent has sent us BINTERNAL pointers to parent records
1937          * so that we can verify our place with respect to them.  If it's
1938          * appropriate--we have a default sort function--verify this.
1939          */
1940         if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
1941             pip->type != P_IRECNO && pip->type != P_LRECNO) {
1942                 if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1943                     vdp->thread_info, NULL, 0, &h)) != 0)
1944                         goto err;
1945
1946                 /*
1947                  * __bam_vrfy_treeorder needs to know what comparison function
1948                  * to use.  If DB_ST_DUPSET is set, we're in a duplicate tree
1949                  * and we use the duplicate comparison function;  otherwise,
1950                  * use the btree one.  If unset, use the default, of course.
1951                  */
1952                 func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
1953                     ((BTREE *)dbp->bt_internal)->bt_compare;
1954                 if (func == NULL)
1955                         func = __bam_defcmp;
1956
1957                 if ((ret = __bam_vrfy_treeorder(dbp,
1958                     vdp->thread_info, h, l, r, func, flags)) != 0) {
1959                         if (ret == DB_VERIFY_BAD)
1960                                 isbad = 1;
1961                         else
1962                                 goto err;
1963                 }
1964         }
1965
1966         /*
1967          * This is guaranteed to succeed for leaf pages, but no harm done.
1968          *
1969          * Internal pages below the top level do not store their own
1970          * record numbers, so we skip them.
1971          */
1972         if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
1973                 isbad = 1;
1974                 EPRINT((env, DB_STR_A("1089",
1975                     "Page %lu: bad record count: has %lu records, claims %lu",
1976                     "%lu %lu %lu"), (u_long)pgno, (u_long)nrecs,
1977                     (u_long)pip->rec_cnt));
1978         }
1979
1980         if (levelp)
1981                 *levelp = level;
1982         if (nrecsp)
1983                 *nrecsp = nrecs;
1984
1985         pgset = vdp->pgset;
1986         if ((ret = __db_vrfy_pgset_get(pgset,
1987             vdp->thread_info, vdp->txn, pgno, &p)) != 0)
1988                 goto err;
1989         if (p != 0) {
1990                 isbad = 1;
1991                 EPRINT((env, DB_STR_A("1090",
1992                     "Page %lu: linked twice", "%lu"), (u_long)pgno));
1993         } else if ((ret =
1994             __db_vrfy_pgset_inc(pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
1995                 goto err;
1996
1997         if (toplevel)
1998                 /*
1999                  * The last page's next_pgno in the leaf chain should have been
2000                  * PGNO_INVALID.
2001                  */
2002                 if (vdp->next_pgno != PGNO_INVALID) {
2003                         isbad = 1;
2004                         EPRINT((env, DB_STR_A("1091",
2005                             "Page %lu: unterminated leaf chain",
2006                             "%lu"), (u_long)vdp->prev_pgno));
2007                 }
2008
2009 err:    if (toplevel) {
2010                 /* Restore our caller's settings. */
2011                 vdp->next_pgno = next_pgno;
2012                 vdp->prev_pgno = prev_pgno;
2013                 vdp->leaf_type = leaf_type;
2014         }
2015
2016         if (h != NULL && (t_ret = __memp_fput(mpf,
2017             vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
2018                 ret = t_ret;
2019         if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
2020                 ret = t_ret;
2021         if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
2022                 ret = t_ret;
2023         return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
2024 }
2025
2026 /*
2027  * __bam_vrfy_treeorder --
2028  *      Verify that the lowest key on a page sorts greater than the
2029  *      BINTERNAL which points to it (lp), and the highest key
2030  *      sorts less than the BINTERNAL above that (rp).
2031  *
2032  *      If lp is NULL, this means that it was the leftmost key on the
2033  *      parent, which (regardless of sort function) sorts less than
2034  *      all keys.  No need to check it.
2035  *
2036  *      If rp is NULL, lp was the highest key on the parent, so there's
2037  *      no higher key we must sort less than.
2038  */
2039 static int
2040 __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
2041         DB *dbp;
2042         DB_THREAD_INFO *ip;
2043         PAGE *h;
2044         BINTERNAL *lp, *rp;
2045         int (*func) __P((DB *, const DBT *, const DBT *));
2046         u_int32_t flags;
2047 {
2048         BOVERFLOW *bo;
2049         DBC *dbc;
2050         DBT dbt;
2051         ENV *env;
2052         db_indx_t last;
2053         int ret, cmp;
2054
2055         env = dbp->env;
2056         memset(&dbt, 0, sizeof(DBT));
2057         F_SET(&dbt, DB_DBT_MALLOC);
2058         ret = 0;
2059
2060         /*
2061          * Empty pages are sorted correctly by definition.  We check
2062          * to see whether they ought to be empty elsewhere;  leaf
2063          * pages legally may be.
2064          */
2065         if (NUM_ENT(h) == 0)
2066                 return (0);
2067
2068         switch (TYPE(h)) {
2069         case P_IBTREE:
2070         case P_LDUP:
2071                 last = NUM_ENT(h) - O_INDX;
2072                 break;
2073         case P_LBTREE:
2074                 last = NUM_ENT(h) - P_INDX;
2075                 break;
2076         default:
2077                 return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
2078         }
2079
2080         /* Populate a dummy cursor. */
2081         if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2082             PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2083                 return (ret);
2084         /*
2085          * The key on page h, the child page, is more likely to be
2086          * an overflow page, so we pass its offset, rather than lp/rp's,
2087          * into __bam_cmp.  This will take advantage of __db_moff.
2088          */
2089
2090         /*
2091          * Skip first-item check if we're an internal page--the first
2092          * entry on an internal page is treated specially by __bam_cmp,
2093          * so what's on the page shouldn't matter.  (Plus, since we're passing
2094          * our page and item 0 as to __bam_cmp, we'll sort before our
2095          * parent and falsely report a failure.)
2096          */
2097         if (lp != NULL && TYPE(h) != P_IBTREE) {
2098                 if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2099                     PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2100                         return (ret);
2101                 if (lp->type == B_KEYDATA) {
2102                         dbt.data = lp->data;
2103                         dbt.size = lp->len;
2104                 } else if (lp->type == B_OVERFLOW) {
2105                         bo = (BOVERFLOW *)lp->data;
2106                         if ((ret = __db_goff(dbc, &dbt,
2107                             bo->tlen, bo->pgno, NULL, NULL)) != 0)
2108                                 return (ret);
2109                 } else
2110                         return (
2111                             __db_unknown_path(env, "__bam_vrfy_treeorder"));
2112
2113                 /* On error, fall through, free if needed, and return. */
2114                 if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
2115                         if (cmp > 0) {
2116                                 EPRINT((env, DB_STR_A("1092",
2117             "Page %lu: first item on page sorted greater than parent entry",
2118                                     "%lu"), (u_long)PGNO(h)));
2119                                 ret = DB_VERIFY_BAD;
2120                         }
2121                 } else
2122                         EPRINT((env, DB_STR_A("1093",
2123                             "Page %lu: first item on page had comparison error",
2124                             "%lu"), (u_long)PGNO(h)));
2125
2126                 if (dbt.data != lp->data)
2127                         __os_ufree(env, dbt.data);
2128                 if (ret != 0)
2129                         return (ret);
2130         }
2131
2132         if (rp != NULL) {
2133                 if (rp->type == B_KEYDATA) {
2134                         dbt.data = rp->data;
2135                         dbt.size = rp->len;
2136                 } else if (rp->type == B_OVERFLOW) {
2137                         bo = (BOVERFLOW *)rp->data;
2138                         if ((ret = __db_goff(dbc, &dbt,
2139                             bo->tlen, bo->pgno, NULL, NULL)) != 0)
2140                                 return (ret);
2141                 } else
2142                         return (
2143                             __db_unknown_path(env, "__bam_vrfy_treeorder"));
2144
2145                 /* On error, fall through, free if needed, and return. */
2146                 if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
2147                         if (cmp < 0) {
2148                                 EPRINT((env, DB_STR_A("1094",
2149             "Page %lu: last item on page sorted greater than parent entry",
2150                                     "%lu"), (u_long)PGNO(h)));
2151                                 ret = DB_VERIFY_BAD;
2152                         }
2153                 } else
2154                         EPRINT((env, DB_STR_A("1095",
2155                             "Page %lu: last item on page had comparison error",
2156                             "%lu"), (u_long)PGNO(h)));
2157
2158                 if (dbt.data != rp->data)
2159                         __os_ufree(env, dbt.data);
2160         }
2161
2162         return (ret);
2163 }
2164
2165 /*
2166  * __bam_salvage --
2167  *      Safely dump out anything that looks like a key on an alleged
2168  *      btree leaf page, also mark overflow pages as seen.  For internal btree
2169  *      pages, just mark any overflow pages as seen.
2170  *
2171  * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
2172  * PUBLIC:     db_pgno_t, u_int32_t, PAGE *, void *,
2173  * PUBLIC:     int (*)(void *, const void *), DBT *, u_int32_t));
2174  */
2175 int
2176 __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
2177         DB *dbp;
2178         VRFY_DBINFO *vdp;
2179         db_pgno_t pgno;
2180         u_int32_t pgtype;
2181         PAGE *h;
2182         void *handle;
2183         int (*callback) __P((void *, const void *));
2184         DBT *key;
2185         u_int32_t flags;
2186 {
2187         BKEYDATA *bk;
2188         BOVERFLOW *bo;
2189         DBT dbt, repldbt, unknown_key, unknown_data;
2190         ENV *env;
2191         VRFY_ITEM *pgmap;
2192         db_indx_t i, last, beg, end, *inp;
2193         db_pgno_t ovflpg;
2194         u_int32_t himark, ovfl_bufsz;
2195         void *ovflbuf;
2196         int adj, ret, t_ret, t2_ret;
2197 #ifdef HAVE_COMPRESSION
2198         DBT kcpy, *last_key;
2199         int unknown_dup_key;
2200 #endif
2201
2202         env = dbp->env;
2203         ovflbuf = pgmap = NULL;
2204         inp = P_INP(dbp, h);
2205
2206         memset(&dbt, 0, sizeof(DBT));
2207         dbt.flags = DB_DBT_REALLOC;
2208         memset(&repldbt, 0, sizeof(DBT));
2209
2210 #ifdef HAVE_COMPRESSION
2211         memset(&kcpy, 0, sizeof(DBT));
2212         unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY);
2213         last_key = unknown_dup_key ? NULL : key;
2214 #endif
2215         LF_CLR(DB_SA_UNKNOWNKEY);
2216
2217         DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
2218         DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
2219
2220         /*
2221          * Allocate a buffer for overflow items.  Start at one page;
2222          * __db_safe_goff will realloc as needed.
2223          */
2224         if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
2225                 goto err;
2226         ovfl_bufsz = dbp->pgsize;
2227
2228         if (LF_ISSET(DB_AGGRESSIVE) && (ret =
2229             __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
2230                 goto err;
2231
2232         /*
2233          * Loop through the inp array, spitting out key/data pairs.
2234          *
2235          * If we're salvaging normally, loop from 0 through NUM_ENT(h).  If
2236          * we're being aggressive, loop until we hit the end of the page --
2237          * NUM_ENT() may be bogus.
2238          */
2239         himark = dbp->pgsize;
2240         for (i = 0, last = UINT16_MAX;; i += O_INDX) {
2241                 /*
2242                  * If we're not aggressive, or if we're on an internal page,
2243                  * break when we hit NUM_ENT(h).
2244                  */
2245                 if ((!LF_ISSET(DB_AGGRESSIVE) ||
2246                     pgtype == P_IBTREE) && i >= NUM_ENT(h))
2247                         break;
2248
2249                 /* Verify the current item. */
2250                 t_ret =
2251                     __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
2252
2253                 if (t_ret != 0) {
2254                         /*
2255                          * If this is a btree leaf and we've printed out a key
2256                          * but not its associated data item, fix this imbalance
2257                          * by printing an "UNKNOWN_DATA".
2258                          */
2259                         if (pgtype == P_LBTREE && i % P_INDX == 1 &&
2260                             last == i - 1 && (t2_ret = __db_vrfy_prdbt(
2261                             &unknown_data,
2262                             0, " ", handle, callback, 0, 0, vdp)) != 0) {
2263                                 if (ret == 0)
2264                                         ret = t2_ret;
2265                                 goto err;
2266                         }
2267
2268                         /*
2269                          * Don't return DB_VERIFY_FATAL; it's private and means
2270                          * only that we can't go on with this page, not with
2271                          * the whole database.  It's not even an error if we've
2272                          * run into it after NUM_ENT(h).
2273                          */
2274                         if (t_ret == DB_VERIFY_FATAL) {
2275                                 if (i < NUM_ENT(h) && ret == 0)
2276                                         ret = DB_VERIFY_BAD;
2277                                 break;
2278                         }
2279                         continue;
2280                 }
2281
2282                 /*
2283                  * If this returned 0, it's safe to print or (carefully)
2284                  * try to fetch.
2285                  *
2286                  * We only print deleted items if DB_AGGRESSIVE is set.
2287                  */
2288                 bk = GET_BKEYDATA(dbp, h, i);
2289                 if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
2290                         continue;
2291
2292                 /*
2293                  * If this is a btree leaf and we're about to print out a data
2294                  * item for which we didn't print out a key, fix this imbalance
2295                  * by printing an "UNKNOWN_KEY".
2296                  */
2297                 if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) {
2298 #ifdef HAVE_COMPRESSION
2299                         last_key = NULL;
2300 #endif
2301                         if ((t_ret = __db_vrfy_prdbt(&unknown_key,
2302                             0, " ", handle, callback, 0, 0, vdp)) != 0) {
2303                                 if (ret == 0)
2304                                         ret = t_ret;
2305                                 goto err;
2306                         }
2307                 }
2308                 last = i;
2309
2310                 /*
2311                  * We're going to go try to print the next item.  If key is
2312                  * non-NULL, we're a dup page, so we've got to print the key
2313                  * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
2314                  * first entry.
2315                  */
2316                 if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) {
2317 #ifdef HAVE_COMPRESSION
2318                         last_key = unknown_dup_key ? NULL : key;
2319 #endif
2320                         if ((t_ret = __db_vrfy_prdbt(key,
2321                             0, " ", handle, callback, 0, 0, vdp)) != 0) {
2322                                 if (ret == 0)
2323                                         ret = t_ret;
2324                                 goto err;
2325                         }
2326                 }
2327
2328                 beg = end = inp[i];
2329                 switch (B_TYPE(bk->type)) {
2330                 case B_DUPLICATE:
2331                         if (pgtype == P_IBTREE)
2332                                 break;
2333
2334                         end = beg + BOVERFLOW_SIZE - 1;
2335                         /*
2336                          * If we're not on a normal btree leaf page, there
2337                          * shouldn't be off-page dup sets.  Something's
2338                          * confused; just drop it, and the code to pick up
2339                          * unlinked offpage dup sets will print it out
2340                          * with key "UNKNOWN" later.
2341                          */
2342                         if (pgtype != P_LBTREE)
2343                                 break;
2344
2345                         bo = (BOVERFLOW *)bk;
2346
2347                         /*
2348                          * If the page number is unreasonable, or if this is
2349                          * supposed to be a key item, output "UNKNOWN_KEY" --
2350                          * the best we can do is run into the data items in
2351                          * the unlinked offpage dup pass.
2352                          */
2353                         if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
2354                                 /* Not much to do on failure. */
2355 #ifdef HAVE_COMPRESSION
2356                                 if (key == NULL && i % P_INDX == 0)
2357                                         last_key = NULL;
2358 #endif
2359                                 if ((t_ret = __db_vrfy_prdbt(
2360                                  i % P_INDX == 0 ? &unknown_key : &unknown_data,
2361                                    0, " ", handle, callback, 0, 0,vdp)) != 0) {
2362                                         if (ret == 0)
2363                                                 ret = t_ret;
2364                                         goto err;
2365                                 }
2366                                 break;
2367                         }
2368
2369                         /* Don't stop on error. */
2370                         if ((t_ret = __db_salvage_duptree(dbp,
2371                             vdp, bo->pgno, &dbt, handle, callback,
2372                             flags | DB_SA_SKIPFIRSTKEY
2373 #ifdef HAVE_COMPRESSION
2374                             | (last_key == NULL ? DB_SA_UNKNOWNKEY : 0)
2375 #endif
2376                             )) != 0 && ret == 0)
2377                                 ret = t_ret;
2378
2379                         break;
2380                 case B_KEYDATA:
2381                         if (pgtype == P_IBTREE)
2382                                 break;
2383
2384                         end = (db_indx_t)DB_ALIGN(
2385                             beg + bk->len, sizeof(u_int32_t)) - 1;
2386
2387                         dbt.data = bk->data;
2388                         dbt.size = bk->len;
2389
2390 #ifdef HAVE_COMPRESSION
2391                         if (DB_IS_COMPRESSED(dbp) && last_key != NULL &&
2392                             (key != NULL || (i % P_INDX == 1))) {
2393                                 /* Decompress the key/data pair  - the key
2394                                    is in last_key, and the data is in dbt */
2395                                 if ((t_ret = __bam_compress_salvage(dbp, vdp,
2396                                     handle, callback, last_key, &dbt)) != 0) {
2397                                         if (t_ret == DB_VERIFY_FATAL) {
2398                                                 if (ret == 0)
2399                                                         ret = DB_VERIFY_BAD;
2400                                                 if (!LF_ISSET(DB_AGGRESSIVE))
2401                                                         goto err;
2402                                         } else if (ret == 0) {
2403                                                 ret = t_ret;
2404                                                 goto err;
2405                                         }
2406                                 }
2407                         } else {
2408                                 if (key == NULL && i % P_INDX == 0) {
2409                                         if ((ret = __os_realloc(
2410                                             env, dbt.size, &kcpy.data)) != 0)
2411                                                 goto err;
2412                                         memcpy(kcpy.data, dbt.data, dbt.size);
2413                                         kcpy.size = dbt.size;
2414                                         last_key = &kcpy;
2415                                 }
2416 #endif
2417
2418                                 if ((t_ret = __db_vrfy_prdbt(&dbt,
2419                                    0, " ", handle, callback, 0, 0, vdp)) != 0) {
2420                                         if (ret == 0)
2421                                                 ret = t_ret;
2422                                         goto err;
2423                                 }
2424 #ifdef HAVE_COMPRESSION
2425                         }
2426 #endif
2427                         break;
2428                 case B_OVERFLOW:
2429                         if (pgtype != P_IBTREE)
2430                                 end = beg + BOVERFLOW_SIZE - 1;
2431                         bo = (BOVERFLOW *)bk;
2432
2433                         /*
2434                          * Check for replicated overflow keys, so that we only
2435                          * call __db_safe_goff once per overflow page.  If we
2436                          * get the same offset as the previous key just re-use
2437                          * the previous dbt.
2438                          *
2439                          * P_IBTREE pages will never have replicated overflow
2440                          * keys.
2441                          */
2442                         adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
2443                         if (pgtype == P_IBTREE) {
2444                                 /*
2445                                  * If we're looking at a P_IBTREE, we just want
2446                                  * to mark the overflow page as seen.
2447                                  *
2448                                  * Note that this call to __db_safe_goff differs
2449                                  * from the non-P_IBTREE call.
2450                                  *
2451                                  * Only call __db_safe_goff if the overflow page
2452                                  * hasn't been seen.
2453                                  */
2454                                 ovflpg = ((BOVERFLOW *)
2455                                     ((BINTERNAL *)bk)->data)->pgno;
2456                                 if (__db_salvage_isdone(vdp, ovflpg) == 0 &&
2457                                     (t_ret =__db_safe_goff(dbp, vdp, ovflpg,
2458                                         &dbt, &ovflbuf,
2459                                         &ovfl_bufsz, flags)) != 0 && ret == 0)
2460                                         ret = t_ret;
2461                                 break;
2462                         } else if (i > adj - 1 &&
2463                             i % adj == 0 && inp[i] == inp[i - adj])
2464                                 dbt = repldbt;
2465                         else {
2466                                 /* Don't stop on error. */
2467                                 if ((t_ret = __db_safe_goff(dbp, vdp,
2468                                     bo->pgno, &dbt, &ovflbuf,
2469                                     &ovfl_bufsz, flags)) != 0 && ret == 0)
2470                                         ret = t_ret;
2471
2472                                 /*
2473                                  * If this is a key, save it in case the next
2474                                  * key is a replicated overflow, so we don't
2475                                  * call __db_safe_goff again.  Copy out dbt.data
2476                                  * in case that pointer gets realloc'd when
2477                                  * getting a data item.
2478                                  */
2479                                 if (i % P_INDX == 0) {
2480                                         if (t_ret == 0) {
2481                                                 if ((t_ret = __os_realloc(env,
2482                                                         dbt.size,
2483                                                         &repldbt.data)) != 0) {
2484                                                         if (ret == 0)
2485                                                                 ret = t_ret;
2486                                                         goto err;
2487                                                 }
2488                                                 memcpy(repldbt.data,
2489                                                     dbt.data, dbt.size);
2490                                                 repldbt.size = dbt.size;
2491                                         } else {
2492                                                 if (__os_realloc(env,
2493                                                     unknown_key.size,
2494                                                     &repldbt.data) != 0)
2495                                                         goto err;
2496                                                 memcpy(repldbt.data,
2497                                                     unknown_key.data,
2498                                                     unknown_key.size);
2499                                                 repldbt.size = unknown_key.size;
2500                                         }
2501                                 }
2502
2503                         }
2504
2505 #ifdef HAVE_COMPRESSION
2506                         if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 &&
2507                             (key != NULL || (i % P_INDX == 1))) {
2508                                 /* Decompress the key/data pair  - the key
2509                                    is in last_key, and the data is in dbt */
2510                                 if ((t_ret = __bam_compress_salvage(dbp, vdp,
2511                                     handle, callback, last_key, &dbt)) != 0) {
2512                                         if (t_ret == DB_VERIFY_FATAL) {
2513                                                 if (ret == 0)
2514                                                         ret = DB_VERIFY_BAD;
2515                                                 if (!LF_ISSET(DB_AGGRESSIVE))
2516                                                         goto err;
2517                                         } else if (ret == 0) {
2518                                                 ret = t_ret;
2519                                                 goto err;
2520                                         }
2521                                 }
2522                         } else {
2523                                 if (key == NULL && i % P_INDX == 0) {
2524                                         if (t_ret == 0) {
2525                                                 if ((ret = __os_realloc(env,
2526                                                     dbt.size, &kcpy.data)) != 0)
2527                                                         goto err;
2528                                                 memcpy(kcpy.data, dbt.data,
2529                                                         dbt.size);
2530                                                 kcpy.size = dbt.size;
2531                                                 last_key = &kcpy;
2532                                         } else
2533                                                 last_key = NULL;
2534                                 }
2535 #endif
2536
2537                                 if ((t_ret = __db_vrfy_prdbt(
2538                                            t_ret == 0 ? &dbt : &unknown_key,
2539                                            0, " ", handle, callback, 0, 0, vdp))
2540                                         != 0 && ret == 0)
2541                                         ret = t_ret;
2542 #ifdef HAVE_COMPRESSION
2543                         }
2544 #endif
2545                         break;
2546                 default:
2547                         /*
2548                          * We should never get here; __db_vrfy_inpitem should
2549                          * not be returning 0 if bk->type is unrecognizable.
2550                          */
2551                         t_ret = __db_unknown_path(env, "__bam_salvage");
2552                         if (ret == 0)
2553                                 ret = t_ret;
2554                         goto err;
2555                 }
2556
2557                 /*
2558                  * If we're being aggressive, mark the beginning and end of
2559                  * the item; we'll come back and print whatever "junk" is in
2560                  * the gaps in case we had any bogus inp elements and thereby
2561                  * missed stuff.
2562                  */
2563                 if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
2564                         pgmap[beg] = VRFY_ITEM_BEGIN;
2565                         pgmap[end] = VRFY_ITEM_END;
2566                 }
2567         }
2568
2569 err:    if (pgmap != NULL)
2570                 __os_free(env, pgmap);
2571         if (ovflbuf != NULL)
2572                 __os_free(env, ovflbuf);
2573         if (repldbt.data != NULL)
2574                 __os_free(env, repldbt.data);
2575 #ifdef HAVE_COMPRESSION
2576         if (kcpy.data != NULL)
2577                 __os_free(env, kcpy.data);
2578 #endif
2579
2580         /* Mark this page as done. */
2581         if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
2582                 ret = t_ret;
2583
2584         return (ret);
2585 }
2586
2587 /*
2588  * __bam_salvage_walkdupint --
2589  *      Walk a known-good btree or recno internal page which is part of
2590  *      a dup tree, calling __db_salvage_duptree on each child page.
2591  *
2592  * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
2593  * PUBLIC:     DBT *, void *, int (*)(void *, const void *), u_int32_t));
2594  */
2595 int
2596 __bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
2597         DB *dbp;
2598         VRFY_DBINFO *vdp;
2599         PAGE *h;
2600         DBT *key;
2601         void *handle;
2602         int (*callback) __P((void *, const void *));
2603         u_int32_t flags;
2604 {
2605         BINTERNAL *bi;
2606         ENV *env;
2607         RINTERNAL *ri;
2608         int ret, t_ret;
2609         db_indx_t i;
2610
2611         env = dbp->env;
2612         ret = 0;
2613
2614         for (i = 0; i < NUM_ENT(h); i++) {
2615                 switch (TYPE(h)) {
2616                 case P_IBTREE:
2617                         bi = GET_BINTERNAL(dbp, h, i);
2618                         if ((t_ret = __db_salvage_duptree(dbp,
2619                             vdp, bi->pgno, key, handle, callback, flags)) != 0)
2620                                 ret = t_ret;
2621                         break;
2622                 case P_IRECNO:
2623                         ri = GET_RINTERNAL(dbp, h, i);
2624                         if ((t_ret = __db_salvage_duptree(dbp,
2625                             vdp, ri->pgno, key, handle, callback, flags)) != 0)
2626                                 ret = t_ret;
2627                         break;
2628                 default:
2629                         return (__db_unknown_path(
2630                             env, "__bam_salvage_walkdupint"));
2631                 }
2632                 /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
2633                 flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
2634         }
2635
2636         return (ret);
2637 }
2638
2639 /*
2640  * __bam_meta2pgset --
2641  *      Given a known-good meta page, return in pgsetp a 0-terminated list of
2642  *      db_pgno_t's corresponding to the pages in the btree.
2643  *
2644  *      We do this by a somewhat sleazy method, to avoid having to traverse the
2645  *      btree structure neatly:  we walk down the left side to the very
2646  *      first leaf page, then we mark all the pages in the chain of
2647  *      NEXT_PGNOs (being wary of cycles and invalid ones), then we
2648  *      consolidate our scratch array into a nice list, and return.  This
2649  *      avoids the memory management hassles of recursion and the
2650  *      trouble of walking internal pages--they just don't matter, except
2651  *      for the left branch.
2652  *
2653  * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
2654  * PUBLIC:     u_int32_t, DB *));
2655  */
2656 int
2657 __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
2658         DB *dbp;
2659         VRFY_DBINFO *vdp;
2660         BTMETA *btmeta;
2661         u_int32_t flags;
2662         DB *pgset;
2663 {
2664         BINTERNAL *bi;
2665         DB_MPOOLFILE *mpf;
2666         PAGE *h;
2667         RINTERNAL *ri;
2668         db_pgno_t current, p;
2669         int err_ret, ret;
2670
2671         DB_ASSERT(dbp->env, pgset != NULL);
2672
2673         mpf = dbp->mpf;
2674         h = NULL;
2675         ret = err_ret = 0;
2676
2677         for (current = btmeta->root;;) {
2678                 if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
2679                         err_ret = DB_VERIFY_BAD;
2680                         goto err;
2681                 }
2682                 if ((ret = __memp_fget(mpf, &current,
2683                      vdp->thread_info, NULL, 0, &h)) != 0) {
2684                         err_ret = ret;
2685                         goto err;
2686                 }
2687
2688                 switch (TYPE(h)) {
2689                 case P_IBTREE:
2690                 case P_IRECNO:
2691                         if ((ret = __bam_vrfy(dbp,
2692                             vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
2693                                 err_ret = ret;
2694                                 goto err;
2695                         }
2696                         if (TYPE(h) == P_IBTREE) {
2697                                 bi = GET_BINTERNAL(dbp, h, 0);
2698                                 current = bi->pgno;
2699                         } else {        /* P_IRECNO */
2700                                 ri = GET_RINTERNAL(dbp, h, 0);
2701                                 current = ri->pgno;
2702                         }
2703                         break;
2704                 case P_LBTREE:
2705                 case P_LRECNO:
2706                         goto traverse;
2707                 default:
2708                         err_ret = DB_VERIFY_BAD;
2709                         goto err;
2710                 }
2711
2712                 if ((ret = __memp_fput(mpf,
2713                      vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2714                         err_ret = ret;
2715                 h = NULL;
2716         }
2717
2718         /*
2719          * At this point, current is the pgno of leaf page h, the 0th in the
2720          * tree we're concerned with.
2721          */
2722 traverse:
2723         while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
2724                 if (h == NULL && (ret = __memp_fget(mpf,
2725                     &current, vdp->thread_info, NULL, 0, &h)) != 0) {
2726                         err_ret = ret;
2727                         break;
2728                 }
2729
2730                 if ((ret = __db_vrfy_pgset_get(pgset,
2731                     vdp->thread_info, vdp->txn, current, (int *)&p)) != 0)
2732                         goto err;
2733
2734                 if (p != 0) {
2735                         /*
2736                          * We've found a cycle.  Return success anyway--
2737                          * our caller may as well use however much of
2738                          * the pgset we've come up with.
2739                          */
2740                         break;
2741                 }
2742                 if ((ret = __db_vrfy_pgset_inc(
2743                     pgset, vdp->thread_info, vdp->txn, current)) != 0)
2744                         goto err;
2745
2746                 current = NEXT_PGNO(h);
2747                 if ((ret = __memp_fput(mpf,
2748                      vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2749                         err_ret = ret;
2750                 h = NULL;
2751         }
2752
2753 err:    if (h != NULL)
2754                 (void)__memp_fput(mpf,
2755                     vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
2756
2757         return (ret == 0 ? err_ret : ret);
2758 }
2759
2760 /*
2761  * __bam_safe_getdata --
2762  *
2763  *      Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
2764  *      index i, page h, and sticks it in DBT dbt.  If ovflok is 1 and i's an
2765  *      overflow item, we do a safe_goff to get the item and signal that we need
2766  *      to free dbt->data;  if ovflok is 0, we leaves the DBT zeroed.
2767  */
2768 static int
2769 __bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
2770         DB *dbp;
2771         DB_THREAD_INFO *ip;
2772         PAGE *h;
2773         u_int32_t i;
2774         int ovflok;
2775         DBT *dbt;
2776         int *freedbtp;
2777 {
2778         BKEYDATA *bk;
2779         BOVERFLOW *bo;
2780         DBC *dbc;
2781         int ret;
2782
2783         memset(dbt, 0, sizeof(DBT));
2784         *freedbtp = 0;
2785
2786         bk = GET_BKEYDATA(dbp, h, i);
2787         if (B_TYPE(bk->type) == B_OVERFLOW) {
2788                 if (!ovflok)
2789                         return (0);
2790
2791                 if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
2792                     PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
2793                         return (ret);
2794                 bo = (BOVERFLOW *)bk;
2795                 F_SET(dbt, DB_DBT_MALLOC);
2796
2797                 *freedbtp = 1;
2798                 return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL));
2799         } else {
2800                 dbt->data = bk->data;
2801                 dbt->size = bk->len;
2802         }
2803
2804         return (0);
2805 }