2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996-2009 Oracle. All rights reserved.
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 * Keith Bostic. All rights reserved.
11 * Copyright (c) 1990, 1993, 1994, 1995
12 * The Regents of the University of California. All rights reserved.
14 * This code is derived from software contributed to Berkeley by
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 * may be used to endorse or promote products derived from this software
27 * without specific prior written permission.
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 #include "db_config.h"
47 #include "dbinc/crypto.h"
48 #include "dbinc/db_page.h"
49 #include "dbinc/db_swap.h"
50 #include "dbinc/btree.h"
51 #include "dbinc/lock.h"
52 #include "dbinc/log.h"
54 #include "dbinc/partition.h"
55 #include "dbinc/fop.h"
/*
 * Forward declaration of the file-local helper that fills in a Btree
 * meta-data page; its definition appears later in this file.
 */
57 static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *));
63 * PUBLIC: int __bam_open __P((DB *, DB_THREAD_INFO *,
64 * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
/*
 * __bam_open --
 *	Check the Btree configuration held in "t" and then start up the tree.
 *
 * Two configuration problems have their own diagnostic text here:
 *   - a non-default prefix routine combined with the default comparison
 *     routine, and
 *   - a bt_minkey value that is too high for the page size.
 * When neither check stops the open, the result is whatever
 * __bam_read_root() returns for base_pgno.
 *
 * NOTE(review): the statements that emit the two diagnostics, and the
 * values returned on those paths, are not visible in this listing --
 * confirm against the complete source.
 */
67 __bam_open(dbp, ip, txn, name, base_pgno, flags)
/* In the visible code, "name" is only handed to COMPQUIET. */
77 COMPQUIET(name, NULL);
81 * We don't permit the user to specify a prefix routine if they didn't
82 * also specify a comparison routine, they can't know enough about our
83 * comparison routine to get it right.
85 if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) {
87 "prefix comparison may not be specified for default comparison routine");
92 * Verify that the bt_minkey value specified won't cause the
93 * calculation of ovflsize to underflow [#2406] for this pagesize.
/*
 * The value computed from the configured bt_minkey is compared with the
 * value computed from DEFMINKEYPAGE for the same page size; a larger
 * result is the "too high" case.
 */
95 if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) >
96 B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
98 "bt_minkey value of %lu too high for page size of %lu",
99 (u_long)t->bt_minkey, (u_long)dbp->pgsize);
103 /* Start up the tree. */
104 return (__bam_read_root(dbp, ip, txn, base_pgno, flags));
110 * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *));
/*
 * __bam_metachk --
 *	Reconcile a Btree meta-data page (btm) with the DB handle (dbp).
 *
 * In the order the visible code performs them:
 *   1. Read the on-disk version from btm->dbmeta.version.  One visible
 *      path reports that a version upgrade is required and returns
 *      DB_OLD_VERSION; another reports an unsupported version.
 *   2. If DB_AM_SWAP is set on the handle, convert the page with
 *      __bam_mswap().
 *   3. Validate btm->dbmeta.flags against BTM_MASK via __db_fchk().
 *   4. For each BTM_* flag, either set the matching DB_AM_* flag (or
 *      type / dup_compare) on the handle, or use the diagnostic for a
 *      handle that asked for a feature the database does not have.
 *   5. Copy the page size and the file ID from the meta-data page into
 *      the handle.
 *
 * "name" is used only as the prefix of the diagnostics.
 *
 * NOTE(review): several return/goto statements and the diagnostic calls
 * themselves are not visible in this listing; return values other than
 * DB_OLD_VERSION should be confirmed against the complete source.
 */
113 __bam_metachk(dbp, name, btm)
125 * At this point, all we know is that the magic number is for a Btree.
126 * Check the version, the database may be out of date.
128 vers = btm->dbmeta.version;
129 if (F_ISSET(dbp, DB_AM_SWAP))
135 "%s: btree version %lu requires a version upgrade",
137 return (DB_OLD_VERSION);
143 "%s: unsupported btree version: %lu", name, (u_long)vers);
147 /* Swap the page if we need to. */
148 if (F_ISSET(dbp, DB_AM_SWAP) &&
149 (ret = __bam_mswap(env, (PAGE *)btm)) != 0)
153 * Check application info against metadata info, and set info, flags,
154 * and type based on metadata info.
157 __db_fchk(env, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0)
/*
 * BTM_RECNO selects the access method.  A handle already typed as the
 * other method is the mismatch described by the two messages at the end
 * of this function.
 */
160 if (F_ISSET(&btm->dbmeta, BTM_RECNO)) {
161 if (dbp->type == DB_BTREE)
163 dbp->type = DB_RECNO;
164 DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
166 if (dbp->type == DB_RECNO)
168 dbp->type = DB_BTREE;
169 DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
/* Duplicates: inherit the flag from the database. */
172 if (F_ISSET(&btm->dbmeta, BTM_DUP))
173 F_SET(dbp, DB_AM_DUP);
175 if (F_ISSET(dbp, DB_AM_DUP)) {
177 "%s: DB_DUP specified to open method but not set in database",
/* Record numbers: tested against DB_BTREE handles only. */
182 if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) {
183 if (dbp->type != DB_BTREE)
185 F_SET(dbp, DB_AM_RECNUM);
/* __db_fcchk is given DB_AM_DUP and DB_AM_RECNUM as the flag pair to check. */
187 if ((ret = __db_fcchk(env,
188 "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0)
191 if (F_ISSET(dbp, DB_AM_RECNUM)) {
193 "%s: DB_RECNUM specified to open method but not set in database",
/* Fixed-length records: tested against DB_RECNO handles only. */
198 if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) {
199 if (dbp->type != DB_RECNO)
201 F_SET(dbp, DB_AM_FIXEDLEN);
203 if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
205 "%s: DB_FIXEDLEN specified to open method but not set in database",
/* Renumbering: tested against DB_RECNO handles only. */
210 if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) {
211 if (dbp->type != DB_RECNO)
213 F_SET(dbp, DB_AM_RENUMBER);
215 if (F_ISSET(dbp, DB_AM_RENUMBER)) {
217 "%s: DB_RENUMBER specified to open method but not set in database",
/* Sub-databases: inherit the flag from the database. */
222 if (F_ISSET(&btm->dbmeta, BTM_SUBDB))
223 F_SET(dbp, DB_AM_SUBDB);
225 if (F_ISSET(dbp, DB_AM_SUBDB)) {
227 "%s: multiple databases specified but not supported by file",
/*
 * Sorted duplicates: when the application supplied no duplicate
 * comparison routine, fall back to __bam_defcmp.
 */
232 if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) {
233 if (dbp->dup_compare == NULL)
234 dbp->dup_compare = __bam_defcmp;
235 F_SET(dbp, DB_AM_DUPSORT);
237 if (dbp->dup_compare != NULL) {
239 "%s: duplicate sort specified but not supported in database",
244 #ifdef HAVE_COMPRESSION
/*
 * Compression: if the database is flagged as compressed but the handle
 * has Btree state and is not yet configured for compression,
 * __bam_set_bt_compress() is called with NULL callbacks.
 */
245 if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
246 F_SET(dbp, DB_AM_COMPRESS);
247 if ((BTREE *)dbp->bt_internal != NULL &&
248 !DB_IS_COMPRESSED(dbp) &&
249 (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
252 if ((BTREE *)dbp->bt_internal != NULL &&
253 DB_IS_COMPRESSED(dbp)) {
255 "%s: compression specified to open method but not set in database",
/*
 * NOTE(review): the test below appears to belong to the branch used when
 * compression support is not compiled in; the preprocessor line that
 * separates the two variants is not visible in this listing.
 */
261 if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
263 "%s: compression support has not been compiled in",
269 /* Set the page size. */
270 dbp->pgsize = btm->dbmeta.pagesize;
272 /* Copy the file's ID. */
273 memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
/* Diagnostics for an access-method mismatch between handle and database. */
278 if (dbp->type == DB_BTREE)
280 "open method type is Btree, database type is Recno");
283 "open method type is Recno, database type is Btree");
289 * Read the root page and check a tree.
291 * PUBLIC: int __bam_read_root __P((DB *,
292 * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
/*
 * __bam_read_root --
 *	Fetch the meta-data page at base_pgno and load the Btree state held
 *	in dbp->bt_internal from it.
 *
 * A cursor is opened on dbp, the meta-data page is read-locked and
 * fetched from the cache, and -- when the page carries DB_BTREEMAGIC --
 * minkey, re_pad, re_len and the root page number are copied from it.
 * bt_lpgno is reset to PGNO_INVALID.  The cleanup code at "err" puts the
 * page back, releases the lock and closes the cursor; a cleanup failure
 * is only tested together with "ret == 0".
 *
 * NOTE(review): the statements between some of the visible lines (error
 * jumps, the final return) are missing from this listing.
 */
295 __bam_read_root(dbp, ip, txn, base_pgno, flags)
312 t = dbp->bt_internal;
/* The cursor supplies the locker and transaction used for the page fetch. */
318 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
321 /* Get the metadata page. */
323 __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
325 if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
329 * If the magic number is set, the tree has been created. Correct
330 * any fields that may not be right. Note, all of the local flags
331 * were set by DB->open.
333 * Otherwise, we'd better be in recovery or abort, in which case the
334 * metadata page will be created/initialized elsewhere.
336 if (meta->dbmeta.magic == DB_BTREEMAGIC) {
337 t->bt_minkey = meta->minkey;
338 t->re_pad = (int)meta->re_pad;
339 t->re_len = meta->re_len;
341 t->bt_meta = base_pgno;
342 t->bt_root = meta->root;
343 #ifndef HAVE_FTRUNCATE
/*
 * Compiled only when HAVE_FTRUNCATE is not defined: for the base
 * meta-data page, outside recovery, the cache's last page number is set
 * from the value stored on the page.
 */
344 if (PGNO(meta) == PGNO_BASE_MD &&
345 !F_ISSET(dbp, DB_AM_RECOVER) && !IS_VERSION(dbp, meta))
346 __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno);
/*
 * NOTE(review): the next line is the tail of a statement whose beginning
 * is not visible; it tests for recovery state -- confirm what consumes it.
 */
350 IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
355 * If creating a subdatabase, we've already done an insert when
356 * we put the subdatabase's entry into the master database, so
357 * our last-page-inserted value is wrongly initialized for the
358 * master database, not the subdatabase we're creating. I'm not
359 * sure where the *right* place to clear this value is, it's not
360 * intuitively obvious that it belongs here.
362 t->bt_lpgno = PGNO_INVALID;
364 err: /* Put the metadata page back. */
365 if (meta != NULL && (t_ret = __memp_fput(mpf,
366 ip, meta, dbc->priority)) != 0 && ret == 0)
/* Release the meta-data page lock. */
368 if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
/* Close the cursor opened at the top of the function. */
371 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
379 * Initialize a btree meta-data page. The following fields may need
380 * to be updated later: last_pgno, root.
/*
 * __bam_init_meta --
 *	Fill in a Btree meta-data page from the state of the DB handle.
 *
 *	dbp	handle supplying the page size, flags, type, file ID and
 *		Btree parameters
 *	meta	page to initialise; the leading sizeof(BTMETA) bytes are
 *		zeroed first
 *	pgno	page number recorded in the page (also the initial last_pgno)
 *	lsnp	LSN copied into the page
 *
 * The root field is not assigned here, and last_pgno starts out equal to
 * pgno.
 */
383 __bam_init_meta(dbp, meta, pgno, lsnp)
390 #ifdef HAVE_PARTITION
396 t = dbp->bt_internal;
/* Start from an all-zero header so every field not set below is 0. */
398 memset(meta, 0, sizeof(BTMETA));
399 meta->dbmeta.lsn = *lsnp;
400 meta->dbmeta.pgno = pgno;
401 meta->dbmeta.magic = DB_BTREEMAGIC;
402 meta->dbmeta.version = DB_BTREEVERSION;
403 meta->dbmeta.pagesize = dbp->pgsize;
404 if (F_ISSET(dbp, DB_AM_CHKSUM))
405 FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
/*
 * Encrypted databases record the algorithm taken from the environment's
 * crypto handle, and store the magic number a second time in crypto_magic.
 */
406 if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
407 meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
408 DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
409 meta->crypto_magic = meta->dbmeta.magic;
411 meta->dbmeta.type = P_BTREEMETA;
412 meta->dbmeta.free = PGNO_INVALID;
413 meta->dbmeta.last_pgno = pgno;
/* Translate the handle's DB_AM_* flags into the on-page BTM_* flags. */
414 if (F_ISSET(dbp, DB_AM_DUP))
415 F_SET(&meta->dbmeta, BTM_DUP);
416 if (F_ISSET(dbp, DB_AM_FIXEDLEN))
417 F_SET(&meta->dbmeta, BTM_FIXEDLEN);
418 if (F_ISSET(dbp, DB_AM_RECNUM))
419 F_SET(&meta->dbmeta, BTM_RECNUM);
420 if (F_ISSET(dbp, DB_AM_RENUMBER))
421 F_SET(&meta->dbmeta, BTM_RENUMBER);
422 if (F_ISSET(dbp, DB_AM_SUBDB))
423 F_SET(&meta->dbmeta, BTM_SUBDB);
/* BTM_DUPSORT is keyed off the presence of a duplicate comparison routine. */
424 if (dbp->dup_compare != NULL)
425 F_SET(&meta->dbmeta, BTM_DUPSORT);
426 #ifdef HAVE_COMPRESSION
427 if (DB_IS_COMPRESSED(dbp))
428 F_SET(&meta->dbmeta, BTM_COMPRESS);
430 if (dbp->type == DB_RECNO)
431 F_SET(&meta->dbmeta, BTM_RECNO);
432 memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
/* Btree/Recno parameters come from the per-handle Btree state. */
434 meta->minkey = t->bt_minkey;
435 meta->re_len = t->re_len;
436 meta->re_pad = (u_int32_t)t->re_pad;
438 #ifdef HAVE_PARTITION
/* Partitioned databases also record the partition count and kind. */
439 if ((part = dbp->p_internal) != NULL) {
440 meta->dbmeta.nparts = part->nparts;
441 if (F_ISSET(part, PART_CALLBACK))
442 FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
443 if (F_ISSET(part, PART_RANGE))
444 FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
451 * Create the necessary pages to begin a new database file.
453 * This code appears more complex than it is because of the two cases (named
454 * and unnamed). The way to read the code is that for each page being created,
455 * there are three parts: 1) a "get page" chunk (which either uses malloc'd
456 * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
457 * chunk which either does a fop write or an __memp_fput.
459 * PUBLIC: int __bam_new_file __P((DB *,
460 * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
/*
 * __bam_new_file --
 *	Build the first two pages of a new Btree/Recno file: the meta-data
 *	page (PGNO_BASE_MD) and an empty leaf root page (page 1).
 *
 * Two construction paths are visible:
 *   - DB_AM_INMEM set: both pages are obtained from the cache with
 *     __memp_fget(), initialised, passed to __db_log_page() and returned
 *     with __memp_fput().
 *   - the code that follows: a page-sized buffer is allocated, each page
 *     is built in it, converted with __db_pgout() and written with
 *     __fop_write() using "name" and "fhp".
 * In both paths the meta-data page's last_pgno is set to 1, and the root
 * is a P_LRECNO page for DB_RECNO handles and a P_LBTREE page otherwise.
 *
 * NOTE(review): the error jumps, the separator between the two paths and
 * the final return are not visible in this listing.
 */
463 __bam_new_file(dbp, ip, txn, fhp, name)
487 if (F_ISSET(dbp, DB_AM_INMEM)) {
488 /* Build the meta-data page. */
490 if ((ret = __memp_fget(mpf, &pgno, ip, txn,
491 DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
494 __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
/* The file will contain the meta-data page plus the root page. */
496 meta->dbmeta.last_pgno = 1;
498 __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta)) != 0)
500 ret = __memp_fput(mpf, ip, meta, dbp->priority);
505 /* Build the root page. */
507 if ((ret = __memp_fget(mpf, &pgno,
508 ip, txn, DB_MPOOL_CREATE, &root)) != 0)
509 /* An empty leaf page numbered 1, with no previous or next page. */
510 P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
511 LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
512 LSN_NOT_LOGGED(root->lsn);
514 __db_log_page(dbp, txn, &root->lsn, pgno, root)) != 0)
516 ret = __memp_fput(mpf, ip, root, dbp->priority);
/*
 * From here the pages are built in a private buffer and written through
 * the file-operation layer.  pdbt carries the page information
 * (page size, flags, type) handed to __db_pgout().
 */
521 memset(&pdbt, 0, sizeof(pdbt));
523 /* Build the meta-data page. */
524 pginfo.db_pagesize = dbp->pgsize;
526 F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
527 pginfo.type = dbp->type;
529 pdbt.size = sizeof(pginfo);
530 if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
532 meta = (BTMETA *)buf;
534 __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
536 meta->dbmeta.last_pgno = 1;
537 if ((ret = __db_pgout(
538 dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
540 if ((ret = __fop_write(env, txn, name, dbp->dirname,
542 dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
543 dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
547 /* Build the root page. */
/* The same buffer is reused for the root page after being overwritten. */
549 memset(buf, CLEAR_BYTE, dbp->pgsize);
552 P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
553 LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
554 LSN_NOT_LOGGED(root->lsn);
556 __db_pgout(dbp->dbenv, root->pgno, root, &pdbt)) != 0)
559 __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
560 fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
561 dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
/*
 * Cleanup: the buffer and any page still held are dealt with here; a
 * cleanup failure is only tested together with "ret == 0".
 */
566 err: if (buf != NULL)
570 (t_ret = __memp_fput(mpf, ip,
571 meta, dbp->priority)) != 0 && ret == 0)
574 (t_ret = __memp_fput(mpf, ip,
575 root, dbp->priority)) != 0 && ret == 0)
583 * Create a metadata page and a root page for a new btree.
585 * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
/*
 * __bam_new_subdb --
 *	Create the meta-data page and an empty leaf root page for the
 *	sub-database "dbp" inside the master database "mdbp".
 *
 * A cursor is opened on mdbp (a write cursor when CDB locking is in use)
 * and the meta-data page dbp->meta_pgno is write-locked and fetched with
 * DB_MPOOL_CREATE.  The page is initialised with __bam_init_meta() and
 * passed to __db_log_page().  A root page is allocated with __db_new(),
 * recorded in meta->root, and passed to __db_log_page().  Both pages are
 * then returned to the cache; the cleanup code puts back any page still
 * held, releases the lock and closes the cursor.
 *
 * NOTE(review): error jumps, the cleanup label and the final return are
 * not visible in this listing.
 */
588 __bam_new_subdb(mdbp, dbp, ip, txn)
608 if ((ret = __db_cursor(mdbp, ip, txn,
609 &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
612 /* Get, and optionally create the metadata page. */
613 if ((ret = __db_lget(dbc,
614 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
616 if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
617 ip, txn, DB_MPOOL_CREATE, &meta)) != 0)
620 /* Build meta-data page. */
/*
 * __bam_init_meta() zeroes the page, so the LSN already on it is saved
 * first and handed back in as the page's LSN.
 */
621 lsn = meta->dbmeta.lsn;
622 __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
623 if ((ret = __db_log_page(mdbp,
624 txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
627 /* Create and initialize a root page. */
628 if ((ret = __db_new(dbc,
629 dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, NULL, &root)) != 0)
631 root->level = LEAFLEVEL;
/*
 * With logging enabled, __bam_root_log() is called with the meta-data
 * page number and the new root page number; the meta-data page's LSN is
 * passed both as the output LSN and as the final argument.
 */
633 if (DBENV_LOGGING(env) &&
634 #if !defined(DEBUG_WOP)
638 (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0,
639 meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
/* Point the meta-data page at the new root. */
642 meta->root = root->pgno;
644 __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0)
647 /* Release the metadata and root pages. */
648 if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
651 if ((ret = __memp_fput(mpf, ip, root, dbc->priority)) != 0)
/*
 * Cleanup: each step's failure is only tested together with "ret == 0".
 */
656 if ((t_ret = __memp_fput(mpf, ip,
657 meta, dbc->priority)) != 0 && ret == 0)
660 if ((t_ret = __memp_fput(mpf, ip,
661 root, dbc->priority)) != 0 && ret == 0)
663 if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
666 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)