2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1997-2009 Oracle. All rights reserved.
12 #include "dbinc/db_page.h"
13 #include "dbinc/btree.h"
14 #include "dbinc/lock.h"
17 static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
18 static int __ram_source __P((DB *));
19 static int __ram_sread __P((DBC *, db_recno_t));
20 static int __ram_update __P((DBC *, db_recno_t, int));
23 * In recno, there are two meanings to the on-page "deleted" flag. If we're
24 * re-numbering records, it means the record was implicitly created. We skip
25 * over implicitly created records if doing a cursor "next" or "prev", and
26 * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering
27 * records, it means that the record was implicitly created, or was deleted.
28 * We skip over implicitly created or deleted records if doing a cursor "next"
29 * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
31 * If we're re-numbering records, then we have to detect in the cursor that
32 * a record was deleted, and adjust the cursor as necessary on the next get.
33 * If we're not re-numbering records, then we can detect that a record has
34 * been deleted by looking at the actual on-page record, so we completely
35 * ignore the cursor's delete flag. This is different from the B+tree code.
36 * It also maintains whether the cursor references a deleted record in the
37 * cursor, and it doesn't always check the on-page value.
39 #define CD_SET(cp) { \
40 if (F_ISSET(cp, C_RENUMBER)) \
41 F_SET(cp, C_DELETED); \
43 #define CD_CLR(cp) { \
44 if (F_ISSET(cp, C_RENUMBER)) { \
45 F_CLR(cp, C_DELETED); \
46 cp->order = INVALID_ORDER; \
49 #define CD_ISSET(cp) \
50 (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
53 * Macros for comparing the ordering of two cursors.
54 * cp1 comes before cp2 iff one of the following holds:
55 * cp1's recno is less than cp2's recno
56 * recnos are equal, both deleted, and cp1's order is less than cp2's
57 * recnos are equal, cp1 deleted, and cp2 not deleted
59 #define C_LESSTHAN(cp1, cp2) \
60 (((cp1)->recno < (cp2)->recno) || \
61 (((cp1)->recno == (cp2)->recno) && \
62 ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
63 (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
66 * cp1 is equal to cp2 iff their recnos and delete flags are identical,
67 * and if the delete flag is set their orders are also identical.
69 #define C_EQUAL(cp1, cp2) \
70 ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
71 (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
74 * Do we need to log the current cursor adjustment?
76 #define CURADJ_LOG(dbc) \
77 (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
80 * After a search, copy the found page into the cursor, discarding any
81 * currently held lock.
83 #define STACK_TO_CURSOR(cp, ret) { \
85 (cp)->page = (cp)->csp->page; \
86 (cp)->pgno = (cp)->csp->page->pgno; \
87 (cp)->indx = (cp)->csp->indx; \
88 if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
90 (cp)->lock = (cp)->csp->lock; \
91 (cp)->lock_mode = (cp)->csp->lock_mode; \
96 * Recno open function.
98 * PUBLIC: int __ram_open __P((DB *, DB_THREAD_INFO *,
99 * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
102 __ram_open(dbp, ip, txn, name, base_pgno, flags)
114 COMPQUIET(name, NULL);
115 t = dbp->bt_internal;
117 /* Start up the tree. */
118 if ((ret = __bam_read_root(dbp, ip, txn, base_pgno, flags)) != 0)
122 * If the user specified a source tree, open it and map it in.
125 * We don't complain if the user specified transactions or threads.
126 * It's possible to make it work, but you'd better know what you're
129 if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
132 /* If we're snapshotting an underlying source file, do it now. */
133 if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
134 /* Allocate a cursor. */
135 if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
138 /* Do the snapshot. */
139 if ((ret = __ram_update(dbc,
140 DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
143 /* Discard the cursor. */
144 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
153 * Recno append function.
155 * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
158 __ram_append(dbc, key, data)
165 cp = (BTREE_CURSOR *)dbc->internal;
168 * Make sure we've read in all of the backing source file. If
169 * we found the record or it simply didn't exist, add the
172 ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
173 if (ret == 0 || ret == DB_NOTFOUND)
174 ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
176 /* Return the record number. */
177 if (ret == 0 && key != NULL)
178 ret = __db_retcopy(dbc->env, key, &cp->recno,
179 sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
186 * Recno DBC->del function.
188 * PUBLIC: int __ramc_del __P((DBC *, u_int32_t));
191 __ramc_del(dbc, flags)
200 DB_LOCK next_lock, prev_lock;
202 db_pgno_t npgno, ppgno, save_npgno, save_ppgno;
203 int exact, nc, ret, stack, t_ret;
206 cp = (BTREE_CURSOR *)dbc->internal;
207 t = dbp->bt_internal;
209 save_npgno = save_ppgno = PGNO_INVALID;
210 LOCK_INIT(next_lock);
211 LOCK_INIT(prev_lock);
215 * The semantics of cursors during delete are as follows: in
216 * non-renumbering recnos, records are replaced with a marker
217 * containing a delete flag. If the record referenced by this cursor
218 * has already been deleted, we will detect that as part of the delete
219 * operation, and fail.
221 * In renumbering recnos, cursors which represent deleted items
222 * are flagged with the C_DELETED flag, and it is an error to
223 * call c_del a second time without an intervening cursor motion.
226 return (DB_KEYEMPTY);
228 /* Search the tree for the key; delete only deletes exact matches. */
229 retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
237 /* Copy the page into the cursor. */
238 STACK_TO_CURSOR(cp, ret);
243 * If re-numbering records, the on-page deleted flag can only mean
244 * that this record was implicitly created. Applications aren't
245 * permitted to delete records they never created, return an error.
247 * If not re-numbering records, the on-page deleted flag means that
248 * this record was implicitly created, or, was deleted at some time.
249 * The former is an error because applications aren't permitted to
250 * delete records they never created, the latter is an error because
251 * if the record was "deleted", we could never have found it.
253 if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
258 if (F_ISSET(cp, C_RENUMBER)) {
259 /* If we are going to drop the page, lock its neighbors. */
260 if (STD_LOCKING(dbc) &&
261 NUM_ENT(cp->page) == 1 && PGNO(cp->page) != cp->root) {
262 if ((npgno = NEXT_PGNO(cp->page)) != PGNO_INVALID)
263 TRY_LOCK(dbc, npgno, save_npgno,
264 next_lock, DB_LOCK_WRITE, retry);
267 if ((ppgno = PREV_PGNO(cp->page)) != PGNO_INVALID)
268 TRY_LOCK(dbc, ppgno, save_ppgno,
269 prev_lock, DB_LOCK_WRITE, retry);
273 /* Delete the item, adjust the counts, adjust the cursors. */
274 if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
276 if ((ret = __bam_adjust(dbc, -1)) != 0)
278 if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
281 CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
282 &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
286 * If the page is empty, delete it.
288 * We never delete a root page. First, root pages of primary
289 * databases never go away, recno or otherwise. However, if
290 * it's the root page of an off-page duplicates database, then
291 * it can be deleted. We don't delete it here because we have
292 * no way of telling the primary database page holder (e.g.,
293 * the hash access method) that its page element should cleaned
294 * up because the underlying tree is gone. So, we keep the page
295 * around until the last cursor referencing the empty tree is
296 * are closed, and then clean it up.
298 if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
300 * We want to delete a single item out of the last page
301 * that we're not deleting.
303 ret = __bam_dpages(dbc, 0, BTD_RELINK);
306 * Regardless of the return from __bam_dpages, it will
307 * discard our stack and pinned page.
312 cp->lock_mode = DB_LOCK_NG;
315 /* Use a delete/put pair to replace the record with a marker. */
316 if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
319 B_TSET_DELETED(bk.type, B_KEYDATA);
321 DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data));
322 DB_INIT_DBT(data, "", 0);
323 if ((ret = __db_pitem(dbc,
324 cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
330 err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
332 if ((t_ret = __TLPUT(dbc, next_lock)) != 0 && ret == 0)
334 if ((t_ret = __TLPUT(dbc, prev_lock)) != 0 && ret == 0)
342 * Recno DBC->get function.
344 * PUBLIC: int __ramc_get
345 * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
348 __ramc_get(dbc, key, data, flags, pgnop)
358 COMPQUIET(pgnop, NULL);
361 cp = (BTREE_CURSOR *)dbc->internal;
363 LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
364 retry: switch (flags) {
367 * If we're using mutable records and the deleted flag is
368 * set, the cursor is pointing at a nonexistent record;
372 return (DB_KEYEMPTY);
376 * If we're not in an off-page dup set, we know there's no
377 * next duplicate since recnos don't have them. If we
378 * are in an off-page dup set, the next item assuredly is
379 * a dup, so we set flags to DB_NEXT and keep going.
381 if (!F_ISSET(dbc, DBC_OPD))
382 return (DB_NOTFOUND);
386 * Recno databases don't have duplicates, set flags to DB_NEXT
393 * If record numbers are mutable: if we just deleted a record,
394 * we have to avoid incrementing the record number so that we
395 * return the right record by virtue of renumbering the tree.
399 * Clear the flag, we've moved off the deleted record.
405 if (cp->recno != RECNO_OOB) {
416 * If we're not in an off-page dup set, we know there's no
417 * previous duplicate since recnos don't have them. If we
418 * are in an off-page dup set, the previous item assuredly
419 * is a dup, so we set flags to DB_PREV and keep going.
421 if (!F_ISSET(dbc, DBC_OPD))
422 return (DB_NOTFOUND);
426 * Recno databases don't have duplicates, set flags to DB_PREV
432 if (cp->recno != RECNO_OOB) {
433 if (cp->recno == 1) {
443 if (((ret = __ram_update(dbc,
444 DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
446 if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
448 if (cp->recno == 0) {
455 * If we're doing a join and these are offpage dups,
456 * we want to keep searching forward from after the
457 * current cursor position. Increment the recno by 1,
458 * then proceed as for a DB_SET.
460 * Otherwise, we know there are no additional matching
461 * data, as recnos don't have dups. return DB_NOTFOUND.
463 if (F_ISSET(dbc, DBC_OPD)) {
471 case DB_GET_BOTH_RANGE:
473 * If we're searching a set of off-page dups, we start
474 * a new linear search from the first record. Otherwise,
475 * we compare the single data item associated with the
476 * requested record for a match.
478 if (F_ISSET(dbc, DBC_OPD)) {
485 if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
489 ret = __db_unknown_flag(dbp->env, "__ramc_get", flags);
494 * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
495 * called __ram_update() to make sure sufficient records have been
496 * read from the backing source file. Do it now for DB_CURRENT (if
497 * the current record was deleted we may need more records from the
498 * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
499 * (We don't have to test for flags == DB_FIRST, because the switch
500 * statement above re-set flags to DB_NEXT in that case.)
502 if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
503 __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
506 for (;; ++cp->recno) {
507 /* Search the tree for the record. */
508 if ((ret = __bam_rsearch(dbc, &cp->recno,
509 F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
517 /* Copy the page into the cursor. */
518 STACK_TO_CURSOR(cp, ret);
523 * If re-numbering records, the on-page deleted flag means this
524 * record was implicitly created. If not re-numbering records,
525 * the on-page deleted flag means this record was implicitly
526 * created, or, it was deleted at some time. Regardless, we
527 * skip such records if doing cursor next/prev operations or
528 * walking through off-page duplicates, and fail if they were
529 * requested explicitly by the application.
531 if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
535 (void)__bam_stkrel(dbc, STK_CLRDBC);
538 case DB_GET_BOTH_RANGE:
540 * If we're an OPD tree, we don't care about
541 * matching a record number on a DB_GET_BOTH
542 * -- everything belongs to the same tree. A
543 * normal recno should give up and return
544 * DB_NOTFOUND if the matching recno is deleted.
546 if (F_ISSET(dbc, DBC_OPD)) {
547 (void)__bam_stkrel(dbc, STK_CLRDBC);
557 if (flags == DB_GET_BOTH ||
558 flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
559 if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
560 __bam_defcmp, &cmp)) != 0)
564 if (!F_ISSET(dbc, DBC_OPD)) {
568 (void)__bam_stkrel(dbc, STK_CLRDBC);
573 /* Return the key if the user didn't give us one. */
574 if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) {
575 ret = __db_retcopy(dbp->env,
576 key, &cp->recno, sizeof(cp->recno),
577 &dbc->rkey->data, &dbc->rkey->ulen);
578 F_SET(key, DB_DBT_ISSET);
581 /* The cursor was reset, no further delete adjustment is necessary. */
589 * Recno DBC->put function.
591 * PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
594 __ramc_put(dbc, key, data, flags, pgnop)
605 int exact, nc, ret, t_ret;
608 COMPQUIET(pgnop, NULL);
612 cp = (BTREE_CURSOR *)dbc->internal;
615 * DB_KEYFIRST and DB_KEYLAST mean different things if they're
616 * used in an off-page duplicate tree. If we're an off-page
617 * duplicate tree, they really mean "put at the beginning of the
618 * tree" and "put at the end of the tree" respectively, so translate
619 * them to something else.
621 if (F_ISSET(dbc, DBC_OPD))
628 if ((ret = __ram_add(dbc,
629 &cp->recno, data, DB_APPEND, 0)) != 0)
631 if (CURADJ_LOG(dbc) &&
632 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
633 CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
641 * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
642 * no duplicates, these are identical and mean "put the given
643 * datum at the given recno".
645 if (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
646 flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP) {
647 ret = __ram_getno(dbc, key, &cp->recno, 1);
648 if (ret == 0 || ret == DB_NOTFOUND)
649 ret = __ram_add(dbc, &cp->recno, data, flags, 0);
654 * If we're putting with a cursor that's marked C_DELETED, we need to
655 * take special care; the cursor doesn't "really" reference the item
656 * corresponding to its current recno, but instead is "between" that
657 * record and the current one. Translate the actual insert into
658 * DB_BEFORE, and let the __ram_ca work out the gory details of what
659 * should wind up pointing where.
666 split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0)
669 * An inexact match is okay; it just means we're one record past the
670 * end, which is reasonable if we're marked deleted.
672 DB_ASSERT(env, exact || CD_ISSET(cp));
674 /* Copy the page into the cursor. */
675 STACK_TO_CURSOR(cp, ret);
679 ret = __bam_iitem(dbc, key, data, iiflags, 0);
680 t_ret = __bam_stkrel(dbc, STK_CLRDBC);
682 if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
684 else if (ret == DB_NEEDSPLIT) {
686 if ((ret = __bam_split(dbc, arg, NULL)) != 0)
693 switch (flags) { /* Adjust the cursors. */
695 if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0)
699 * We only need to adjust this cursor forward if we truly added
700 * the item after the current recno, rather than remapping it
703 if (iiflags == DB_AFTER)
706 /* Only log if __ram_ca found any relevant cursors. */
707 if (nc > 0 && CURADJ_LOG(dbc) &&
708 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
709 cp->root, cp->recno, cp->order)) != 0)
713 if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0)
717 /* Only log if __ram_ca found any relevant cursors. */
718 if (nc > 0 && CURADJ_LOG(dbc) &&
719 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
720 cp->root, cp->recno, cp->order)) != 0)
725 * We only need to do an adjustment if we actually
726 * added an item, which we only would have done if the
727 * cursor was marked deleted.
732 /* Only log if __ram_ca found any relevant cursors. */
733 if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0)
735 if (nc > 0 && CURADJ_LOG(dbc) &&
736 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
737 CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
744 /* Return the key if we've created a new record. */
745 if (!F_ISSET(dbc, DBC_OPD) &&
746 (flags == DB_AFTER || flags == DB_BEFORE) && key != NULL)
747 ret = __db_retcopy(env, key, &cp->recno,
748 sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
750 /* The cursor was reset, no further delete adjustment is necessary. */
758 * Adjust cursors. Returns the number of relevant cursors.
760 * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *));
763 __ram_ca(dbc_arg, op, foundp)
768 BTREE_CURSOR *cp, *cp_arg;
778 cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
779 recno = cp_arg->recno;
782 * It only makes sense to adjust cursors if we're a renumbering
783 * recno; we should only be called if this is one.
785 DB_ASSERT(env, F_ISSET(cp_arg, C_RENUMBER));
787 MUTEX_LOCK(env, env->mtx_dblist);
789 * Adjust the cursors. See the comment in __bam_ca_delete().
791 * If we're doing a delete, we need to find the highest
792 * order of any cursor currently pointing at this item,
793 * so we can assign a higher order to the newly deleted
794 * cursor. Unfortunately, this requires a second pass through
797 if (op == CA_DELETE) {
798 FIND_FIRST_DB_MATCH(env, dbp, ldbp);
800 ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
801 ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
802 MUTEX_LOCK(env, dbp->mutex);
803 TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
804 cp = (BTREE_CURSOR *)dbc->internal;
805 if (cp_arg->root == cp->root &&
806 recno == cp->recno && CD_ISSET(cp) &&
807 order <= cp->order &&
808 !MVCC_SKIP_CURADJ(dbc, cp->root))
809 order = cp->order + 1;
811 MUTEX_UNLOCK(env, dbp->mutex);
814 order = INVALID_ORDER;
816 /* Now go through and do the actual adjustments. */
817 FIND_FIRST_DB_MATCH(env, dbp, ldbp);
819 ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
820 ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
821 MUTEX_LOCK(env, dbp->mutex);
822 TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
823 cp = (BTREE_CURSOR *)dbc->internal;
824 if (cp_arg->root != cp->root ||
825 MVCC_SKIP_CURADJ(dbc, cp->root))
831 if (recno < cp->recno) {
834 * If the adjustment made them equal,
835 * we have to merge the orders.
837 if (recno == cp->recno && CD_ISSET(cp))
839 } else if (recno == cp->recno &&
844 * If we're deleting the item, we can't
845 * keep a streaming offset cached.
847 cp->stream_start_pgno = PGNO_INVALID;
852 * IBEFORE is just like IAFTER, except that we
853 * adjust cursors on the current record too.
855 if (C_EQUAL(cp_arg, cp)) {
863 * If the original cursor wasn't deleted, we
864 * just did a replacement and so there's no
865 * need to adjust anything--we shouldn't have
866 * gotten this far. Otherwise, we behave
867 * much like an IAFTER, except that all
868 * cursors pointing to the current item get
869 * marked undeleted and point to the new
872 DB_ASSERT(env, CD_ISSET(cp_arg));
873 if (C_EQUAL(cp_arg, cp)) {
879 iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
883 if (recno == cp->recno && adjusted)
885 * If we've moved this cursor's recno,
886 * split its order number--i.e.,
887 * decrement it by enough so that
888 * the lowest cursor moved has order 1.
889 * cp_arg->order is the split point,
890 * so decrement by one less than that.
892 cp->order -= (cp_arg->order - 1);
896 MUTEX_UNLOCK(dbp->env, dbp->mutex);
898 MUTEX_UNLOCK(env, env->mtx_dblist);
907 * Check the user's record number, and make sure we've seen it.
909 * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
912 __ram_getno(dbc, key, rep, can_create)
923 /* If passed an empty DBT from Java, key->data may be NULL */
924 if (key->size != sizeof(db_recno_t)) {
925 __db_errx(dbp->env, "illegal record number size");
929 /* Check the user's record number. */
930 if ((recno = *(db_recno_t *)key->data) == 0) {
931 __db_errx(dbp->env, "illegal record number of 0");
938 * Btree can neither create records nor read them in. Recno can
939 * do both, see if we can find the record.
941 return (dbc->dbtype == DB_RECNO ?
942 __ram_update(dbc, recno, can_create) : 0);
947 * Ensure the tree has records up to and including the specified one.
950 __ram_update(dbc, recno, can_create)
962 t = dbp->bt_internal;
965 * If we can't create records and we've read the entire backing input
968 if (!can_create && t->re_eof)
972 * If we haven't seen this record yet, try to get it from the original
975 if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
977 if (!t->re_eof && recno > nrecs) {
978 if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
980 if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
985 * If we can create records, create empty ones up to the requested
988 if (!can_create || recno <= nrecs + 1)
991 rdata = &dbc->my_rdata;
995 while (recno > ++nrecs)
996 if ((ret = __ram_add(dbc,
997 &nrecs, rdata, 0, BI_DELETED)) != 0)
1004 * Load information about the backing file.
1016 t = dbp->bt_internal;
1018 /* Find the real name, and swap out the one we had before. */
1019 if ((ret = __db_appname(env,
1020 DB_APP_DATA, t->re_source, NULL, &source)) != 0)
1022 __os_free(env, t->re_source);
1023 t->re_source = source;
1027 * It's possible that the backing source file is read-only. We don't
1028 * much care other than we'll complain if there are any modifications
1029 * when it comes time to write the database back to the source.
1031 if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) {
1032 ret = __os_get_errno();
1033 __db_err(env, ret, "%s", t->re_source);
1042 * __ram_writeback --
1043 * Rewrite the backing file.
1045 * PUBLIC: int __ram_writeback __P((DB *));
1048 __ram_writeback(dbp)
1059 u_int8_t delim, *pad;
1061 t = dbp->bt_internal;
1066 /* If the file wasn't modified, we're done. */
1067 if (!t->re_modified)
1070 /* If there's no backing source file, we're done. */
1071 if (t->re_source == NULL) {
1077 * We step through the records, writing each one out. Use the record
1078 * number and the dbp->get() function, instead of a cursor, so we find
1079 * and write out "deleted" or non-existent records. The DB handle may
1080 * be threaded, so allocate memory as we go.
1082 memset(&key, 0, sizeof(key));
1083 key.size = sizeof(db_recno_t);
1085 memset(&data, 0, sizeof(data));
1086 F_SET(&data, DB_DBT_REALLOC);
1088 /* Allocate a cursor. */
1089 ENV_GET_THREAD_INFO(env, ip);
1090 if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
1094 * Read any remaining records into the tree.
1097 * This is why we can't support transactions when applications specify
1098 * backing (re_source) files. At this point we have to read in the
1099 * rest of the records from the file so that we can write all of the
1100 * records back out again, which could modify a page for which we'd
1101 * have to log changes and which we don't have locked. This could be
1102 * partially fixed by taking a snapshot of the entire file during the
1103 * DB->open as DB->open is transaction protected. But, if a checkpoint
1104 * occurs then, the part of the log holding the copy of the file could
1105 * be discarded, and that would make it impossible to recover in the
1106 * face of disaster. This could all probably be fixed, but it would
1107 * require transaction protecting the backing source file.
1110 * This could be made to work now that we have transactions protecting
1111 * file operations. Margo has specifically asked for the privilege of
1115 __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
1119 * Close any existing file handle and re-open the file, truncating it.
1121 if (t->re_fp != NULL) {
1122 if (fclose(t->re_fp) != 0) {
1123 ret = __os_get_errno();
1124 __db_err(env, ret, "%s", t->re_source);
1129 if ((fp = fopen(t->re_source, "wb")) == NULL) {
1130 ret = __os_get_errno();
1131 __db_err(env, ret, "%s", t->re_source);
1136 * We'll need the delimiter if we're doing variable-length records,
1137 * and the pad character if we're doing fixed-length records.
1139 delim = t->re_delim;
1140 for (keyno = 1;; ++keyno) {
1141 switch (ret = __db_get(dbp, ip, NULL, &key, &data, 0)) {
1143 if (data.size != 0 &&
1144 fwrite(data.data, 1, data.size, fp) != data.size)
1148 if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
1150 if ((ret = __os_malloc(
1151 env, t->re_len, &pad)) != 0)
1153 memset(pad, t->re_pad, t->re_len);
1155 if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
1165 if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
1166 fwrite(&delim, 1, 1, fp) != 1) {
1167 write_err: ret = __os_get_errno();
1169 "%s: write failed to backing file", t->re_source);
1175 done: /* Close the file descriptor. */
1176 if (fp != NULL && fclose(fp) != 0) {
1177 t_ret = __os_get_errno();
1178 __db_err(env, t_ret, "%s", t->re_source);
1183 /* Discard the cursor. */
1184 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
1187 /* Discard memory allocated to hold the data items. */
1188 if (data.data != NULL)
1189 __os_ufree(env, data.data);
1191 __os_free(env, pad);
1201 * Read records from a source file.
1204 __ram_sread(dbc, top)
1213 int ch, ret, was_modified;
1215 t = dbc->dbp->bt_internal;
1217 was_modified = t->re_modified;
1219 if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1223 * Use the record key return memory, it's only a short-term use.
1224 * The record data return memory is used by __bam_iitem, which
1225 * we'll indirectly call, so use the key so as not to collide.
1227 len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
1228 rdata = &dbc->my_rkey;
1229 if (rdata->ulen < len) {
1230 if ((ret = __os_realloc(
1231 dbp->env, len, &rdata->data)) != 0) {
1236 rdata->ulen = (u_int32_t)len;
1239 memset(&data, 0, sizeof(data));
1240 while (recno < top) {
1241 data.data = rdata->data;
1243 if (F_ISSET(dbp, DB_AM_FIXEDLEN))
1244 for (len = t->re_len; len > 0; --len) {
1245 if ((ch = fgetc(t->re_fp)) == EOF) {
1250 ((u_int8_t *)data.data)[data.size++] = ch;
1254 if ((ch = fgetc(t->re_fp)) == EOF) {
1259 if (ch == t->re_delim)
1262 ((u_int8_t *)data.data)[data.size++] = ch;
1263 if (data.size == rdata->ulen) {
1264 if ((ret = __os_realloc(dbp->env,
1266 &rdata->data)) != 0) {
1271 data.data = rdata->data;
1276 * Another process may have read this record from the input
1277 * file and stored it into the database already, in which
1278 * case we don't need to repeat that operation. We detect
1279 * this by checking if the last record we've read is greater
1280 * or equal to the number of records in the database.
1282 if (t->re_last >= recno) {
1284 if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1294 err: if (!was_modified)
1302 * Add records into the tree.
1305 __ram_add(dbc, recnop, data, flags, bi_flags)
1309 u_int32_t flags, bi_flags;
1312 int exact, ret, stack, t_ret;
1314 cp = (BTREE_CURSOR *)dbc->internal;
1316 retry: /* Find the slot for insertion. */
1317 if ((ret = __bam_rsearch(dbc, recnop,
1318 SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0)
1322 /* Copy the page into the cursor. */
1323 STACK_TO_CURSOR(cp, ret);
1327 if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
1328 !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
1334 * The application may modify the data based on the selected record
1337 if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
1338 (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
1342 * Select the arguments for __bam_iitem() and do the insert. If the
1343 * key is an exact match, or we're replacing the data item with a
1344 * new data item, replace the current item. If the key isn't an exact
1345 * match, we're inserting a new key/data pair, before the search
1348 switch (ret = __bam_iitem(dbc,
1349 NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
1352 * Don't adjust anything.
1354 * If we inserted a record, no cursors need adjusting because
1355 * the only new record it's possible to insert is at the very
1356 * end of the tree. The necessary adjustments to the internal
1357 * page counts were made by __bam_iitem().
1359 * If we overwrote a record, no cursors need adjusting because
1360 * future DBcursor->get calls will simply return the underlying
1361 * record (there's no adjustment made for the DB_CURRENT flag
1362 * when a cursor get operation immediately follows a cursor
1363 * delete operation, and the normal adjustment for the DB_NEXT
1364 * flag is still correct).
1368 /* Discard the stack of pages and split the page. */
1369 (void)__bam_stkrel(dbc, STK_CLRDBC);
1372 if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
1381 err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)