2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 2001-2009 Oracle. All rights reserved.
12 #include "dbinc/db_page.h"
13 #include "dbinc/fop.h"
14 #include "dbinc/db_am.h"
16 #include "dbinc/txn.h"
/*
 * Prototypes for the two file-local helpers that implement rename
 * recovery.  The trailing int parameter is the undo flag; the public
 * wrappers in this file pass it as either 1 or 0.
 */
18 static int __fop_rename_recover_int
19 __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
20 static int __fop_rename_42_recover_int
21 __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
24 * The transactional guarantees Berkeley DB provides for file
25 * system level operations (database physical file create, delete,
26 * rename) are based on our understanding of current file system
27 * semantics; a system that does not provide these semantics and
28 * guarantees could be in danger.
30 * First, as in standard database changes, fsync and fdatasync must
31 * work: when applied to the log file, the records written into the
32 * log must be transferred to stable storage.
34 * Second, it must not be possible for the log file to be removed
35 * without previous file system level operations being flushed to
36 * stable storage. Berkeley DB applications write log records
37 * describing file system operations into the log, then perform the
38 * file system operation, then commit the enclosing transaction
39 * (which flushes the log file to stable storage). Subsequently,
40 * a database environment checkpoint may make it possible for the
41 * application to remove the log file containing the record of the
42 * file system operation. DB's transactional guarantees for file
43 * system operations require the log file removal not succeed until
44 * all previous filesystem operations have been flushed to stable
45 * storage. In other words, the flush of the log file, or the
46 * removal of the log file, must block until all previous
47 * filesystem operations have been flushed to stable storage. This
48 * semantic is not, as far as we know, required by any existing
49 * standards document, but we have never seen a filesystem where
50 * it does not apply.
54 * __fop_create_recover --
55 * Recovery function for create.
57 * PUBLIC: int __fop_create_recover
58 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations and
 * branch targets) are not visible in this extract; the comments below
 * describe only the statements that are shown.
 */
61 __fop_create_recover(env, dbtp, lsnp, op, info)
68 __fop_create_args *argp;
/* Buffer of DBMETASIZE bytes that __fop_read_meta fills in below. */
71 u_int8_t mbuf[DBMETASIZE];
76 COMPQUIET(info, NULL);
79 REC_PRINT(__fop_create_print);
80 REC_NOOP_INTRO(__fop_create_read);
/* Interpret the scratch buffer as a DBMETA header. */
81 meta = (DBMETA *)mbuf;
/*
 * Pick up the directory name carried in the log record.  What happens
 * when the logged size is zero is not visible here.
 */
83 if (argp->dirname.size == 0)
86 dirname = (const char *)argp->dirname.data;
/*
 * Ask __db_appname for real_name.  A record logged as DB_APP_DATA is
 * looked up as DB_APP_RECOVER; any other appname is passed through.
 */
88 if ((ret = __db_appname(env, (APPNAME)argp->appname == DB_APP_DATA ?
89 DB_APP_RECOVER : (APPNAME)argp->appname,
90 (const char *)argp->name.data, &dirname, &real_name)) != 0)
95 * If the file was opened in mpool, we must mark it as
96 * dead via nameop which will also unlink the file.
/*
 * Only when the file opens, its metadata reads back, and the metadata
 * passes __db_chk_meta is the uid from that page handed to
 * __memp_nameop.
 */
98 if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
99 if (__fop_read_meta(env,
100 real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
101 __db_chk_meta(env, NULL, meta, 1) == 0) {
102 if ((ret = __memp_nameop(env,
103 meta->uid, NULL, real_name, NULL, 0)) != 0)
106 (void)__os_closehandle(env, fhp);
109 (void)__os_closehandle(env, fhp);
/* Direct removal of the file; the result of the unlink is ignored. */
111 do_unlink: (void)__os_unlink(env, real_name, 0);
/*
 * Redo: open the file with DB_OSO_CREATE and the logged mode, then
 * close the handle again at once.
 */
112 } else if (DB_REDO(op)) {
113 if ((ret = __os_open(env, real_name, 0,
114 DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
115 (void)__os_closehandle(env, fhp);
/* Return the previous LSN stored in the record through lsnp. */
120 *lsnp = argp->prev_lsn;
/* Common exit: release real_name if it was allocated. */
122 out: if (real_name != NULL)
123 __os_free(env, real_name);
129 * __fop_create_42_recover --
130 * Recovery function for create.
132 * PUBLIC: int __fop_create_42_recover
133 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations and
 * branch targets) are not visible in this extract; the comments below
 * describe only the statements that are shown.  Unlike
 * __fop_create_recover, this variant passes no directory name to
 * __db_appname and does not remap the logged appname.
 */
136 __fop_create_42_recover(env, dbtp, lsnp, op, info)
143 __fop_create_args *argp;
/* Buffer of DBMETASIZE bytes that __fop_read_meta fills in below. */
146 u_int8_t mbuf[DBMETASIZE];
150 COMPQUIET(info, NULL);
153 REC_PRINT(__fop_create_print);
154 REC_NOOP_INTRO(__fop_create_read);
/* Interpret the scratch buffer as a DBMETA header. */
155 meta = (DBMETA *)mbuf;
/* Ask __db_appname for real_name using the logged appname and name. */
157 if ((ret = __db_appname(env, (APPNAME)argp->appname,
158 (const char *)argp->name.data, NULL, &real_name)) != 0)
163 * If the file was opened in mpool, we must mark it as
164 * dead via nameop which will also unlink the file.
/*
 * Only when the file opens, its metadata reads back, and the metadata
 * passes __db_chk_meta is the uid from that page handed to
 * __memp_nameop.
 */
166 if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
167 if (__fop_read_meta(env,
168 real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
169 __db_chk_meta(env, NULL, meta, 1) == 0) {
170 if ((ret = __memp_nameop(env,
171 meta->uid, NULL, real_name, NULL, 0)) != 0)
175 (void)__os_closehandle(env, fhp);
/* Direct removal of the file; the result of the unlink is ignored. */
177 do_unlink: (void)__os_unlink(env, real_name, 0);
/*
 * Redo: open the file with DB_OSO_CREATE and the logged mode, then
 * close the handle again at once.
 */
178 } else if (DB_REDO(op)) {
179 if ((ret = __os_open(env, real_name, 0,
180 DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
181 (void)__os_closehandle(env, fhp);
/* Return the previous LSN stored in the record through lsnp. */
186 *lsnp = argp->prev_lsn;
/* Common exit: release real_name if it was allocated. */
188 out: if (real_name != NULL)
189 __os_free(env, real_name);
195 * __fop_remove_recover --
196 * Recovery function for remove.
198 * PUBLIC: int __fop_remove_recover
199 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations and
 * branch targets) are not visible in this extract; the comments below
 * describe only the statements that are shown.
 */
202 __fop_remove_recover(env, dbtp, lsnp, op, info)
209 __fop_remove_args *argp;
213 COMPQUIET(info, NULL);
216 REC_PRINT(__fop_remove_print);
217 REC_NOOP_INTRO(__fop_remove_read);
/*
 * Ask __db_appname for real_name using the logged appname and name;
 * no directory name is supplied.
 */
219 if ((ret = __db_appname(env, (APPNAME)argp->appname,
220 (const char *)argp->name.data, NULL, &real_name)) != 0)
223 /* It is OK if the file is not there. */
/*
 * Hand the logged file id and real_name to __memp_nameop with no new
 * name; the result is deliberately discarded.  Any condition guarding
 * this call is not visible in this extract.
 */
225 (void)__memp_nameop(env,
226 (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);
/* Return the previous LSN stored in the record through lsnp. */
228 *lsnp = argp->prev_lsn;
/* Common exit: release real_name if it was allocated. */
229 out: if (real_name != NULL)
230 __os_free(env, real_name);
235 * __fop_write_recover --
236 * Recovery function for writechunk.
238 * PUBLIC: int __fop_write_recover
239 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations and
 * at least one branch condition) are not visible in this extract; the
 * comments below describe only the statements that are shown.
 */
242 __fop_write_recover(env, dbtp, lsnp, op, info)
249 __fop_write_args *argp;
252 COMPQUIET(info, NULL);
254 REC_PRINT(__fop_write_print);
255 REC_NOOP_INTRO(__fop_write_read);
/*
 * The condition that selects this assertion is not visible here.  The
 * assertion itself requires the logged flag to be non-zero.
 */
259 DB_ASSERT(env, argp->flag != 0);
/*
 * Redo: replay the logged chunk through __fop_write using the page
 * size, page number, offset, data and flag from the record.  An empty
 * logged dirname is passed as NULL, and a DB_APP_DATA appname is
 * replaced by DB_APP_RECOVER.
 */
260 else if (DB_REDO(op))
261 ret = __fop_write(env,
262 argp->txnp, argp->name.data,
263 argp->dirname.size == 0 ? NULL : argp->dirname.data,
264 (APPNAME)argp->appname == DB_APP_DATA ? DB_APP_RECOVER :
265 (APPNAME)argp->appname,
266 NULL, argp->pgsize, argp->pageno, argp->offset,
267 argp->page.data, argp->page.size, argp->flag, 0);
/* Return the previous LSN stored in the record through lsnp. */
270 *lsnp = argp->prev_lsn;
275 * __fop_write_42_recover --
276 * Recovery function for writechunk.
278 * PUBLIC: int __fop_write_42_recover
279 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations and
 * at least one branch condition) are not visible in this extract; the
 * comments below describe only the statements that are shown.
 */
282 __fop_write_42_recover(env, dbtp, lsnp, op, info)
289 __fop_write_args *argp;
292 COMPQUIET(info, NULL);
294 REC_PRINT(__fop_write_print);
295 REC_NOOP_INTRO(__fop_write_read);
/*
 * The condition that selects this assertion is not visible here.  The
 * assertion itself requires the logged flag to be non-zero.
 */
299 DB_ASSERT(env, argp->flag != 0);
/*
 * Redo: replay the logged chunk through __fop_write.  This variant
 * always passes NULL for the directory name and uses the logged
 * appname unchanged.
 */
300 else if (DB_REDO(op))
301 ret = __fop_write(env,
302 argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname,
303 NULL, argp->pgsize, argp->pageno, argp->offset,
304 argp->page.data, argp->page.size, argp->flag, 0);
/* Return the previous LSN stored in the record through lsnp. */
307 *lsnp = argp->prev_lsn;
312 * __fop_rename_recover --
313 * Recovery functions for rename. There are two variants that
314 * both use the same utility function. Had we known about this on day
315 * one, we would have simply added a parameter. However, since we need
316 * to retain old records for backward compatibility (online-upgrade)
317 * wrapping the two seems like the right solution.
319 * PUBLIC: int __fop_rename_recover
320 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
322 * PUBLIC: int __fop_rename_noundo_recover
323 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * Thin wrapper: forwards every argument to __fop_rename_recover_int and
 * sets the undo flag to 1.
 */
326 __fop_rename_recover(env, dbtp, lsnp, op, info)
333 return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
/*
 * Thin wrapper: forwards every argument to __fop_rename_recover_int and
 * sets the undo flag to 0, so the helper branch that tests
 * undo && DB_UNDO(op) is never taken for these records.
 */
337 __fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
344 return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
/*
 * Shared implementation behind __fop_rename_recover (undo = 1) and
 * __fop_rename_noundo_recover (undo = 0).
 *
 * NOTE(review): parts of this function (braces, some declarations,
 * branch targets and at least one branch condition) are not visible in
 * this extract; the comments below describe only the statements that
 * are shown.
 */
348 __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
356 __fop_rename_args *argp;
/* mbuf receives the metadata page read from the file being checked. */
360 u_int8_t *fileid, mbuf[DBMETASIZE];
362 char *real_new, *real_old, *src;
365 COMPQUIET(info, NULL);
/* Interpret the scratch buffer as a DBMETA header. */
368 meta = (DBMETA *)&mbuf[0];
/* Start both names as NULL so the exit path can test them before freeing. */
370 real_new = real_old = NULL;
372 REC_PRINT(__fop_rename_print);
373 REC_NOOP_INTRO(__fop_rename_read);
/* File id logged with the rename; passed to every __memp_nameop call below. */
374 fileid = argp->fileid.data;
/*
 * Pick up the directory name carried in the log record.  What happens
 * when the logged size is zero is not visible here.
 */
376 if (argp->dirname.size == 0)
379 dirname = (const char *)argp->dirname.data;
/* A record logged as DB_APP_DATA is looked up as DB_APP_RECOVER. */
381 if ((APPNAME)argp->appname == DB_APP_DATA)
382 appname = DB_APP_RECOVER;
384 appname = (APPNAME)argp->appname;
/* Ask __db_appname for the new and the old name in turn. */
386 if ((ret = __db_appname(env, appname, (const char *)argp->newname.data,
387 &dirname, &real_new)) != 0)
389 if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data,
390 &dirname, &real_old)) != 0)
394 * Verify that we are manipulating the correct file. We should always
395 * be OK on an ABORT or an APPLY, but during recovery, we have to
398 if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
/* On undo the file to inspect is the new name, otherwise the old name. */
399 src = DB_UNDO(op) ? real_new : real_old;
401 * Interpret any error as meaning that the file either doesn't
402 * exist, doesn't have a meta-data page, or is in some other
403 * way, shape or form, incorrect, so that we should not restore
/*
 * Four checks in sequence: the file opens, its metadata reads back, the
 * metadata passes __db_chk_meta, and its uid equals the logged file id.
 * The statement run when a check fails is not visible in this extract.
 */
406 if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
408 if (__fop_read_meta(env,
409 src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
411 if (__db_chk_meta(env, NULL, meta, 1) != 0)
413 if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
415 (void)__os_closehandle(env, fhp);
419 * Check to see if the target file exists. If it
420 * does and it does not have the proper id then
421 * it is a later version. We just remove the source
422 * file since the state of the world is beyond this
/*
 * NOTE(review): the handle below is opened on real_new, yet src is the
 * name passed to __fop_read_meta.  Presumably that name is only used
 * for diagnostics, in which case messages would mention the wrong
 * file; confirm against __fop_read_meta.
 */
425 if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
426 __fop_read_meta(env, src, mbuf,
427 DBMETASIZE, fhp, 1, NULL) == 0 &&
428 __db_chk_meta(env, NULL, meta, 1) == 0 &&
429 memcmp(argp->fileid.data,
430 meta->uid, DB_FILE_ID_LEN) != 0) {
431 (void)__memp_nameop(env,
432 fileid, NULL, real_old, NULL, 0);
/*
 * Undo (only when the caller asked for it): call __memp_nameop with
 * the old name, presumably moving real_new back to real_old.  The
 * result is discarded.
 */
438 if (undo && DB_UNDO(op))
439 (void)__memp_nameop(env, fileid,
440 (const char *)argp->oldname.data, real_new, real_old, 0);
/*
 * Forward direction: call __memp_nameop with the new name, presumably
 * moving real_old to real_new.  The condition guarding this call is
 * not visible in this extract.
 */
442 (void)__memp_nameop(env, fileid,
443 (const char *)argp->newname.data, real_old, real_new, 0);
/* Return the previous LSN stored in the record through lsnp. */
445 done: *lsnp = argp->prev_lsn;
/* Common exit: release whichever names were allocated. */
446 out: if (real_new != NULL)
447 __os_free(env, real_new);
448 if (real_old != NULL)
449 __os_free(env, real_old);
/* Close the file handle; any guard on this call is not visible here. */
451 (void)__os_closehandle(env, fhp);
456 * __fop_rename_42_recover --
457 * Recovery functions for rename. There are two variants that
458 * both use the same utility function. Had we known about this on day
459 * one, we would have simply added a parameter. However, since we need
460 * to retain old records for backward compatibility (online-upgrade)
461 * wrapping the two seems like the right solution.
463 * PUBLIC: int __fop_rename_42_recover
464 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
466 * PUBLIC: int __fop_rename_noundo_46_recover
467 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * Thin wrapper: forwards every argument to __fop_rename_42_recover_int
 * and sets the undo flag to 1.
 */
470 __fop_rename_42_recover(env, dbtp, lsnp, op, info)
477 return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 1));
/*
 * Thin wrapper: forwards every argument to __fop_rename_42_recover_int
 * and sets the undo flag to 0, so the helper branch that tests
 * undo && DB_UNDO(op) is never taken for these records.
 */
481 __fop_rename_noundo_46_recover(env, dbtp, lsnp, op, info)
488 return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 0));
/*
 * Shared implementation behind __fop_rename_42_recover (undo = 1) and
 * __fop_rename_noundo_46_recover (undo = 0).  Unlike
 * __fop_rename_recover_int, it passes no directory name to
 * __db_appname and does not remap the logged appname.
 *
 * NOTE(review): parts of this function (braces, some declarations,
 * branch targets and at least one branch condition) are not visible in
 * this extract; the comments below describe only the statements that
 * are shown.
 */
492 __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
500 __fop_rename_args *argp;
/* mbuf receives the metadata page read from the file being checked. */
503 u_int8_t *fileid, mbuf[DBMETASIZE];
505 char *real_new, *real_old, *src;
507 COMPQUIET(info, NULL);
/* Interpret the scratch buffer as a DBMETA header. */
510 meta = (DBMETA *)&mbuf[0];
/* Start both names as NULL so the exit path can test them before freeing. */
512 real_new = real_old = NULL;
514 REC_PRINT(__fop_rename_print);
515 REC_NOOP_INTRO(__fop_rename_read);
/* File id logged with the rename; passed to every __memp_nameop call below. */
516 fileid = argp->fileid.data;
/* Ask __db_appname for the new and the old name in turn. */
518 if ((ret = __db_appname(env, (APPNAME)argp->appname,
519 (const char *)argp->newname.data, NULL, &real_new)) != 0)
521 if ((ret = __db_appname(env, (APPNAME)argp->appname,
522 (const char *)argp->oldname.data, NULL, &real_old)) != 0)
526 * Verify that we are manipulating the correct file. We should always
527 * be OK on an ABORT or an APPLY, but during recovery, we have to
530 if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
/* On undo the file to inspect is the new name, otherwise the old name. */
531 src = DB_UNDO(op) ? real_new : real_old;
533 * Interpret any error as meaning that the file either doesn't
534 * exist, doesn't have a meta-data page, or is in some other
535 * way, shape or form, incorrect, so that we should not restore
/*
 * Four checks in sequence: the file opens, its metadata reads back, the
 * metadata passes __db_chk_meta, and its uid equals the logged file id.
 * The statement run when a check fails is not visible in this extract.
 */
538 if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
540 if (__fop_read_meta(env,
541 src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
543 if (__db_chk_meta(env, NULL, meta, 1) != 0)
545 if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
547 (void)__os_closehandle(env, fhp);
551 * Check to see if the target file exists. If it
552 * does and it does not have the proper id then
553 * it is a later version. We just remove the source
554 * file since the state of the world is beyond this
/*
 * NOTE(review): the handle below is opened on real_new, yet src is the
 * name passed to __fop_read_meta.  Presumably that name is only used
 * for diagnostics, in which case messages would mention the wrong
 * file; confirm against __fop_read_meta.
 */
557 if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
558 __fop_read_meta(env, src, mbuf,
559 DBMETASIZE, fhp, 1, NULL) == 0 &&
560 __db_chk_meta(env, NULL, meta, 1) == 0 &&
561 memcmp(argp->fileid.data,
562 meta->uid, DB_FILE_ID_LEN) != 0) {
563 (void)__memp_nameop(env,
564 fileid, NULL, real_old, NULL, 0);
/*
 * Undo (only when the caller asked for it): call __memp_nameop with
 * the old name, presumably moving real_new back to real_old.  The
 * result is discarded.
 */
570 if (undo && DB_UNDO(op))
571 (void)__memp_nameop(env, fileid,
572 (const char *)argp->oldname.data, real_new, real_old, 0);
/*
 * Forward direction: call __memp_nameop with the new name, presumably
 * moving real_old to real_new.  The condition guarding this call is
 * not visible in this extract.
 */
574 (void)__memp_nameop(env, fileid,
575 (const char *)argp->newname.data, real_old, real_new, 0);
/* Return the previous LSN stored in the record through lsnp. */
577 done: *lsnp = argp->prev_lsn;
/* Common exit: release whichever names were allocated. */
578 out: if (real_new != NULL)
579 __os_free(env, real_new);
580 if (real_old != NULL)
581 __os_free(env, real_old);
/* Close the file handle; any guard on this call is not visible here. */
583 (void)__os_closehandle(env, fhp);
589 * __fop_file_remove_recover --
590 * Recovery function for file_remove. On the REDO pass, we need to
591 * make sure no one recreated the file while we weren't looking. On an
592 * undo pass must check if the file we are interested in is the one that
593 * exists and then set the status of the child transaction depending on
596 * PUBLIC: int __fop_file_remove_recover
597 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
/*
 * NOTE(review): parts of this function (braces, some declarations,
 * assignments and branch targets) are not visible in this extract; the
 * comments below describe only the statements that are shown.
 */
600 __fop_file_remove_recover(env, dbtp, lsnp, op, info)
607 __fop_file_remove_args *argp;
/* Buffer of DBMETASIZE bytes that __fop_read_meta fills in below. */
611 u_int8_t mbuf[DBMETASIZE];
/* cstat is the status recorded for the child; ret_stat receives output from __db_txnlist_update. */
612 u_int32_t cstat, ret_stat;
613 int is_real, is_tmp, ret;
/* Interpret the scratch buffer as a DBMETA header. */
617 meta = (DBMETA *)&mbuf[0];
/* Neither file id has matched yet. */
618 is_real = is_tmp = 0;
620 REC_PRINT(__fop_file_remove_print);
621 REC_NOOP_INTRO(__fop_file_remove_read);
624 * This record is only interesting on the backward, forward, and
/* Every other operation takes the branch below; its target is not visible here. */
627 if (op != DB_TXN_BACKWARD_ROLL &&
628 op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
/* Ask __db_appname for real_name using the logged appname and name. */
631 if ((ret = __db_appname(env, (APPNAME)argp->appname,
632 argp->name.data, NULL, &real_name)) != 0)
635 /* Verify that we are manipulating the correct file. */
/*
 * Either the open fails or the metadata read fails.  The read also
 * reports a length through len, which the comment below relies on.
 */
637 if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
638 (ret = __fop_read_meta(env, real_name,
639 mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
641 * If len is non-zero, then the file exists and has something
642 * in it, but that something isn't a full meta-data page, so
643 * this is very bad. Bail out!
648 /* File does not exist. */
649 cstat = TXN_EXPECTED;
652 * We can ignore errors here since we'll simply fail the
653 * checks below and assume this is the wrong file.
655 (void)__db_chk_meta(env, NULL, meta, 1);
/*
 * Compare the uid from the metadata page with the logged real and
 * temporary file ids.  The left-hand sides of these assignments are
 * not visible here; presumably they are is_real and is_tmp.
 */
657 memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
659 memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
661 if (!is_real && !is_tmp)
662 /* File exists, but isn't what we were removing. */
665 /* File exists and is the one that we were removing. */
669 (void)__os_closehandle(env, fhp);
674 /* On the backward pass, we leave a note for the child txn. */
/* Record cstat against the child transaction id from the log record. */
675 if ((ret = __db_txnlist_update(env,
676 info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
678 } else if (DB_REDO(op)) {
680 * On the forward pass, check if someone recreated the
681 * file while we weren't looking.
/*
 * With cstat equal to TXN_COMMIT, pass whichever file id matched (real
 * or temporary) and real_name to __memp_nameop with no new name; the
 * result is discarded.
 */
683 if (cstat == TXN_COMMIT)
684 (void)__memp_nameop(env,
685 is_real ? argp->real_fid.data : argp->tmp_fid.data,
686 NULL, real_name, NULL, 0);
/* Return the previous LSN stored in the record through lsnp. */
689 done: *lsnp = argp->prev_lsn;
/* Common exit: release real_name if it was allocated. */
692 out: if (real_name != NULL)
693 __os_free(env, real_name);
/* Close the file handle; any guard on this call is not visible here. */
695 (void)__os_closehandle(env, fhp);