2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996-2009 Oracle. All rights reserved.
12 #include "dbinc/crypto.h"
13 #include "dbinc/hmac.h"
14 #include "dbinc/log.h"
15 #include "dbinc/txn.h"
17 static int __log_init __P((ENV *, DB_LOG *));
18 static int __log_recover __P((DB_LOG *));
19 static size_t __log_region_size __P((ENV *));
23 * Internal version of log_open: only called from ENV->open.
25 * PUBLIC: int __log_open __P((ENV *, int));
28 __log_open(env, create_ok)
36 int region_locked, ret;
41 /* Create/initialize the DB_LOG structure. */
42 if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
46 /* Set the default buffer size, if not otherwise configured. */
47 if (dbenv->lg_bsize == 0)
48 dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
49 LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;
51 /* Join/create the log region. */
52 dblp->reginfo.env = env;
53 dblp->reginfo.type = REGION_TYPE_LOG;
54 dblp->reginfo.id = INVALID_REGION_ID;
55 dblp->reginfo.flags = REGION_JOIN_OK;
58 F_SET(&dblp->reginfo, REGION_CREATE_OK);
59 if ((ret = __env_region_attach(
60 env, &dblp->reginfo, __log_region_size(env))) != 0)
63 /* If we created the region, initialize it. */
64 if (F_ISSET(&dblp->reginfo, REGION_CREATE))
65 if ((ret = __log_init(env, dblp)) != 0)
68 /* Set the local addresses. */
69 lp = dblp->reginfo.primary =
70 R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary);
71 dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
74 * If the region is threaded, we have to lock the DBREG list, and we
75 * need to allocate a mutex for that purpose.
77 if ((ret = __mutex_alloc(env,
78 MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
82 * Set the handle -- we may be about to run recovery, which allocates
83 * log cursors. Log cursors require logging be already configured,
84 * and the handle being set is what demonstrates that.
86 * If we created the region, run recovery. If that fails, make sure
87 * we reset the log handle before cleaning up; otherwise we will try
88 * to clean up again in the mainline ENV initialization code.
90 env->lg_handle = dblp;
92 if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
94 * We first take the log file size from the environment, if
95 * specified. If that wasn't set, default it. Regardless,
96 * recovery may set it from the persistent information in a
99 if (lp->log_size == 0)
101 FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
102 LG_MAX_INMEM : LG_MAX_DEFAULT;
104 if ((ret = __log_recover(dblp)) != 0)
108 * If the next log file size hasn't been set yet, default it
109 * to the current log file size.
111 if (lp->log_nsize == 0)
112 lp->log_nsize = lp->log_size;
115 * If we haven't written any log files, write the first one
116 * so that checkpoint gets a valid ckp_lsn value.
118 if (IS_INIT_LSN(lp->lsn) &&
119 (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
123 * Initialize replication's next-expected LSN value
124 * and replication's bulk buffer. In __env_open, we
125 * always create/open the replication region before
126 * the log region so we're assured that our rep_handle
127 * is valid at this point, if replication is being used.
129 lp->ready_lsn = lp->lsn;
130 if (IS_ENV_REPLICATED(env)) {
132 __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
134 lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
135 lp->bulk_len = MEGABYTE;
137 lp->wait_ts = env->rep_handle->request_gap;
138 __os_gettime(env, &lp->rcvd_ts, 1);
140 lp->bulk_buf = INVALID_ROFF;
144 dblp->reginfo.mtx_alloc = lp->mtx_region;
147 * A process joining the region may have reset the log file
148 * size, too. If so, it only affects the next log file we
149 * create. We need to check that the size is reasonable given
150 * the buffer size in the region.
152 LOG_SYSTEM_LOCK(env);
155 if (dbenv->lg_size != 0) {
157 __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
160 lp->log_nsize = dbenv->lg_size;
163 LOG_SYSTEM_UNLOCK(env);
169 err: if (dblp->reginfo.addr != NULL) {
171 LOG_SYSTEM_UNLOCK(env);
172 (void)__env_region_detach(env, &dblp->reginfo, 0);
174 env->lg_handle = NULL;
176 (void)__mutex_free(env, &dblp->mtx_dbreg);
177 __os_free(env, dblp);
184 * Initialize a log region in shared memory.
187 __log_init(env, dblp)
199 * This is the first point where we can validate the buffer size,
200 * because we know all three settings have been configured (file size,
201 * buffer size and the in-memory flag).
204 __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
207 if ((ret = __env_alloc(&dblp->reginfo,
208 sizeof(*lp), &dblp->reginfo.primary)) != 0)
210 dblp->reginfo.rp->primary =
211 R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
212 lp = dblp->reginfo.primary;
213 memset(lp, 0, sizeof(*lp));
216 __mutex_alloc(env, MTX_LOG_REGION, 0, &lp->mtx_region)) != 0)
220 SH_TAILQ_INIT(&lp->fq);
221 lp->free_fid_stack = INVALID_ROFF;
222 lp->free_fids = lp->free_fids_alloced = 0;
224 /* Initialize LOG LSNs. */
229 * It's possible to be waiting for an LSN of [1][0], if a replication
230 * client gets the first log record out of order. An LSN of [0][0]
231 * signifies that we're not waiting.
233 ZERO_LSN(lp->waiting_lsn);
236 * Log makes note of the fact that it ran into a checkpoint on
237 * startup if it did so, as a recovery optimization. A zero
238 * LSN signifies that it hasn't found one [yet].
240 ZERO_LSN(lp->cached_ckp_lsn);
243 __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
245 if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
248 /* Initialize the buffer. */
249 if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
250 mem_err: __db_errx( env, "unable to allocate log region memory");
253 lp->regionmax = dbenv->lg_regionmax;
254 lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
255 lp->buffer_size = dbenv->lg_bsize;
256 lp->filemode = dbenv->lg_filemode;
257 lp->log_size = lp->log_nsize = dbenv->lg_size;
259 /* Initialize the commit Queue. */
260 SH_TAILQ_INIT(&lp->free_commits);
261 SH_TAILQ_INIT(&lp->commits);
264 /* Initialize the logfiles list for in-memory logs. */
265 SH_TAILQ_INIT(&lp->logfiles);
266 SH_TAILQ_INIT(&lp->free_logfiles);
269 * Fill in the log's persistent header. Don't fill in the log file
270 * sizes, as they may change at any time and so have to be filled in
271 * as each log file is created.
273 lp->persist.magic = DB_LOGMAGIC;
275 * Don't use __log_set_version because env->lg_handle isn't set up yet.
277 lp->persist.version = DB_LOGVERSION;
278 lp->persist.notused = 0;
279 env->lg_handle = dblp;
281 /* Migrate persistent flags from the ENV into the region. */
282 if (dbenv->lg_flags != 0 &&
283 (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
286 (void)time(&lp->timestamp);
304 u_int32_t cnt, rectype;
306 logfile_validity status;
311 lp = dblp->reginfo.primary;
314 * Find a log file. If none exist, we simply return, leaving
315 * everything initialized to a new log.
317 if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
323 * If the last file is an old, unreadable version, start a new
324 * file. Don't bother finding the end of the last log file;
325 * we assume that it's valid in its entirety, since the user
326 * should have shut down cleanly or run recovery before upgrading.
328 if (status == DB_LV_OLD_UNREADABLE) {
329 lp->lsn.file = lp->s_lsn.file = cnt + 1;
330 lp->lsn.offset = lp->s_lsn.offset = 0;
334 (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
337 * We have the last useful log file and we've loaded any persistent
338 * information. Set the end point of the log past the end of the last
339 * file. Read the last file, looking for the last checkpoint and
342 lp->lsn.file = cnt + 1;
348 * Allocate a cursor and set it to the first record. This shouldn't
349 * fail, so leave error messages on.
351 if ((ret = __log_cursor(env, &logc)) != 0)
353 F_SET(logc, DB_LOG_LOCKED);
354 memset(&dbt, 0, sizeof(dbt));
355 if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
359 * Read to the end of the file. This may fail at some point, so
360 * turn off error messages.
362 F_SET(logc, DB_LOG_SILENT_ERR);
363 while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
364 if (dbt.size < sizeof(u_int32_t))
366 LOGCOPY_32(env, &rectype, dbt.data);
367 if (rectype == DB___txn_ckp)
369 * If we happen to run into a checkpoint, cache its
370 * LSN so that the transaction system doesn't have
371 * to walk this log file again looking for it.
373 lp->cached_ckp_lsn = lsn;
375 F_CLR(logc, DB_LOG_SILENT_ERR);
378 * We now know where the end of the log is. Set the first LSN that
379 * we want to return to an application and the LSN of the last known
384 lp->lsn.offset += logc->len;
385 lp->s_lsn.offset += logc->len;
387 /* Set up the current buffer information, too. */
391 lp->w_off = lp->lsn.offset;
394 if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
396 "Finding last valid log LSN: file: %lu offset %lu",
397 (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
399 err: if (logc != NULL)
400 (void)__logc_close(logc);
407 * Try to find a log file. If find_first is set, valp will contain
408 * the number of the first readable log file, else it will contain the number
409 * of the last log file (which may be too old to read).
411 * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
414 __log_find(dblp, find_first, valp, statusp)
418 logfile_validity *statusp;
422 logfile_validity logval_status, status;
423 struct __db_filestart *filestart;
424 u_int32_t clv, logval;
427 char *c, **names, *p, *q;
430 lp = dblp->reginfo.primary;
431 logval_status = status = DB_LV_NONEXISTENT;
433 /* Return a value of 0 as the log file number on failure. */
436 if (lp->db_log_inmemory) {
437 filestart = find_first ?
438 SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
439 SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
440 if (filestart != NULL) {
441 *valp = filestart->file;
442 logval_status = DB_LV_NORMAL;
444 *statusp = logval_status;
448 /* Find the directory name. */
449 if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
453 if ((q = __db_rpath(p)) == NULL)
460 /* Get the list of file names. */
461 retry: if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
462 __db_err(env, ret, "%s", dir);
467 /* Search for a valid log file name. */
468 for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
469 if (strncmp(names[cnt], LFPREFIX, sizeof(LFPREFIX) - 1) != 0)
473 * Names of the form log\.[0-9]* are reserved for DB. Other
474 * names sharing LFPREFIX, such as "log.db", are legal.
476 for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
477 if (!isdigit((int)*c))
483 * Use atol, not atoi; if an "int" is 16-bits, the largest
484 * log file name won't fit.
486 clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
489 * If searching for the first log file, we want to return the
490 * oldest log file we can read, or, if no readable log files
491 * exist, the newest log file we can't read (the crossover
492 * point between the old and new versions of the log file).
494 * If we're searching for the last log file, we want to return
495 * the newest log file, period.
497 * Readable log files should never precede unreadable log
498 * files, that would mean the admin seriously screwed up.
502 status != DB_LV_OLD_UNREADABLE && clv > logval)
505 if (logval != 0 && clv < logval)
508 if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
509 &status, NULL)) != 0) {
511 * If we have raced with removal of a log file since
512 * the call to __os_dirlist, it may no longer exist.
513 * In that case, just go on to the next one. If we're
514 * at the end of the list, all of the log files we saw
515 * initially are gone and we need to get the list again.
520 __os_dirfree(env, names, fcnt);
526 env, ret, "Invalid log file: %s", names[cnt]);
530 case DB_LV_NONEXISTENT:
531 /* __log_valid never returns DB_LV_NONEXISTENT. */
534 case DB_LV_INCOMPLETE:
536 * The last log file may not have been initialized --
537 * it's possible to create a log file but not write
538 * anything to it. If performing recovery (that is,
539 * if find_first isn't set), ignore the file, it's
540 * not interesting. If we're searching for the first
541 * log record, return the file (assuming we don't find
542 * something better), as the "real" first log record
543 * is likely to be in the log buffer, and we want to
544 * set the file LSN for our return.
549 case DB_LV_OLD_UNREADABLE:
551 * If we're searching for the first log file, then we
552 * only want this file if we don't yet have a file or
553 * already have an unreadable file and this one is
554 * newer than that one. If we're searching for the
555 * last log file, we always want this file because we
556 * wouldn't be here if it wasn't newer than our current
559 if (!find_first || logval == 0 ||
560 (status == DB_LV_OLD_UNREADABLE && clv > logval))
564 case DB_LV_OLD_READABLE:
566 logval_status = status;
573 err: __os_dirfree(env, names, fcnt);
575 *statusp = logval_status;
582 * Validate a log file. Returns an error code in the event of
583 * a fatal flaw in the specified log file; returns success with
584 * a code indicating the currentness and completeness of the specified
585 * log file if it is not unexpectedly flawed (that is, if it's perfectly
586 * normal, if it's zero-length, or if it's an old version).
588 * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
589 * PUBLIC: DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
592 __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
598 logfile_validity *statusp;
601 DB_CIPHER *db_cipher;
607 logfile_validity status;
608 size_t hdrsize, nr, recsize;
614 db_cipher = env->crypto_handle;
617 status = DB_LV_NORMAL;
620 /* Return the file handle to our caller, on request */
625 flags = DB_OSO_RDONLY | DB_OSO_SEQ;
626 /* Try to open the log file. */
627 if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
628 __os_free(env, fname);
632 hdrsize = HDR_NORMAL_SZ;
634 recsize = sizeof(LOGP);
635 if (CRYPTO_ON(env)) {
636 hdrsize = HDR_CRYPTO_SZ;
637 recsize = sizeof(LOGP);
638 recsize += db_cipher->adj_size(recsize);
641 if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
645 persist = (LOGP *)(tmp + hdrsize);
648 * Try to read the header. This can fail if the log is truncated, or
649 * if we find a preallocated log file where the header has not yet been
650 * written, so we need to check whether the header is zero-filled.
652 if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
653 nr != recsize + hdrsize ||
654 (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
656 status = DB_LV_INCOMPLETE;
659 * The error was a fatal read error, not just an
660 * incompletely initialized log file.
662 __db_err(env, ret, "ignoring log file: %s", fname);
666 if (LOG_SWAPPED(env))
667 __log_hdrswap(hdr, CRYPTO_ON(env));
670 * Now we have to validate the persistent record. We have
671 * several scenarios we have to deal with:
673 * 1. User has crypto turned on:
674 * - They're reading an old, unencrypted log file
675 * . We will fail the record size match check below.
676 * - They're reading a current, unencrypted log file
677 * . We will fail the record size match check below.
678 * - They're reading an old, encrypted log file [NOT YET]
679 * . After decryption we'll fail the version check. [NOT YET]
680 * - They're reading a current, encrypted log file
681 * . We should proceed as usual.
682 * 2. User has crypto turned off:
683 * - They're reading an old, unencrypted log file
684 * . We will fail the version check.
685 * - They're reading a current, unencrypted log file
686 * . We should proceed as usual.
687 * - They're reading an old, encrypted log file [NOT YET]
688 * . We'll fail the magic number check (it is encrypted).
689 * - They're reading a current, encrypted log file
690 * . We'll fail the magic number check (it is encrypted).
692 if (CRYPTO_ON(env)) {
694 * If we are trying to decrypt an unencrypted log
695 * we can only detect that by having an unreasonable
696 * data length for our persistent data.
698 if ((hdr->len - hdrsize) != sizeof(LOGP)) {
699 __db_errx(env, "log record size mismatch");
702 /* Check the checksum and decrypt. */
703 if ((ret = __db_check_chksum(env, hdr, db_cipher,
704 &hdr->chksum[0], (u_int8_t *)persist,
705 hdr->len - hdrsize, is_hmac)) != 0) {
706 __db_errx(env, "log record checksum mismatch");
710 if ((ret = db_cipher->decrypt(env, db_cipher->data,
711 &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
715 /* Swap the header, if necessary. */
716 if (LOG_SWAPPED(env)) {
718 * If the magic number is not byte-swapped, we're looking at an
719 * old log that we can no longer read.
721 if (persist->magic == DB_LOGMAGIC) {
723 "Ignoring log file: %s historic byte order", fname);
724 status = DB_LV_OLD_UNREADABLE;
728 __log_persistswap(persist);
731 /* Validate the header. */
732 if (persist->magic != DB_LOGMAGIC) {
734 "Ignoring log file: %s: magic number %lx, not %lx",
735 fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC);
741 * Set our status code to indicate whether the log file belongs to an
742 * unreadable or readable old version; leave it alone if and only if
743 * the log file version is the current one.
745 if (persist->version > DB_LOGVERSION) {
746 /* This is a fatal error--the log file is newer than DB. */
748 "Unacceptable log file %s: unsupported log version %lu",
749 fname, (u_long)persist->version);
752 } else if (persist->version < DB_LOGOLDVER) {
753 status = DB_LV_OLD_UNREADABLE;
754 /* This is a non-fatal error, but give some feedback. */
756 "Skipping log file %s: historic log version %lu",
757 fname, (u_long)persist->version);
759 * We don't want to set persistent info based on an unreadable
760 * region, so jump to "err".
763 } else if (persist->version < DB_LOGVERSION)
764 status = DB_LV_OLD_READABLE;
767 * Only if we have a current log do we verify the checksum. We could
768 * not check the checksum before checking the magic and version because
769 * old log headers put the length and checksum in a different location.
770 * The checksum was calculated with the swapped byte order, so we need
771 * to check it with the same bytes.
773 if (!CRYPTO_ON(env)) {
774 if (LOG_SWAPPED(env))
775 __log_persistswap(persist);
777 if ((ret = __db_check_chksum(env,
778 hdr, db_cipher, &hdr->chksum[0], (u_int8_t *)persist,
779 hdr->len - hdrsize, is_hmac)) != 0) {
780 __db_errx(env, "log record checksum mismatch");
784 if (LOG_SWAPPED(env))
785 __log_persistswap(persist);
789 * If the log is readable so far and we're doing system initialization,
790 * set the region's persistent information based on the headers.
792 * Override the current log file size.
795 lp = dblp->reginfo.primary;
796 lp->log_size = persist->log_size;
797 lp->persist.version = persist->version;
799 if (versionp != NULL)
800 *versionp = persist->version;
802 err: if (fname != NULL)
803 __os_free(env, fname);
804 if (ret == 0 && fhpp != NULL)
807 /* Must close on error or if we only used it locally. */
808 (void)__os_closehandle(env, fhp);
819 * __log_env_refresh --
820 * Clean up after the log system on a close or failed open.
822 * PUBLIC: int __log_env_refresh __P((ENV *));
825 __log_env_refresh(env)
832 struct __db_commit *commit;
833 struct __db_filestart *filestart;
836 dblp = env->lg_handle;
837 reginfo = &dblp->reginfo;
838 lp = reginfo->primary;
842 * Flush the log if it's private -- there's no Berkeley DB guarantee
843 * that this gets done, but in case the application has forgotten to
844 * flush for durability, it's the polite thing to do.
846 if (F_ISSET(env, ENV_PRIVATE) &&
847 (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
850 if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
854 * After we close the files, check for any unlogged closes left in
855 * the shared memory queue. If we find any, try to log them; otherwise
856 * return the error. We cannot say the environment was closed
859 MUTEX_LOCK(env, lp->mtx_filelist);
860 SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
861 if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
862 (t_ret = __dbreg_close_id_int(
863 env, fnp, DBREG_CLOSE, 1)) != 0)
865 MUTEX_UNLOCK(env, lp->mtx_filelist);
868 * If a private region, return the memory to the heap. Not needed for
869 * filesystem-backed or system shared memory regions, that memory isn't
870 * owned by any particular process.
872 if (F_ISSET(env, ENV_PRIVATE)) {
873 reginfo->mtx_alloc = MUTEX_INVALID;
874 /* Discard the flush mutex. */
876 __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
879 /* Discard the buffer. */
880 __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
882 /* Discard stack of free file IDs. */
883 if (lp->free_fid_stack != INVALID_ROFF)
884 __env_alloc_free(reginfo,
885 R_ADDR(reginfo, lp->free_fid_stack));
887 /* Discard the list of in-memory log file markers. */
888 while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
889 __db_filestart)) != NULL) {
890 SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
892 __env_alloc_free(reginfo, filestart);
895 while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
896 __db_filestart)) != NULL) {
897 SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
899 __env_alloc_free(reginfo, filestart);
902 /* Discard commit queue elements. */
903 while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
904 __db_commit)) != NULL) {
905 SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
907 __env_alloc_free(reginfo, commit);
910 /* Discard replication bulk buffer. */
911 if (lp->bulk_buf != INVALID_ROFF) {
912 __env_alloc_free(reginfo,
913 R_ADDR(reginfo, lp->bulk_buf));
914 lp->bulk_buf = INVALID_ROFF;
918 /* Discard the per-thread DBREG mutex. */
919 if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
922 /* Detach from the region. */
923 if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
926 /* Close open files, release allocated memory. */
927 if (dblp->lfhp != NULL) {
929 __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
933 if (dblp->dbentry != NULL)
934 __os_free(env, dblp->dbentry);
936 __os_free(env, dblp);
938 env->lg_handle = NULL;
943 * __log_get_cached_ckp_lsn --
944 * Retrieve any last checkpoint LSN that we may have found on startup.
946 * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
949 __log_get_cached_ckp_lsn(env, ckp_lsnp)
956 dblp = env->lg_handle;
957 lp = (LOG *)dblp->reginfo.primary;
959 LOG_SYSTEM_LOCK(env);
960 *ckp_lsnp = lp->cached_ckp_lsn;
961 LOG_SYSTEM_UNLOCK(env);
967 * __log_region_mutex_count --
968 * Return the number of mutexes the log region will need.
970 * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
973 __log_region_mutex_count(env)
977 * We need a few assorted mutexes, and one per transaction waiting
978 * on the group commit list. We can't know how many that will be,
979 * but it should be bounded by the maximum active transactions.
981 return (env->dbenv->tx_max + 5);
985 * __log_region_size --
986 * Return the amount of space needed for the log region.
987 * Make the region large enough to hold txn_max transaction
988 * detail structures plus some space to hold thread handles
989 * and the beginning of the alloc region and anything we
990 * need for mutex system resource recording.
993 __log_region_size(env)
1001 s = dbenv->lg_regionmax + dbenv->lg_bsize;
1004 * If running with replication, add in space for bulk buffer.
1005 * Allocate a megabyte and a little bit more space.
1007 if (IS_ENV_REPLICATED(env))
1015 * This is a virtual truncate. We set up the log indicators to
1016 * make everyone believe that the given record is the last one in the
1017 * log. Returns with the next valid LSN (i.e., the LSN of the next
1018 * record to be written). This is used in replication to discard records
1019 * in the log file that do not agree with the master.
1021 * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
1024 __log_vtruncate(env, lsn, ckplsn, trunclsn)
1026 DB_LSN *lsn, *ckplsn, *trunclsn;
1032 u_int32_t bytes, len;
1035 /* Need to find out the length of this soon-to-be-last record. */
1036 if ((ret = __log_cursor(env, &logc)) != 0)
1038 memset(&log_dbt, 0, sizeof(log_dbt));
1039 ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
1041 if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
1046 /* Now do the truncate. */
1047 dblp = env->lg_handle;
1048 lp = (LOG *)dblp->reginfo.primary;
1050 LOG_SYSTEM_LOCK(env);
1053 * Flush the log so we can simply initialize the in-memory buffer
1054 * after the truncate.
1056 if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
1061 lp->lsn.offset += lp->len;
1063 if (lp->db_log_inmemory &&
1064 (ret = __log_inmem_lsnoff(dblp, &lp->lsn, &lp->b_off)) != 0)
1068 * I am going to assume that the number of bytes written since
1069 * the last checkpoint doesn't exceed a 32-bit number.
1071 DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
1073 if (ckplsn->file != lp->lsn.file) {
1074 bytes = lp->log_size - ckplsn->offset;
1075 if (lp->lsn.file > ckplsn->file + 1)
1076 bytes += lp->log_size *
1077 ((lp->lsn.file - ckplsn->file) - 1);
1078 bytes += lp->lsn.offset;
1080 bytes = lp->lsn.offset - ckplsn->offset;
1082 lp->stat.st_wc_mbytes += bytes / MEGABYTE;
1083 lp->stat.st_wc_bytes += bytes % MEGABYTE;
1086 * If the synced lsn is greater than our new end of log, reset it
1087 * to our current end of log.
1089 MUTEX_LOCK(env, lp->mtx_flush);
1090 if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
1091 lp->s_lsn = lp->lsn;
1092 MUTEX_UNLOCK(env, lp->mtx_flush);
1094 /* Initialize the in-region buffer to a pristine state. */
1095 ZERO_LSN(lp->f_lsn);
1096 lp->w_off = lp->lsn.offset;
1098 if (trunclsn != NULL)
1099 *trunclsn = lp->lsn;
1101 /* Truncate the log to the new point. */
1102 if ((ret = __log_zero(env, &lp->lsn)) != 0)
1105 err: LOG_SYSTEM_UNLOCK(env);
1110 * __log_is_outdated --
1111 * Used by the replication system to identify if a client's logs are too
1114 * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
1117 __log_is_outdated(env, fnum, outdatedp)
1127 struct __db_filestart *filestart;
1129 dblp = env->lg_handle;
1132 * The log represented by env is compared to the file number passed
1133 * in fnum. If the log file fnum does not exist and is lower-numbered
1134 * than the current logs, return *outdatedp non-zero, else we return 0.
1136 if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
1137 LOG_SYSTEM_LOCK(env);
1138 lp = (LOG *)dblp->reginfo.primary;
1139 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1140 *outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
1141 LOG_SYSTEM_UNLOCK(env);
1146 if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
1147 __os_free(env, name);
1151 /* If the file exists, we're just fine. */
1152 if (__os_exists(env, name, NULL) == 0)
1156 * It didn't exist, decide if the file number is too big or
1157 * too little. If it's too little, then we need to indicate
1158 * that the LSN is outdated.
1160 LOG_SYSTEM_LOCK(env);
1161 lp = (LOG *)dblp->reginfo.primary;
1162 cfile = lp->lsn.file;
1163 LOG_SYSTEM_UNLOCK(env);
1167 out: __os_free(env, name);
1173 * Zero out the tail of a log after a truncate.
1175 * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
1178 __log_zero(env, from_lsn)
1185 struct __db_filestart *filestart, *nextstart;
1186 size_t nbytes, len, nw;
1187 u_int32_t fn, mbytes, bytes;
1192 dblp = env->lg_handle;
1193 lp = (LOG *)dblp->reginfo.primary;
1194 DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
1195 if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
1197 "Warning: truncating to point beyond end of log");
1201 if (lp->db_log_inmemory) {
1203 * Remove the files that are invalidated by this truncate.
1205 for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1206 filestart != NULL; filestart = nextstart) {
1207 nextstart = SH_TAILQ_NEXT(filestart,
1208 links, __db_filestart);
1209 if (filestart->file > from_lsn->file) {
1210 SH_TAILQ_REMOVE(&lp->logfiles,
1211 filestart, links, __db_filestart);
1212 SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
1213 filestart, links, __db_filestart);
1220 /* Close any open file handles so unlinks don't fail. */
1221 if (dblp->lfhp != NULL) {
1222 (void)__os_closehandle(env, dblp->lfhp);
1226 /* Throw away any extra log files that we have around. */
1227 for (fn = from_lsn->file + 1;; fn++) {
1228 if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
1229 __os_free(env, fname);
1232 (void)__os_closehandle(env, fhp);
1233 (void)time(&lp->timestamp);
1234 ret = __os_unlink(env, fname, 0);
1235 __os_free(env, fname);
1240 /* We removed some log files; have to zero to end of file. */
1242 __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
1243 __os_free(env, fname);
1246 __os_free(env, fname);
1247 if ((ret = __os_ioinfo(env,
1248 NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
1250 DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
1251 len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
1253 memset(buf, 0, sizeof(buf));
1255 /* Initialize the write position. */
1256 if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
1260 nbytes = len > sizeof(buf) ? sizeof(buf) : len;
1262 __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
1267 err: (void)__os_closehandle(env, dblp->lfhp);
1274 * __log_inmem_lsnoff --
1275 * Find the offset in the buffer of a given LSN.
1277 * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
1280 __log_inmem_lsnoff(dblp, lsnp, offsetp)
1286 struct __db_filestart *filestart;
1288 lp = (LOG *)dblp->reginfo.primary;
1290 SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
1291 if (filestart->file == lsnp->file) {
1293 (filestart->b_off + lsnp->offset) % lp->buffer_size;
1297 return (DB_NOTFOUND);
1301 * __log_inmem_newfile --
1302 * Records the offset of the beginning of a new file in the in-memory
1305 * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
1308 __log_inmem_newfile(dblp, file)
1314 struct __db_filestart *filestart;
1317 struct __db_filestart *first, *last;
1320 lp = (LOG *)dblp->reginfo.primary;
1323 * If the log buffer is empty, reuse the filestart entry.
1325 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1326 if (filestart != NULL &&
1327 RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
1328 sizeof(HDR) + sizeof(LOGP)) {
1329 filestart->file = file;
1330 filestart->b_off = lp->b_off;
1335 * We write an empty header at the end of every in-memory log file.
1336 * This is used during cursor traversal to indicate when to switch the
1337 * LSN to the next file.
1340 memset(&hdr, 0, sizeof(HDR));
1341 __log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
1342 lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
1345 filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
1346 if (filestart == NULL) {
1347 if ((ret = __env_alloc(&dblp->reginfo,
1348 sizeof(struct __db_filestart), &filestart)) != 0)
1350 memset(filestart, 0, sizeof(*filestart));
1352 SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
1353 links, __db_filestart);
1355 filestart->file = file;
1356 filestart->b_off = lp->b_off;
1359 first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1360 last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
1362 /* Check that we don't wrap. */
1363 DB_ASSERT(dblp->env, !first || first == last ||
1364 RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
1365 RINGBUF_LEN(lp, first->b_off, last->b_off) +
1366 RINGBUF_LEN(lp, last->b_off, lp->b_off));
1369 SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
1374 * __log_inmem_chkspace --
1375 * Ensure that the requested amount of space is available in the buffer,
1376 * and invalidate the region.
1377 * Note: assumes that the region lock is held on entry.
1379 * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
1382 __log_inmem_chkspace(dblp, len)
1386 DB_LSN active_lsn, old_active_lsn;
1389 struct __db_filestart *filestart;
1393 lp = dblp->reginfo.primary;
1395 DB_ASSERT(env, lp->db_log_inmemory);
1398 * Allow room for an extra header so that we don't need to check for
1399 * space when switching files.
1404 * If transactions are enabled and we're about to fill available space,
1405 * update the active LSN and recheck. If transactions aren't enabled,
1406 * don't even bother checking: in that case we can always overwrite old
1407 * log records, because we're never going to abort.
1409 while (TXN_ON(env) &&
1410 RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
1411 old_active_lsn = lp->active_lsn;
1412 active_lsn = lp->lsn;
1415 * Drop the log region lock so we don't hold it while
1416 * taking the transaction region lock.
1418 LOG_SYSTEM_UNLOCK(env);
1419 ret = __txn_getactive(env, &active_lsn);
1420 LOG_SYSTEM_LOCK(env);
1423 active_lsn.offset = 0;
1425 /* If we didn't make any progress, give up. */
1426 if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
1428 "In-memory log buffer is full (an active transaction spans the buffer)");
1429 return (DB_LOG_BUFFER_FULL);
1432 /* Make sure we're moving the region LSN forwards. */
1433 if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
1434 lp->active_lsn = active_lsn;
1435 (void)__log_inmem_lsnoff(dblp, &active_lsn,
1441 * Remove the first file if it is invalidated by this write.
1442 * Log records can't be bigger than a file, so we only need to
1443 * check the first file.
1445 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1446 if (filestart != NULL &&
1447 RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
1448 SH_TAILQ_REMOVE(&lp->logfiles, filestart,
1449 links, __db_filestart);
1450 SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
1451 links, __db_filestart);
1452 lp->f_lsn.file = filestart->file + 1;
1459 * __log_inmem_copyout --
1460 * Copies the given number of bytes from the buffer -- no checking.
1461 * Note: assumes that the region lock is held on entry.
1463 * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
1466 __log_inmem_copyout(dblp, offset, buf, size)
1475 lp = (LOG *)dblp->reginfo.primary;
1476 nbytes = (offset + size < lp->buffer_size) ?
1477 size : lp->buffer_size - offset;
1478 memcpy(buf, dblp->bufp + offset, nbytes);
1480 memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
1484 * __log_inmem_copyin --
1485 * Copies the given number of bytes into the buffer -- no checking.
1486 * Note: assumes that the region lock is held on entry.
1488 * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
1491 __log_inmem_copyin(dblp, offset, buf, size)
1500 lp = (LOG *)dblp->reginfo.primary;
1501 nbytes = (offset + size < lp->buffer_size) ?
1502 size : lp->buffer_size - offset;
1503 memcpy(dblp->bufp + offset, buf, nbytes);
1505 memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
1509 * __log_set_version --
1510 * Sets the current version of the log subsystem to the given version.
1511 * Essentially this modifies the lp->persist.version field in the
1512 * shared memory region. Called when region is initially created
1513 * and when replication is starting up or finds a new master.
1515 * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
1518 __log_set_version(env, newver)
1525 dblp = env->lg_handle;
1526 lp = (LOG *)dblp->reginfo.primary;
1528 * We should be able to update this atomically without locking.
1530 lp->persist.version = newver;
1534 * __log_get_oldversion --
1535 * Returns the last version of log that this environment was working
1536 * with. Since there could be several versions of log files, if
1537 * the user upgraded and didn't log archive, we check the version
1538 * of the first log file, compare it to the last log file. If those
1539 * are different, then there is an older log existing, and we then
1540 * walk backward in the log files looking for the version of the
1541 * most recent older log file.
1543 * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
1546 __log_get_oldversion(env, ver)
1555 u_int32_t firstfnum, fnum, lastver, oldver;
1558 dblp = env->lg_handle;
1559 lp = dblp->reginfo.primary;
1563 oldver = DB_LOGVERSION;
1565 * If we're in-memory logs we're always the current version.
1567 if (lp->db_log_inmemory) {
1571 memset(&rec, 0, sizeof(rec));
1572 if ((ret = __log_cursor(env, &logc)) != 0)
1575 * Get the version numbers of the first and last log files.
1577 if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
1579 * If there is no log file, we'll get DB_NOTFOUND.
1580 * If we get that, set the version to the current.
1582 if (ret == DB_NOTFOUND)
1586 firstfnum = lsn.file;
1587 if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
1589 if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
1590 NULL, &oldver)) != 0)
1593 * If the first and last LSN are in the same file, then we
1594 * already have the version in oldver. Return it.
1596 if (firstfnum == lsn.file)
1600 * Otherwise they're in different files and we call __log_valid
1601 * to get the version numbers in both files.
1603 if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
1604 NULL, &lastver)) != 0)
1607 * If the version numbers are different, walk backward getting
1608 * the version of each log file until we find one that is
1609 * different than the last.
1611 if (oldver != lastver) {
1612 for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
1613 if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
1614 NULL, &oldver)) != 0)
1616 if (oldver != lastver)
1620 err: if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
1622 if (ret == 0 && ver != NULL)