2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996-2002
5 * Sleepycat Software. All rights reserved.
10 static const char revid[] = "Id: log_put.c,v 11.112 2002/09/10 02:39:26 bostic Exp ";
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
16 #if TIME_WITH_SYS_TIME
33 #include "dbinc/crypto.h"
34 #include "dbinc/hmac.h"
35 #include "dbinc/log.h"
36 #include "dbinc/rep.h"
37 #include "dbinc/txn.h"
39 static int __log_encrypt_record __P((DB_ENV *, DBT *, HDR *, u_int32_t));
40 static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
41 static int __log_flush_commit __P((DB_ENV *, const DB_LSN *, u_int32_t));
42 static int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
43 static int __log_newfh __P((DB_LOG *));
44 static int __log_put_next __P((DB_ENV *,
45 DB_LSN *, const DBT *, HDR *, DB_LSN *));
46 static int __log_putr __P((DB_LOG *,
47 DB_LSN *, const DBT *, u_int32_t, HDR *));
48 static int __log_write __P((DB_LOG *, void *, u_int32_t));
52 * Write a log record. This is the public interface, DB_ENV->log_put.
54 * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
57 __log_put(dbenv, lsnp, udbt, flags)
69 u_int32_t do_flush, op, writeonly;
70 int lock_held, need_free, ret;
74 ENV_REQUIRES_CONFIG(dbenv,
75 dbenv->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
77 /* Validate arguments. */
78 op = DB_OPFLAGS_MASK & flags;
79 if (op != 0 && op != DB_COMMIT)
80 return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
82 /* Check for allowed bit-flags. */
83 if (LF_ISSET(~(DB_OPFLAGS_MASK |
84 DB_FLUSH | DB_NOCOPY | DB_PERMANENT | DB_WRNOSYNC)))
85 return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
87 /* DB_WRNOSYNC and DB_FLUSH are mutually exclusive. */
88 if (LF_ISSET(DB_WRNOSYNC) && LF_ISSET(DB_FLUSH))
89 return (__db_ferr(dbenv, "DB_ENV->log_put", 1));
91 /* Replication clients should never write log records. */
92 if (F_ISSET(dbenv, DB_ENV_REP_CLIENT) ||
93 F_ISSET(dbenv, DB_ENV_REP_LOGSONLY)) {
95 "DB_ENV->log_put is illegal on replication clients");
99 dblp = dbenv->lg_handle;
100 lp = dblp->reginfo.primary;
101 db_cipher = dbenv->crypto_handle;
104 lock_held = need_free = 0;
105 do_flush = LF_ISSET(DB_FLUSH);
106 writeonly = LF_ISSET(DB_WRNOSYNC);
109 * If we are coming from the logging code, we use an internal
110 * flag, DB_NOCOPY, because we know we can overwrite/encrypt
111 * the log record in place. Otherwise, if a user called log_put
112 * then we must copy it to new memory so that we know we can
115 * We also must copy it to new memory if we are a replication
116 * master so that we retain an unencrypted copy of the log
117 * record to send to clients.
119 if (!LF_ISSET(DB_NOCOPY) || F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
120 if (CRYPTO_ON(dbenv))
121 t.size += db_cipher->adj_size(udbt->size);
122 if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
125 memcpy(t.data, udbt->data, udbt->size);
127 if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, udbt->size)) != 0)
129 if (CRYPTO_ON(dbenv))
130 key = db_cipher->mac_key;
133 /* Otherwise, we actually have a record to put. Put it. */
135 /* Before we grab the region lock, calculate the record's checksum. */
136 __db_chksum(dbt->data, dbt->size, key, hdr.chksum);
138 R_LOCK(dbenv, &dblp->reginfo);
142 if ((ret = __log_put_next(dbenv, &lsn, dbt, &hdr, &old_lsn)) != 0)
145 if (F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
147 * Replication masters need to drop the lock to send
148 * messages, but we want to drop and reacquire it a minimal
151 R_UNLOCK(dbenv, &dblp->reginfo);
155 * If we changed files and we're in a replicated
156 * environment, we need to inform our clients now that
157 * we've dropped the region lock.
159 * Note that a failed NEWFILE send is a dropped message
160 * that our client can handle, so we can ignore it. It's
161 * possible that the record we already put is a commit, so
162 * we don't just want to return failure.
164 if (!IS_ZERO_LSN(old_lsn))
165 (void)__rep_send_message(dbenv,
166 DB_EID_BROADCAST, REP_NEWFILE, &old_lsn, NULL, 0);
169 * Then send the log record itself on to our clients.
171 * If the send fails and we're a commit or checkpoint,
172 * there's nothing we can do; the record's in the log.
173 * Flush it, even if we're running with TXN_NOSYNC, on the
174 * grounds that it should be in durable form somewhere.
178 * In the crypto case, we MUST send the udbt, not the
179 * now-encrypted dbt. Clients have no way to decrypt
180 * without the header.
182 if ((__rep_send_message(dbenv,
183 DB_EID_BROADCAST, REP_LOG, &lsn, udbt, flags) != 0) &&
184 LF_ISSET(DB_PERMANENT))
185 do_flush |= DB_FLUSH;
189 * If needed, do a flush. Note that failures at this point
190 * are only permissible if we know we haven't written a commit
191 * record; __log_flush_commit is responsible for enforcing this.
193 * If a flush is not needed, see if WRITE_NOSYNC was set and we
194 * need to write out the log buffer.
196 if (do_flush || writeonly) {
198 R_LOCK(dbenv, &dblp->reginfo);
202 ret = __log_flush_commit(dbenv, &lsn, flags);
203 else if (lp->b_off != 0)
205 * writeonly: if there's anything in the current
206 * log buffer, we need to write it out.
208 if ((ret = __log_write(dblp,
209 dblp->bufp, (u_int32_t)lp->b_off)) == 0)
214 R_UNLOCK(dbenv, &dblp->reginfo);
216 __os_free(dbenv, dbt->data);
227 * PUBLIC: void __log_txn_lsn
228 * PUBLIC: __P((DB_ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
231 __log_txn_lsn(dbenv, lsnp, mbytesp, bytesp)
234 u_int32_t *mbytesp, *bytesp;
239 dblp = dbenv->lg_handle;
240 lp = dblp->reginfo.primary;
242 R_LOCK(dbenv, &dblp->reginfo);
245 * We are trying to get the LSN of the last entry in the log. We use
246 * this in two places: 1) DB_ENV->txn_checkpoint uses it as a first
247 * value when trying to compute an LSN such that all transactions begun
248 * before it are complete. 2) DB_ENV->txn_begin uses it as the
251 * Typically, it's easy to get the last written LSN, you simply look
252 * at the current log pointer and back up the number of bytes of the
253 * last log record. However, if the last thing we did was write the
254 * log header of a new log file, then, this doesn't work, so we return
255 * the first log record that will be written in this new file.
258 if (lp->lsn.offset > lp->len)
259 lsnp->offset -= lp->len;
262 * Since we're holding the log region lock, return the bytes put into
263 * the log since the last checkpoint, transaction checkpoint needs it.
265 * We add the current buffer offset so as to count bytes that have not
266 * yet been written, but are sitting in the log buffer.
268 if (mbytesp != NULL) {
269 *mbytesp = lp->stat.st_wc_mbytes;
270 *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
272 lp->stat.st_wc_mbytes = lp->stat.st_wc_bytes = 0;
275 R_UNLOCK(dbenv, &dblp->reginfo);
280 * Put the given record as the next in the log, wherever that may
284 __log_put_next(dbenv, lsn, dbt, hdr, old_lsnp)
296 dblp = dbenv->lg_handle;
297 lp = dblp->reginfo.primary;
300 * Save a copy of lp->lsn before we might decide to switch log
301 * files and change it. If we do switch log files, and we're
302 * doing replication, we'll need to tell our clients about the
303 * switch, and they need to receive a NEWFILE message
304 * with this "would-be" LSN in order to know they're not
305 * missing any log records.
311 * If this information won't fit in the file, or if we're a
312 * replication client environment and have been told to do so,
315 if (lp->lsn.offset == 0 ||
316 lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
317 if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
319 "DB_ENV->log_put: record larger than maximum file size");
323 if ((ret = __log_newfile(dblp, NULL)) != 0)
327 * Flag that we switched files, in case we're a master
328 * and need to send this information to our clients.
329 * We postpone doing the actual send until we can
330 * safely release the log region lock and are doing so
337 * The offset into the log file at this point is the LSN where
338 * we're about to put this record, and is the LSN the caller wants.
342 /* If we switched log files, let our caller know where. */
346 /* Actually put the record. */
347 return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
351 * __log_flush_commit --
352 * Flush a record for which the DB_FLUSH flag to log_put has been set.
355 __log_flush_commit(dbenv, lsnp, flags)
366 dblp = dbenv->lg_handle;
367 lp = dblp->reginfo.primary;
369 op = DB_OPFLAGS_MASK & flags;
371 if ((ret = __log_flush_int(dblp, &flush_lsn, 1)) == 0)
375 * If a flush supporting a transaction commit fails, we must abort the
376 * transaction. (If we aren't doing a commit, return the failure; if
377 * the commit we care about made it to disk successfully, we just
378 * ignore the failure, because there's no way to undo the commit.)
383 if (flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
387 * Else, make sure that the commit record does not get out after we
388 * abort the transaction. Do this by overwriting the commit record
389 * in the buffer. (Note that other commits in this buffer will wait
390 * until a successful write happens, we do not wake them.) We
391 * point at the right part of the buffer and write an abort record
392 * over the commit. We must then try and flush the buffer again,
393 * since the interesting part of the buffer may have actually made
394 * it out to disk before there was a failure, we can't know for sure.
396 if (__txn_force_abort(dbenv,
397 dblp->bufp + flush_lsn.offset - lp->w_off) == 0)
398 (void)__log_flush_int(dblp, &flush_lsn, 0);
405 * Initialize and switch to a new log file. (Note that this is
406 * called both when no log yet exists and when we fill a log file.)
408 * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *));
411 __log_newfile(dblp, lsnp)
415 DB_CIPHER *db_cipher;
427 lp = dblp->reginfo.primary;
429 /* If we're not at the beginning of a file already, start a new one. */
430 if (lp->lsn.offset != 0) {
432 * Flush the log so this file is out and can be closed. We
433 * cannot release the region lock here because we need to
434 * protect the end of the file while we switch. In
435 * particular, a thread with a smaller record than ours
436 * could detect that there is space in the log. Even
437 * blocking that event by declaring the file full would
438 * require all threads to wait here so that the lsn.file
439 * can be moved ahead after the flush completes. This
440 * probably can be changed if we had an lsn for the
441 * previous file and one for the current, but it does not
442 * seem like this would get much more throughput, if any.
444 if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
447 DB_ASSERT(lp->b_off == 0);
449 * Save the last known offset from the previous file, we'll
450 * need it to initialize the persistent header information.
452 lastoff = lp->lsn.offset;
454 /* Point the current LSN to the new file. */
458 /* Reset the file write offset. */
464 * Insert persistent information as the first record in every file.
465 * Note that the previous length is wrong for the very first record
466 * of the log, but that's okay, we check for it during retrieval.
468 DB_ASSERT(lp->b_off == 0);
470 memset(&t, 0, sizeof(t));
471 memset(&hdr, 0, sizeof(HDR));
474 tsize = sizeof(LOGP);
475 db_cipher = dbenv->crypto_handle;
476 if (CRYPTO_ON(dbenv))
477 tsize += db_cipher->adj_size(tsize);
478 if ((ret = __os_calloc(dbenv, 1, tsize, &tmp)) != 0)
480 lp->persist.log_size = lp->log_size = lp->log_nsize;
481 memcpy(tmp, &lp->persist, sizeof(LOGP));
483 t.size = (u_int32_t)tsize;
487 __log_encrypt_record(dbenv, &t, &hdr, (u_int32_t)tsize)) != 0)
489 __db_chksum(t.data, t.size,
490 (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
492 if ((ret = __log_putr(dblp, &lsn,
493 &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
496 /* Update the LSN information returned to the caller. */
502 __os_free(dbenv, tmp);
508 * Actually put a record into the log.
511 __log_putr(dblp, lsn, dbt, prev, h)
518 DB_CIPHER *db_cipher;
528 lp = dblp->reginfo.primary;
531 * If we weren't given a header, use a local one.
533 db_cipher = dbenv->crypto_handle;
536 memset(hdr, 0, sizeof(HDR));
537 if (CRYPTO_ON(dbenv))
538 hdr->size = HDR_CRYPTO_SZ;
540 hdr->size = HDR_NORMAL_SZ;
544 /* Save our position in case we fail. */
550 * Initialize the header. If we just switched files, lsn.offset will
551 * be 0, and what we really want is the offset of the previous record
552 * in the previous file. Fortunately, prev holds the value we want.
555 hdr->len = (u_int32_t)hdr->size + dbt->size;
558 * If we were passed in a nonzero checksum, our caller calculated
559 * the checksum before acquiring the log mutex, as an optimization.
561 * If our caller calculated a real checksum of 0, we'll needlessly
562 * recalculate it. C'est la vie; there's no out-of-bounds value
565 if (hdr->chksum[0] == 0)
566 __db_chksum(dbt->data, dbt->size,
567 (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL,
570 if ((ret = __log_fill(dblp, lsn, hdr, (u_int32_t)hdr->size)) != 0)
573 if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
576 lp->len = (u_int32_t)(hdr->size + dbt->size);
577 lp->lsn.offset += (u_int32_t)(hdr->size + dbt->size);
581 * If we wrote more than one buffer before failing, get the
582 * first one back. The extra buffers will fail the checksums
585 if (w_off + lp->buffer_size < lp->w_off) {
588 &dblp->lfh, 0, 0, w_off, 0, DB_OS_SEEK_SET)) != 0 ||
589 (t_ret = __os_read(dbenv, &dblp->lfh, dblp->bufp,
591 return (__db_panic(dbenv, t_ret));
593 __db_err(dbenv, "Short read while restoring log");
594 return (__db_panic(dbenv, EIO));
598 /* Reset to where we started. */
608 * Write all records less than or equal to the specified LSN.
610 * PUBLIC: int __log_flush __P((DB_ENV *, const DB_LSN *));
613 __log_flush(dbenv, lsn)
621 ENV_REQUIRES_CONFIG(dbenv,
622 dbenv->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
624 dblp = dbenv->lg_handle;
625 R_LOCK(dbenv, &dblp->reginfo);
626 ret = __log_flush_int(dblp, lsn, 1);
627 R_UNLOCK(dbenv, &dblp->reginfo);
633 * Write all records less than or equal to the specified LSN; internal
637 __log_flush_int(dblp, lsnp, release)
643 DB_LSN flush_lsn, f_lsn;
644 DB_MUTEX *flush_mutexp;
646 int current, do_flush, first, ret;
648 struct __db_commit *commit;
649 u_int32_t ncommit, w_off;
654 lp = dblp->reginfo.primary;
655 flush_mutexp = R_ADDR(&dblp->reginfo, lp->flush_mutex_off);
658 * If no LSN specified, flush the entire log by setting the flush LSN
659 * to the last LSN written in the log. Otherwise, check that the LSN
660 * isn't a non-existent record for the log.
663 flush_lsn.file = lp->lsn.file;
664 flush_lsn.offset = lp->lsn.offset - lp->len;
665 } else if (lsnp->file > lp->lsn.file ||
666 (lsnp->file == lp->lsn.file &&
667 lsnp->offset > lp->lsn.offset - lp->len)) {
669 "DB_ENV->log_flush: LSN past current end-of-log");
673 * See if we need to wait. s_lsn is not locked so some
674 * care is needed. The sync point can only move forward.
675 * If the file we want is in the past we are done.
676 * If the file numbers are the same check the offset.
677 * If this fails check the file numbers again since the
678 * offset might have changed while we were looking.
679 * This all assumes we can read an integer in one
680 * state or the other, not in transition.
682 if (lp->s_lsn.file > lsnp->file)
685 if (lp->s_lsn.file == lsnp->file &&
686 lp->s_lsn.offset > lsnp->offset)
689 if (lp->s_lsn.file > lsnp->file)
696 * If a flush is in progress and we're allowed to do so, drop
697 * the region lock and block waiting for the next flush.
699 if (release && lp->in_flush != 0) {
700 if ((commit = SH_TAILQ_FIRST(
701 &lp->free_commits, __db_commit)) == NULL) {
703 __db_shalloc(dblp->reginfo.addr,
704 sizeof(struct __db_commit),
705 MUTEX_ALIGN, &commit)) != 0)
707 memset(commit, 0, sizeof(*commit));
708 if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo,
709 &commit->mutex, MUTEX_SELF_BLOCK |
710 MUTEX_NO_RLOCK)) != 0) {
711 __db_shalloc_free(dblp->reginfo.addr, commit);
714 MUTEX_LOCK(dbenv, &commit->mutex);
717 &lp->free_commits, commit, links, __db_commit);
722 * Flushes may be requested out of LSN order; be
723 * sure we only move lp->t_lsn forward.
725 if (log_compare(&lp->t_lsn, &flush_lsn) < 0)
726 lp->t_lsn = flush_lsn;
728 commit->lsn = flush_lsn;
729 SH_TAILQ_INSERT_HEAD(
730 &lp->commits, commit, links, __db_commit);
731 R_UNLOCK(dbenv, &dblp->reginfo);
732 /* Wait here for the in-progress flush to finish. */
733 MUTEX_LOCK(dbenv, &commit->mutex);
734 R_LOCK(dbenv, &dblp->reginfo);
738 * Grab the flag before freeing the struct to see if
739 * we need to flush the log to commit. If so,
740 * use the maximal lsn for any committing thread.
742 do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
743 F_CLR(commit, DB_COMMIT_FLUSH);
744 SH_TAILQ_INSERT_HEAD(
745 &lp->free_commits, commit, links, __db_commit);
748 flush_lsn = lp->t_lsn;
754 * Protect flushing with its own mutex so we can release
755 * the region lock except during file switches.
757 flush: MUTEX_LOCK(dbenv, flush_mutexp);
760 * If the LSN is less than or equal to the last-sync'd LSN, we're done.
761 * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
762 * after the byte we absolutely know was written to disk, so the test
765 if (flush_lsn.file < lp->s_lsn.file ||
766 (flush_lsn.file == lp->s_lsn.file &&
767 flush_lsn.offset < lp->s_lsn.offset)) {
768 MUTEX_UNLOCK(dbenv, flush_mutexp);
773 * We may need to write the current buffer. We have to write the
774 * current buffer if the flush LSN is greater than or equal to the
775 * buffer's starting LSN.
778 if (lp->b_off != 0 && log_compare(&flush_lsn, &lp->f_lsn) >= 0) {
779 if ((ret = __log_write(dblp,
780 dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
781 MUTEX_UNLOCK(dbenv, flush_mutexp);
790 * It's possible that this thread may never have written to this log
791 * file. Acquire a file descriptor if we don't already have one.
792 * One last check -- if we're not writing anything from the current
793 * buffer, don't bother. We have nothing to write and nothing to
796 if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
797 if (!current || (ret = __log_newfh(dblp)) != 0) {
798 MUTEX_UNLOCK(dbenv, flush_mutexp);
803 * We are going to flush, release the region.
804 * First get the current state of the buffer since
805 * another write may come in, but we may not flush it.
812 R_UNLOCK(dbenv, &dblp->reginfo);
814 /* Sync all writes to disk. */
815 if ((ret = __os_fsync(dbenv, &dblp->lfh)) != 0) {
816 MUTEX_UNLOCK(dbenv, flush_mutexp);
818 R_LOCK(dbenv, &dblp->reginfo);
819 ret = __db_panic(dbenv, ret);
824 * Set the last-synced LSN.
825 * This value must be set to the LSN past the last complete
826 * record that has been flushed. This is at least the first
827 * lsn, f_lsn. If the buffer is empty, b_off == 0, then
828 * we can move up to write point since the first lsn is not
829 * set for the new buffer.
833 lp->s_lsn.offset = w_off;
835 MUTEX_UNLOCK(dbenv, flush_mutexp);
837 R_LOCK(dbenv, &dblp->reginfo);
840 ++lp->stat.st_scount;
843 * How many flush calls (usually commits) did this call actually sync?
844 * At least one, if it got here.
848 if (lp->ncommit != 0) {
850 for (commit = SH_TAILQ_FIRST(&lp->commits, __db_commit);
852 commit = SH_TAILQ_NEXT(commit, links, __db_commit))
853 if (log_compare(&lp->s_lsn, &commit->lsn) > 0) {
854 MUTEX_UNLOCK(dbenv, &commit->mutex);
856 &lp->commits, commit, links, __db_commit);
858 } else if (first == 1) {
859 F_SET(commit, DB_COMMIT_FLUSH);
860 MUTEX_UNLOCK(dbenv, &commit->mutex);
862 &lp->commits, commit, links, __db_commit);
864 * This thread will wake and flush.
865 * If another thread commits and flushes
866 * first we will waste a trip through the
873 if (lp->stat.st_maxcommitperflush < ncommit)
874 lp->stat.st_maxcommitperflush = ncommit;
875 if (lp->stat.st_mincommitperflush > ncommit ||
876 lp->stat.st_mincommitperflush == 0)
877 lp->stat.st_mincommitperflush = ncommit;
884 * Write information into the log.
887 __log_fill(dblp, lsn, addr, len)
894 u_int32_t bsize, nrec;
898 lp = dblp->reginfo.primary;
899 bsize = lp->buffer_size;
901 while (len > 0) { /* Copy out the data. */
903 * If we're beginning a new buffer, note the user LSN to which
904 * the first byte of the buffer belongs. We have to know this
905 * when flushing the buffer so that we know if the in-memory
906 * buffer needs to be flushed.
912 * If we're on a buffer boundary and the data is big enough,
913 * copy as many records as we can directly from the data.
915 if (lp->b_off == 0 && len >= bsize) {
917 if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
919 addr = (u_int8_t *)addr + nrec * bsize;
921 ++lp->stat.st_wcount_fill;
925 /* Figure out how many bytes we can copy this time. */
926 remain = bsize - lp->b_off;
927 nw = remain > len ? len : remain;
928 memcpy(dblp->bufp + lp->b_off, addr, nw);
929 addr = (u_int8_t *)addr + nw;
930 len -= (u_int32_t)nw;
933 /* If we fill the buffer, flush it. */
934 if (lp->b_off == bsize) {
935 if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
938 ++lp->stat.st_wcount_fill;
946 * Write the log buffer to disk.
949 __log_write(dblp, addr, len)
960 lp = dblp->reginfo.primary;
963 * If we haven't opened the log file yet or the current one
964 * has changed, acquire a new log file.
966 if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
967 if ((ret = __log_newfh(dblp)) != 0)
971 * Seek to the offset in the file (someone may have written it
972 * since we last did).
976 &dblp->lfh, 0, 0, lp->w_off, 0, DB_OS_SEEK_SET)) != 0 ||
977 (ret = __os_write(dbenv, &dblp->lfh, addr, len, &nw)) != 0)
980 /* Reset the buffer offset and update the seek offset. */
983 /* Update written statistics. */
984 if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
985 lp->stat.st_w_bytes -= MEGABYTE;
986 ++lp->stat.st_w_mbytes;
988 if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
989 lp->stat.st_wc_bytes -= MEGABYTE;
990 ++lp->stat.st_wc_mbytes;
992 ++lp->stat.st_wcount;
999 * Map a DB_LSN to a file name.
1001 * PUBLIC: int __log_file __P((DB_ENV *, const DB_LSN *, char *, size_t));
1004 __log_file(dbenv, lsn, namep, len)
1015 ENV_REQUIRES_CONFIG(dbenv,
1016 dbenv->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
1018 dblp = dbenv->lg_handle;
1019 R_LOCK(dbenv, &dblp->reginfo);
1020 ret = __log_name(dblp, lsn->file, &name, NULL, 0);
1021 R_UNLOCK(dbenv, &dblp->reginfo);
1025 /* Check to make sure there's enough room and copy the name. */
1026 if (len < strlen(name) + 1) {
1028 __db_err(dbenv, "DB_ENV->log_file: name buffer is too short");
1031 (void)strcpy(namep, name);
1032 __os_free(dbenv, name);
1039 * Acquire a file handle for the current log file.
1050 dbenv = dblp->dbenv;
1051 lp = dblp->reginfo.primary;
1053 /* Close any previous file descriptor. */
1054 if (F_ISSET(&dblp->lfh, DB_FH_VALID))
1055 (void)__os_closehandle(dbenv, &dblp->lfh);
1058 * Get the path of the new file and open it.
1060 * Adding DB_OSO_LOG to the flags may add additional platform-specific
1061 * optimizations. On WinNT, the logfile is preallocated, which may
1062 * have a time penalty at startup, but have better overall throughput.
1063 * We are not certain that this works reliably, so enable at your own
1067 * Initialize the log file size. This is a hack to push the log's
1068 * maximum size down into the Windows __os_open routine, because it
1069 * wants to pre-allocate it.
1071 dblp->lfname = lp->lsn.file;
1072 dblp->lfh.log_size = lp->log_size;
1073 if ((ret = __log_name(dblp, dblp->lfname,
1075 DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ |
1076 (F_ISSET(dbenv, DB_ENV_DIRECT_LOG) ? DB_OSO_DIRECT : 0))) != 0)
1078 "DB_ENV->log_put: %s: %s", name, db_strerror(ret));
1080 __os_free(dbenv, name);
1086 * Return the log name for a particular file, and optionally open it.
1088 * PUBLIC: int __log_name __P((DB_LOG *,
1089 * PUBLIC: u_int32_t, char **, DB_FH *, u_int32_t));
1092 __log_name(dblp, filenumber, namep, fhp, flags)
1094 u_int32_t filenumber, flags;
1102 char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
1104 dbenv = dblp->dbenv;
1105 lp = dblp->reginfo.primary;
1109 * The semantics of this routine are bizarre.
1111 * The reason for all of this is that we need a place where we can
1112 * intercept requests for log files, and, if appropriate, check for
1113 * both the old-style and new-style log file names. The trick is
1114 * that all callers of this routine that are opening the log file
1115 * read-only want to use an old-style file name if they can't find
1116 * a match using a new-style name. The only down-side is that some
1117 * callers may check for the old-style when they really don't need
1118 * to, but that shouldn't mess up anything, and we only check for
1119 * the old-style name when we've already failed to find a new-style
1122 * Create a new-style file name, and if we're not going to open the
1123 * file, return regardless.
1125 (void)snprintf(new, sizeof(new), LFNAME, filenumber);
1126 if ((ret = __db_appname(dbenv,
1127 DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhp == NULL)
1130 /* Open the new-style file -- if we succeed, we're done. */
1131 if ((ret = __os_open(dbenv, *namep, flags, lp->persist.mode, fhp)) == 0)
1135 * The open failed... if the DB_RDONLY flag isn't set, we're done,
1136 * the caller isn't interested in old-style files.
1138 if (!LF_ISSET(DB_OSO_RDONLY)) {
1140 "%s: log file open failed: %s", *namep, db_strerror(ret));
1141 return (__db_panic(dbenv, ret));
1144 /* Create an old-style file name. */
1145 (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
1146 if ((ret = __db_appname(dbenv, DB_APP_LOG, old, 0, NULL, &oname)) != 0)
1150 * Open the old-style file -- if we succeed, we're done. Free the
1151 * space allocated for the new-style name and return the old-style
1152 * name to the caller.
1154 if ((ret = __os_open(dbenv,
1155 oname, flags, lp->persist.mode, fhp)) == 0) {
1156 __os_free(dbenv, *namep);
1162 * Couldn't find either style of name -- return the new-style name
1163 * for the caller's error message. If it's an old-style name that's
1164 * actually missing we're going to confuse the user with the error
1165 * message, but that implies that not only were we looking for an
1166 * old-style name, but we expected it to exist and we weren't just
1167 * looking for any log file. That's not a likely error.
1169 err: __os_free(dbenv, oname);
1175 * Short-circuit way for replication clients to put records into the
1176 * log. Replication clients' logs need to be laid out exactly as their masters'
1177 * are, so we let replication take responsibility for when the log gets
1178 * flushed, when log switches files, etc. This is just a thin PUBLIC wrapper
1179 * for __log_putr with a slightly prettier interface.
1181 * Note that the log region mutex should be held when this is called.
1183 * PUBLIC: int __log_rep_put __P((DB_ENV *, DB_LSN *, const DBT *));
1186 __log_rep_put(dbenv, lsnp, rec)
1191 DB_CIPHER *db_cipher;
1198 dblp = dbenv->lg_handle;
1199 lp = dblp->reginfo.primary;
1201 memset(&hdr, 0, sizeof(HDR));
1205 db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
1206 if (CRYPTO_ON(dbenv))
1207 t.size += db_cipher->adj_size(rec->size);
1208 if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
1211 memcpy(t.data, rec->data, rec->size);
1213 if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, rec->size)) != 0)
1215 __db_chksum(t.data, t.size,
1216 (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
1218 DB_ASSERT(log_compare(lsnp, &lp->lsn) == 0);
1219 ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
1222 __os_free(dbenv, t.data);
1227 __log_encrypt_record(dbenv, dbt, hdr, orig)
1233 DB_CIPHER *db_cipher;
1236 if (CRYPTO_ON(dbenv)) {
1237 db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
1238 hdr->size = HDR_CRYPTO_SZ;
1239 hdr->orig_size = orig;
1240 if ((ret = db_cipher->encrypt(dbenv, db_cipher->data,
1241 hdr->iv, dbt->data, dbt->size)) != 0)
1244 hdr->size = HDR_NORMAL_SZ;