2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998, 1999, 2000
5 * Sleepycat Software. All rights reserved.
11 static const char copyright[] =
12 "Copyright (c) 1996-2000\nSleepycat Software Inc. All rights reserved.\n";
13 static const char revid[] =
14 "$Id: env_recover.c,v 11.33 2001/01/04 22:38:42 ubell Exp $";
17 #ifndef NO_SYSTEM_INCLUDES
18 #include <sys/types.h>
20 #if TIME_WITH_SYS_TIME
36 #include "db_dispatch.h"
41 static float __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
42 static int __log_earliest __P((DB_ENV *, int32_t *, DB_LSN *));
48 * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t));
51 __db_apprec(dbenv, flags)
56 DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, open_lsn;
58 __txn_ckp_args *ckp_args;
62 int is_thread, progress, ret;
65 COMPQUIET(nfiles, (float)0);
68 * Save the state of the thread flag -- we don't need it on at the
69 * moment because we're single-threaded until recovery is complete.
71 is_thread = F_ISSET(dbenv, DB_ENV_THREAD) ? 1 : 0;
72 F_CLR(dbenv, DB_ENV_THREAD);
73 F_SET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
76 * If the user is specifying recover to a particular point in time,
77 * verify that the logs present are sufficient to do this.
80 if (dbenv->tx_timestamp != 0) {
81 if ((ret = __log_earliest(dbenv, &low, &lowlsn)) != 0)
83 if ((int32_t)dbenv->tx_timestamp < low) {
86 strcpy(t1, ctime(&dbenv->tx_timestamp));
88 strcpy(t2, ctime(&tlow));
90 "Invalid recovery timestamp %.*s; earliest time is %.*s",
96 /* Initialize the transaction list. */
97 if ((ret = __db_txnlist_init(dbenv, &txninfo)) != 0)
101 * Recovery is done in three passes:
103 * We need to find the position from which we will open files
104 * We need to open files beginning with the next to last
105 * checkpoint because we might have crashed after writing the
106 * last checkpoint record, but before having written out all
107 * the open file information.
110 * Read forward through the log from the second to last checkpoint
111 * opening and closing files so that at the end of the log we have
112 * the "current" set of files open.
115 * Read backward through the log undoing any uncompleted TXNs.
116 * There are three cases:
117 * 1. If doing catastrophic recovery, we read to the beginning
119 * 2. If we are doing normal recovery, then we have to roll
120 * back to the most recent checkpoint that occurs
121 * before the most recent checkpoint LSN, which is
122 * returned by __log_findckp().
123 * 3. If we are recovering to a point in time, then we have
124 * to roll back to the checkpoint whose ckp_lsn is earlier
125 * than the specified time. __log_earliest will figure
127 * In case 3, "uncompleted TXNs" include all those that committed
128 * after the user's specified timestamp.
131 * Read forward through the log from the LSN found in pass #2,
132 * redoing any committed TXNs (which committed after any user-
133 * specified rollback point). During this pass, checkpoint
134 * file information is ignored, and file openings and closings
139 * Find out the last lsn, so that we can estimate how far along we
140 * are in recovery. This will help us determine how much log there
141 * is between the first LSN that we're going to be working with and
142 * the last one. We assume that each of the three phases takes the
143 * same amount of time (a false assumption) and then use the %-age
144 * of the amount of log traversed to figure out how much of the
145 * pass we've accomplished.
147 memset(&data, 0, sizeof(data));
148 if (dbenv->db_feedback != NULL &&
149 (ret = log_get(dbenv, &last_lsn, &data, DB_LAST)) != 0)
154 * Find the second to last checkpoint in the log. This is the point
155 * from which we want to begin pass #1 (the open files pass).
159 if (LF_ISSET(DB_RECOVER_FATAL)) {
160 if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) {
161 if (ret == DB_NOTFOUND)
164 __db_err(dbenv, "First log record not found");
169 log_get(dbenv, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) {
171 * If we don't find a checkpoint, start from the beginning.
172 * If that fails, we're done. Note, we do not require that
173 * there be log records if we're performing recovery.
175 first: if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) {
176 if (ret == DB_NOTFOUND)
179 __db_err(dbenv, "First log record not found");
183 } else if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) {
184 __db_err(dbenv, "Invalid checkpoint record at [%ld][%ld]\n",
185 (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
187 } else if (IS_ZERO_LSN(ckp_args->last_ckp) ||
188 (ret = log_get(dbenv, &ckp_args->last_ckp, &data, DB_SET)) != 0)
191 open_lsn = ckp_args->last_ckp;
193 if (dbenv->db_feedback != NULL) {
194 if (last_lsn.file == open_lsn.file)
195 nfiles = (float)(last_lsn.offset - open_lsn.offset) /
198 nfiles = (float)(last_lsn.file - open_lsn.file) +
199 (float)(dbenv->lg_max - open_lsn.offset +
200 last_lsn.offset) / dbenv->lg_max;
201 /* We are going to divide by nfiles; make sure it isn't 0. */
203 nfiles = (float)0.001;
208 * Now, ckp_lsn is either the lsn of the last checkpoint
209 * or the lsn of the first record in the log. Open_lsn is
210 * the second to last checkpoint or the beginning of the log;
211 * begin the open files pass from that lsn, and proceed to
212 * the end of the log.
216 if (dbenv->db_feedback != NULL) {
217 progress = (int)(33 * (__lsn_diff(&open_lsn,
218 &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles));
219 dbenv->db_feedback(dbenv, DB_RECOVER, progress);
221 ret = __db_dispatch(dbenv,
222 &data, &lsn, DB_TXN_OPENFILES, txninfo);
223 if (ret != 0 && ret != DB_TXN_CKP)
225 if ((ret = log_get(dbenv, &lsn, &data, DB_NEXT)) != 0) {
226 if (ret == DB_NOTFOUND)
235 * Before we can begin pass #2, backward roll phase, we determine how
236 * far back in the log to recover. If we are doing catastrophic
237 * recovery, then we go as far back as we have files. If we are
238 * doing normal recovery, we go back to the most recent checkpoint
239 * that occurs before the most recent checkpoint LSN. If we are
240 * recovering to a point in time, then roll back to the checkpoint whose
241 * ckp_lsn precedes the first log record (and then roll forward to
242 * the appropriate timestamp in Pass #3).
244 if (LF_ISSET(DB_RECOVER_FATAL)) {
246 } else if (dbenv->tx_timestamp != 0)
249 if ((ret = __log_findckp(dbenv, &first_lsn)) == DB_NOTFOUND) {
251 * We don't require that log files exist if recovery
258 if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
259 __db_err(dbenv, "Recovery starting from [%lu][%lu]",
260 (u_long)first_lsn.file, (u_long)first_lsn.offset);
262 for (ret = log_get(dbenv, &lsn, &data, DB_LAST);
263 ret == 0 && log_compare(&lsn, &first_lsn) > 0;
264 ret = log_get(dbenv, &lsn, &data, DB_PREV)) {
265 if (dbenv->db_feedback != NULL) {
266 progress = 34 + (int)(33 * (__lsn_diff(&open_lsn,
267 &last_lsn, &lsn, dbenv->lg_max, 0) / nfiles));
268 dbenv->db_feedback(dbenv, DB_RECOVER, progress);
270 ret = __db_dispatch(dbenv,
271 &data, &lsn, DB_TXN_BACKWARD_ROLL, txninfo);
273 if (ret != DB_TXN_CKP)
279 if (ret != 0 && ret != DB_NOTFOUND)
285 for (ret = log_get(dbenv, &lsn, &data, DB_NEXT);
286 ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) {
287 if (dbenv->db_feedback != NULL) {
288 progress = 67 + (int)(33 * (__lsn_diff(&open_lsn,
289 &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles));
290 dbenv->db_feedback(dbenv, DB_RECOVER, progress);
292 ret = __db_dispatch(dbenv,
293 &data, &lsn, DB_TXN_FORWARD_ROLL, txninfo);
295 if (ret != DB_TXN_CKP)
301 if (ret != DB_NOTFOUND)
305 * Process any pages that were on the limbo list
306 * and move them to the free list. Do this
307 * before checkpointing the database.
309 if ((ret = __db_do_the_limbo(dbenv, txninfo)) != 0)
313 * Now set the last checkpoint lsn and the current time,
314 * take a checkpoint, and reset the txnid.
317 region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary;
318 region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
319 region->last_ckp = ckp_lsn;
320 region->time_ckp = (u_int32_t)now;
323 * Take two checkpoints so that we don't re-recover any of the
324 * work we've already done.
326 if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0)
329 /* Now close all the db files that are open. */
330 __log_close_files(dbenv);
332 if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0)
334 region->last_txnid = TXN_MINIMUM;
336 if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
337 __db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
338 __db_err(dbenv, "%s %lx %s [%lu][%lu]",
339 "Maximum transaction ID",
340 ((DB_TXNHEAD *)txninfo)->maxid,
341 "Recovery checkpoint",
342 (u_long)region->last_ckp.file,
343 (u_long)region->last_ckp.offset);
347 msgerr: __db_err(dbenv, "Recovery function for LSN %lu %lu failed",
348 (u_long)lsn.file, (u_long)lsn.offset);
352 F_SET(dbenv, DB_ENV_THREAD);
353 __db_txnlist_end(dbenv, txninfo);
354 if (ckp_args != NULL)
355 __os_free(ckp_args, sizeof(*ckp_args));
356 F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
358 dbenv->tx_timestamp = 0;
363 * Figure out how many logfiles we have processed. If we are moving
364 * forward (is_forward != 0), then we're computing current - low. If
365 * we are moving backward, we are computing high - current. max is
366 * the number of bytes per logfile.
369 __lsn_diff(low, high, current, max, is_forward)
370 DB_LSN *low, *high, *current;
377 * There are three cases in each direction. If you are in the
378 * same file, then all you need worry about is the difference in
379 * offsets. If you are in different files, then your offsets
380 * put you either more or less than the integral difference in the
381 * number of files -- we need to handle both of these.
384 if (current->file == low->file)
385 nf = (float)(current->offset - low->offset) / max;
386 else if (current->offset < low->offset)
387 nf = (float)(current->file - low->file - 1) +
388 (float)(max - low->offset + current->offset) / max;
390 nf = (float)(current->file - low->file) +
391 (float)(current->offset - low->offset) / max;
393 if (current->file == high->file)
394 nf = (float)(high->offset - current->offset) / max;
395 else if (current->offset > high->offset)
396 nf = (float)(high->file - current->file - 1) +
397 (float)(max - current->offset + high->offset) / max;
399 nf = (float)(high->file - current->file) +
400 (float)(high->offset - current->offset) / max;
408 * Return the earliest recovery point for the log files present. The
409 * earliest recovery time is the time stamp of the first checkpoint record
410 * whose checkpoint LSN is greater than the first LSN we process.
413 __log_earliest(dbenv, lowtime, lowlsn)
418 DB_LSN first_lsn, lsn;
420 __txn_ckp_args *ckpargs;
424 memset(&data, 0, sizeof(data));
426 * Read forward through the log looking for the first checkpoint
427 * record whose ckp_lsn is greater than first_lsn.
430 for (ret = log_get(dbenv, &first_lsn, &data, DB_FIRST);
431 ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) {
434 memcpy(&rectype, data.data, sizeof(rectype));
435 if (rectype != DB_txn_ckp)
437 if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) == 0) {
438 cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn);
439 *lowlsn = ckpargs->ckp_lsn;
440 *lowtime = ckpargs->timestamp;
442 __os_free(ckpargs, 0);