txn/txn_chkpt.c

   1 /*-
   2  * See the file LICENSE for redistribution information.
   3  *
   4  * Copyright (c) 1996-2009 Oracle.  All rights reserved.
   5  */
   6 /*
   7  * Copyright (c) 1995, 1996
   8  *      The President and Fellows of Harvard University.  All rights reserved.
   9  *
  10  * This code is derived from software contributed to Berkeley by
  11  * Margo Seltzer.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. Neither the name of the University nor the names of its contributors
  22  *    may be used to endorse or promote products derived from this software
  23  *    without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  * $Id$
  38  */
  39
  40 #include "db_config.h"
  41
  42 #include "db_int.h"
  43 #include "dbinc/log.h"
  44 #include "dbinc/mp.h"
  45 #include "dbinc/txn.h"
  46
  47 /*
  48  * __txn_checkpoint_pp --
  49  *      ENV->txn_checkpoint pre/post processing.
  50  *
  51  * PUBLIC: int __txn_checkpoint_pp
  52  * PUBLIC:     __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
  53  */
  54 int
  55 __txn_checkpoint_pp(dbenv, kbytes, minutes, flags)
  56         DB_ENV *dbenv;
  57         u_int32_t kbytes, minutes, flags;
  58 {
  59         DB_THREAD_INFO *ip;
  60         ENV *env;
  61         int ret;
  62
  63         env = dbenv->env;
  64
  65         ENV_REQUIRES_CONFIG(env,
  66             env->tx_handle, "txn_checkpoint", DB_INIT_TXN);
  67
  68         /*
  69          * On a replication client, all transactions are read-only; therefore,
  70          * a checkpoint is a null-op.
  71          *
  72          * We permit txn_checkpoint, instead of just rendering it illegal,
  73          * so that an application can just let a checkpoint thread continue
  74          * to operate as it gets promoted or demoted between being a
  75          * master and a client.
  76          */
  77         if (IS_REP_CLIENT(env))
  78                 return (0);
  79
  80         ENV_ENTER(env, ip);
  81         REPLICATION_WRAP(env,
  82             (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret);
  83         ENV_LEAVE(env, ip);
  84         return (ret);
  85 }
  86
  87 /*
  88  * __txn_checkpoint --
  89  *      ENV->txn_checkpoint.
  90  *
  91  * PUBLIC: int __txn_checkpoint
  92  * PUBLIC:      __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
  93  */
  94 int
  95 __txn_checkpoint(env, kbytes, minutes, flags)
  96         ENV *env;
  97         u_int32_t kbytes, minutes, flags;
  98 {
  99         DB_LSN ckp_lsn, last_ckp;
 100         DB_TXNMGR *mgr;
 101         DB_TXNREGION *region;
 102         REGENV *renv;
 103         REGINFO *infop;
 104         time_t last_ckp_time, now;
 105         u_int32_t bytes, id, logflags, mbytes, op;
 106         int ret;
 107
 108         ret = 0;
 109
 110         /*
 111          * A client will only call through here during recovery,
 112          * so just sync the Mpool and go home.  We want to be sure
 113          * that since queue meta pages are not rolled back that they
 114          * are clean in the cache prior to any transaction log
 115          * truncation due to syncup.
 116          */
 117         if (IS_REP_CLIENT(env)) {
 118                 if (MPOOL_ON(env) &&
 119                     (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) {
 120                         __db_err(env, ret,
 121                     "txn_checkpoint: failed to flush the buffer cache");
 122                         return (ret);
 123                 }
 124                 return (0);
 125         }
 126
 127         mgr = env->tx_handle;
 128         region = mgr->reginfo.primary;
 129         infop = env->reginfo;
 130         renv = infop->primary;
 131         /*
 132          * No mutex is needed as envid is read-only once it is set.
 133          */
 134         id = renv->envid;
 135
 136         /*
 137          * The checkpoint LSN is an LSN such that all transactions begun before
 138          * it are complete.  Our first guess (corrected below based on the list
 139          * of active transactions) is the last-written LSN.
 140          */
 141         if ((ret = __log_current_lsn(env, &ckp_lsn, &mbytes, &bytes)) != 0)
 142                 return (ret);
 143
 144         if (!LF_ISSET(DB_FORCE)) {
 145                 /* Don't checkpoint a quiescent database. */
 146                 if (bytes == 0 && mbytes == 0)
 147                         return (0);
 148
 149                 /*
 150                  * If either kbytes or minutes is non-zero, then only take the
 151                  * checkpoint if more than "minutes" minutes have passed or if
 152                  * more than "kbytes" of log data have been written since the
 153                  * last checkpoint.
 154                  */
 155                 if (kbytes != 0 &&
 156                     mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes)
 157                         goto do_ckp;
 158
 159                 if (minutes != 0) {
 160                         (void)time(&now);
 161
 162                         TXN_SYSTEM_LOCK(env);
 163                         last_ckp_time = region->time_ckp;
 164                         TXN_SYSTEM_UNLOCK(env);
 165
 166                         if (now - last_ckp_time >= (time_t)(minutes * 60))
 167                                 goto do_ckp;
 168                 }
 169
 170                 /*
 171                  * If we checked time and data and didn't go to checkpoint,
 172                  * we're done.
 173                  */
 174                 if (minutes != 0 || kbytes != 0)
 175                         return (0);
 176         }
 177
 178         /*
 179          * We must single thread checkpoints otherwise the chk_lsn may get out
 180          * of order.  We need to capture the start of the earliest currently
 181          * active transaction (chk_lsn) and then flush all buffers.  While
 182          * doing this we we could then be overtaken by another checkpoint that
 183          * sees a later chk_lsn but competes first.  An archive process could
 184          * then remove a log this checkpoint depends on.
 185          */
 186 do_ckp:
 187         MUTEX_LOCK(env, region->mtx_ckp);
 188         if ((ret = __txn_getactive(env, &ckp_lsn)) != 0)
 189                 goto err;
 190
 191         /*
 192          * Checkpoints in replication groups can cause performance problems.
 193          *
 194          * As on the master, checkpoint on the replica requires the cache be
 195          * flushed.  The problem occurs when a client has dirty cache pages
 196          * to write when the checkpoint record arrives, and the client's PERM
 197          * response is necessary in order to meet the system's durability
 198          * guarantees.  In this case, the master will have to wait until the
 199          * client completes its cache flush and writes the checkpoint record
 200          * before subsequent transactions can be committed.  The delay may
 201          * cause transactions to timeout waiting on client response, which
 202          * can cause nasty ripple effects in the system's overall throughput.
 203          * [#15338]
 204          *
 205          * First, we send a start-sync record when the checkpoint starts so
 206          * clients can start flushing their cache in preparation for the
 207          * arrival of the checkpoint record.
 208          */
 209         if (LOGGING_ON(env) && IS_REP_MASTER(env)) {
 210 #ifdef HAVE_REPLICATION_THREADS
 211                 /*
 212                  * If repmgr is configured in the shared environment (which we
 213                  * know if we have a local host address), but no send() function
 214                  * configured for this process, assume we have a
 215                  * replication-unaware process that wants to automatically
 216                  * participate in replication (i.e., sending replication
 217                  * messages to clients).
 218                  */
 219                 if (env->rep_handle->send == NULL &&
 220                     F_ISSET(env, ENV_THREAD) &&
 221                     env->rep_handle->region->my_addr.host != INVALID_ROFF &&
 222                     (ret = __repmgr_autostart(env)) != 0)
 223                         goto err;
 224 #endif
 225                 if (env->rep_handle->send != NULL)
 226                         (void)__rep_send_message(env, DB_EID_BROADCAST,
 227                             REP_START_SYNC, &ckp_lsn, NULL, 0, 0);
 228         }
 229
 230         /* Flush the cache. */
 231         if (MPOOL_ON(env) &&
 232             (ret = __memp_sync_int(
 233                 env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) {
 234                 __db_err(env, ret,
 235                     "txn_checkpoint: failed to flush the buffer cache");
 236                 goto err;
 237         }
 238
 239         /*
 240          * The client won't have more dirty pages to flush from its cache than
 241          * the master did, but there may be differences between the hardware,
 242          * I/O configuration and workload on the master and the client that
 243          * can result in the client being unable to finish its cache flush as
 244          * fast as the master.  A way to avoid the problem is to pause after
 245          * the master completes its checkpoint and before the actual checkpoint
 246          * record is logged, giving the replicas additional time to finish.
 247          *
 248          * !!!
 249          * Currently turned off when testing, because it makes the test suite
 250          * take a long time to run.
 251          */
 252 #ifndef CONFIG_TEST
 253         if (LOGGING_ON(env) &&
 254             IS_REP_MASTER(env) && env->rep_handle->send != NULL &&
 255             !LF_ISSET(DB_CKP_INTERNAL) &&
 256             env->rep_handle->region->chkpt_delay != 0)
 257                 __os_yield(env, 0, env->rep_handle->region->chkpt_delay);
 258 #endif
 259
 260         /*
 261          * Because we can't be a replication client here, and because
 262          * recovery (somewhat unusually) calls txn_checkpoint and expects
 263          * it to write a log message, LOGGING_ON is the correct macro here.
 264          */
 265         if (LOGGING_ON(env)) {
 266                 TXN_SYSTEM_LOCK(env);
 267                 last_ckp = region->last_ckp;
 268                 TXN_SYSTEM_UNLOCK(env);
 269                 /*
 270                  * Put out records for the open files before we log
 271                  * the checkpoint.  The records are certain to be at
 272                  * or after ckp_lsn, but before the checkpoint record
 273                  * itself, so they're sure to be included if we start
 274                  * recovery from the ckp_lsn contained in this
 275                  * checkpoint.
 276                  */
 277                 logflags = DB_LOG_CHKPNT;
 278                 /*
 279                  * If this is a normal checkpoint, log files as checkpoints.
 280                  * If we are recovering, only log as DBREG_RCLOSE if
 281                  * there are no prepared txns.  Otherwise, it should
 282                  * stay as DBREG_CHKPNT.
 283                  */
 284                 op = DBREG_CHKPNT;
 285                 if (!IS_RECOVERING(env))
 286                         logflags |= DB_FLUSH;
 287                 else if (region->stat.st_nrestores == 0)
 288                         op = DBREG_RCLOSE;
 289                 if ((ret = __dbreg_log_files(env, op)) != 0 ||
 290                     (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags,
 291                     &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) {
 292                         __db_err(env, ret,
 293                             "txn_checkpoint: log failed at LSN [%ld %ld]",
 294                             (long)ckp_lsn.file, (long)ckp_lsn.offset);
 295                         goto err;
 296                 }
 297
 298                 if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0)
 299                         goto err;
 300         }
 301
 302 err:    MUTEX_UNLOCK(env, region->mtx_ckp);
 303         return (ret);
 304 }
 305
 306 /*
 307  * __txn_getactive --
 308  *       Find the oldest active transaction and figure out its "begin" LSN.
 309  *       This is the lowest LSN we can checkpoint, since any record written
 310  *       after it may be involved in a transaction and may therefore need
 311  *       to be undone in the case of an abort.
 312  *
 313  *       We check both the file and offset for 0 since the lsn may be in
 314  *       transition.  If it is then we don't care about this txn because it
 315  *       must be starting after we set the initial value of lsnp in the caller.
 316  *       All txns must initalize their begin_lsn before writing to the log.
 317  *
 318  * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *));
 319  */
 320 int
 321 __txn_getactive(env, lsnp)
 322         ENV *env;
 323         DB_LSN *lsnp;
 324 {
 325         DB_TXNMGR *mgr;
 326         DB_TXNREGION *region;
 327         TXN_DETAIL *td;
 328
 329         mgr = env->tx_handle;
 330         region = mgr->reginfo.primary;
 331
 332         TXN_SYSTEM_LOCK(env);
 333         SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
 334                 if (td->begin_lsn.file != 0 &&
 335                     td->begin_lsn.offset != 0 &&
 336                     LOG_COMPARE(&td->begin_lsn, lsnp) < 0)
 337                         *lsnp = td->begin_lsn;
 338         TXN_SYSTEM_UNLOCK(env);
 339
 340         return (0);
 341 }
 342
 343 /*
 344  * __txn_getckp --
 345  *      Get the LSN of the last transaction checkpoint.
 346  *
 347  * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *));
 348  */
 349 int
 350 __txn_getckp(env, lsnp)
 351         ENV *env;
 352         DB_LSN *lsnp;
 353 {
 354         DB_LSN lsn;
 355         DB_TXNMGR *mgr;
 356         DB_TXNREGION *region;
 357
 358         mgr = env->tx_handle;
 359         region = mgr->reginfo.primary;
 360
 361         TXN_SYSTEM_LOCK(env);
 362         lsn = region->last_ckp;
 363         TXN_SYSTEM_UNLOCK(env);
 364
 365         if (IS_ZERO_LSN(lsn))
 366                 return (DB_NOTFOUND);
 367
 368         *lsnp = lsn;
 369         return (0);
 370 }
 371
 372 /*
 373  * __txn_updateckp --
 374  *      Update the last_ckp field in the transaction region.  This happens
 375  * at the end of a normal checkpoint and also when a replication client
 376  * receives a checkpoint record.
 377  *
 378  * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *));
 379  */
 380 int
 381 __txn_updateckp(env, lsnp)
 382         ENV *env;
 383         DB_LSN *lsnp;
 384 {
 385         DB_TXNMGR *mgr;
 386         DB_TXNREGION *region;
 387
 388         mgr = env->tx_handle;
 389         region = mgr->reginfo.primary;
 390
 391         /*
 392          * We want to make sure last_ckp only moves forward;  since we drop
 393          * locks above and in log_put, it's possible for two calls to
 394          * __txn_ckp_log to finish in a different order from how they were
 395          * called.
 396          */
 397         TXN_SYSTEM_LOCK(env);
 398         if (LOG_COMPARE(&region->last_ckp, lsnp) < 0) {
 399                 region->last_ckp = *lsnp;
 400                 (void)time(&region->time_ckp);
 401         }
 402         TXN_SYSTEM_UNLOCK(env);
 403
 404         return (0);
 405 }