fileops/fop_util.c

   1 /*-
   2  * See the file LICENSE for redistribution information.
   3  *
   4  * Copyright (c) 2001, 2010 Oracle and/or its affiliates.  All rights reserved.
   5  *
   6  * $Id$
   7  */
   8
   9 #include "db_config.h"
  10
  11 #include "db_int.h"
  12 #include "dbinc/db_page.h"
  13 #include "dbinc/db_am.h"
  14 #include "dbinc/hash.h"
  15 #include "dbinc/fop.h"
  16 #include "dbinc/lock.h"
  17 #include "dbinc/mp.h"
  18 #include "dbinc/log.h"
  19 #include "dbinc/txn.h"
  20
     /* Prototypes for the static (file-local) helper functions. */
  21 static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
  22 static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
  23 static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
  24 static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t));
  25 static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
  26                const char *, const char *, const char *, DB_LOCKER *));
  27 static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
  28 static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
  29              const char *, const char *, const char *, DB_LOCKER *));
  30
  31 /*
  32  * Acquire the environment meta-data lock.  The parameters are the
  33  * environment (ENV), the locker id to use in acquiring the lock (ID)
  34  * and a pointer to a DB_LOCK.
  35  *
      * The lock is requested in DB_LOCK_WRITE mode on a u_int32_t object
      * holding the value 1, and only when LOCKING_ON(ENV) is true.  The
      * macro must be expanded inside a function that declares "ret" and
      * has an "err" label: a failing __lock_get stores its return value in
      * ret and jumps to err.
      *
  36  * !!!
  37  * Turn off locking for Critical Path.  The application must do its own
  38  * synchronization of open/create.  Two threads creating and opening a
  39  * file at the same time may have unpredictable results.
      * In that configuration (CRITICALPATH_10266) the macro expands to (0)
      * and no lock is taken.
  40  */
  41 #ifdef CRITICALPATH_10266
  42 #define GET_ENVLOCK(ENV, ID, L) (0)
  43 #else
  44 #define GET_ENVLOCK(ENV, ID, L) do {                                    \
  45         DBT __dbt;                                                      \
  46         u_int32_t __lockval;                                            \
  47                                                                         \
  48         if (LOCKING_ON((ENV))) {                                        \
  49                 __lockval = 1;                                          \
  50                 __dbt.data = &__lockval;                                \
  51                 __dbt.size = sizeof(__lockval);                         \
  52                 if ((ret = __lock_get((ENV), (ID),                      \
  53                     0, &__dbt, DB_LOCK_WRITE, (L))) != 0)               \
  54                         goto err;                                       \
  55         }                                                               \
  56 } while (0)
  57 #endif
  58
     /*
      * Replace the mpool file handle of DB handle D with a fresh one: close
      * the current (D)->mpf passing flags F (the close's return value is
      * deliberately ignored), clear DB_AM_OPEN_CALLED on D, then create a
      * new handle with __memp_fcreate.  If the create fails its return value
      * is stored in the enclosing function's "ret" and control jumps to that
      * function's "err" label, so both must exist where the macro is used.
      */
  59 #define RESET_MPF(D, F) do {                                            \
  60         (void)__memp_fclose((D)->mpf, (F));                             \
  61         (D)->mpf = NULL;                                                \
  62         F_CLR((D), DB_AM_OPEN_CALLED);                                  \
  63         if ((ret = __memp_fcreate((D)->env, &(D)->mpf)) != 0)           \
  64                 goto err;                                               \
  65 } while (0)
  66
  67 /*
  68  * If we open a file handle and our caller is doing fcntl(2) locking,
  69  * we can't close the handle because that would discard the caller's
  70  * lock. Save it until we close or refresh the DB handle.
      *
      * The macro relies on names from the enclosing function: LF_ISSET tests
      * its "flags", a close failure is recorded in "ret" (via "t_ret", and
      * only if ret is still 0) and control then jumps to its "err" label.
      *
      * F is always NULL once the macro is done with it, including when the
      * close fails.  The err code closes the handle again, so leaving F set
      * on the failure path would hand __os_closehandle the same handle twice.
  71  */
  72 #define CLOSE_HANDLE(D, F) {                                            \
  73         if ((F) != NULL) {                                              \
  74                 if (LF_ISSET(DB_FCNTL_LOCKING))                         \
  75                         (D)->saved_open_fhp = (F);                      \
  76                 else if ((t_ret =                                       \
  77                     __os_closehandle((D)->env, (F))) != 0) {            \
                             (F) = NULL;                                     \
  78                         if (ret == 0)                                   \
  79                                 ret = t_ret;                            \
  80                         goto err;                                       \
  81                 }                                                       \
  82                 (F) = NULL;                                             \
  83         }                                                               \
  84 }
  85
  86 /*
  87  * __fop_lock_handle --
  88  *      Acquire the handle lock for a database.  With a non-NULL elockp the
  89  *      environment lock is released and the handle lock acquired through a
  90  *      single __lock_vec call; otherwise a plain __lock_get is used.
  91  *
  92  * PUBLIC: int __fop_lock_handle __P((ENV *,
  93  * PUBLIC:     DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
  94  */
  95 int
  96 __fop_lock_handle(env, dbp, locker, mode, elockp, flags)
  97         ENV *env;
  98         DB *dbp;
  99         DB_LOCKER *locker;
 100         db_lockmode_t mode;
 101         DB_LOCK *elockp;
 102         u_int32_t flags;
 103 {
 104         DB_LOCK_ILOCK hdesc;
 105         DB_LOCKREQ lreq[2], *failed;
 106         DBT obj;
 107         int ret;
 108
 109         /* Nothing to do without locking or for these handle types. */
 110         if (!LOCKING_ON(env))
 111                 return (0);
 112         if (F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
 113                 return (0);
 114
 115         /* In recovery only the global environment lock is dealt with. */
 116         if (IS_RECOVERING(env)) {
 117                 if (elockp == NULL)
 118                         return (0);
 119                 return (__ENV_LPUT(env, *elockp));
 120         }
 121
 122         /* Lock object: file id plus meta page, typed as a handle lock. */
 123         hdesc.type = DB_HANDLE_LOCK;
 124         hdesc.pgno = dbp->meta_pgno;
 125         memcpy(hdesc.fileid, dbp->fileid, DB_FILE_ID_LEN);
 126         memset(&obj, 0, sizeof(obj));
 127         obj.size = sizeof(hdesc);
 128         obj.data = &hdesc;
 129
 130         DB_TEST_SUBLOCKS(env, flags);
 131         if (elockp != NULL) {
 132                 lreq[0].op = DB_LOCK_PUT;
 133                 lreq[0].lock = *elockp;
 134                 lreq[1].op = DB_LOCK_GET;
 135                 lreq[1].mode = mode;
 136                 lreq[1].obj = &obj;
 137                 lreq[1].timeout = 0;
 138                 ret = __lock_vec(env, locker, flags, lreq, 2, &failed);
 139                 if (ret == 0)
 140                         dbp->handle_lock = lreq[1].lock;
 141                 /* Reset *elockp unless the reported request is the PUT. */
 142                 if (ret == 0 || failed != lreq)
 143                         LOCK_INIT(*elockp);
 144         } else
 145                 ret = __lock_get(env, locker,
 146                     flags, &obj, mode, &dbp->handle_lock);
 147
 148         dbp->cur_locker = locker;
 149         return (ret);
 150 }
 151
 152 /*
 153  * __fop_file_setup --
 154  *
 155  * Perform all the needed checking and locking to open up or create a
 156  * file.
 157  *
 158  * There's a reason we don't push this code down into the buffer cache.
 159  * The problem is that there's no information external to the file that
 160  * we can use as a unique ID.  UNIX has dev/inode pairs, but they are
 161  * not necessarily unique after reboot, if the file was mounted via NFS.
 162  * Windows has similar problems, as the FAT filesystem doesn't maintain
 163  * dev/inode numbers across reboot.  So, we must get something from the
 164  * file we can use to ensure that, even after a reboot, the file we're
 165  * joining in the cache is the right file for us to join.  The solution
 166  * we use is to maintain a file ID that's stored in the database, and
 167  * that's why we have to open and read the file before calling into the
 168  * buffer cache or obtaining a lock (we use this unique fileid to lock
 169  * as well as to identify like files in the cache).
 170  *
 171  * There are a couple of idiosyncrasies that this code must support, in
 172  * particular, DB_TRUNCATE and DB_FCNTL_LOCKING.  First, we disallow
 173  * DB_TRUNCATE in the presence of transactions, since opening a file with
 174  * O_TRUNC will result in data being lost in an unrecoverable fashion.
 175  * We also disallow DB_TRUNCATE if locking is enabled, because even in
 176  * the presence of locking, we cannot avoid race conditions, so allowing
 177  * DB_TRUNCATE with locking would be misleading.  See SR [#7345] for more
 178  * details.
 179  *
 180  * However, if you are running with neither locking nor transactions, then
 181  * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
 182  * regardless of its contents.
 183  *
 184  * FCNTL locking introduces another set of complications.  First, the only
 185  * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility
 186  * with programs like Sendmail and Postfix.  In these cases, the caller may
 187  * already have a lock on the file; we need to make sure that any file handles
 188  * we open remain open, because if we were to close them, the lock held by the
 189  * caller would go away.  Furthermore, Sendmail and/or Postfix need the ability
 190  * to create databases in empty files.  So, when you're doing FCNTL locking,
 191  * it's reasonable that you are trying to create a database into a 0-length
 192  * file and we allow it, while under normal conditions, we do not create
 193  * databases if the files already exist and are not Berkeley DB files.
 194  *
 195  * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
 196  * PUBLIC:     DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
 197  */
 198 int
 199 __fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
 200         DB *dbp;
 201         DB_THREAD_INFO *ip;
 202         DB_TXN *txn;
 203         const char *name;
 204         int mode;
 205         u_int32_t flags, *retidp;
 206 {
 207         DBTYPE save_type;
 208         DB_FH *fhp;
 209         DB_LOCK elock;
 210         DB_LOCKER *locker;
 211         DB_TXN *stxn;
 212         ENV *env;
 213         size_t len;
 214         u_int32_t dflags, oflags;
 215         u_int8_t mbuf[DBMETASIZE];
 216         int created_locker, create_ok, ret, retries, t_ret, tmp_created;
 217         int truncating, was_inval;
 218         char *real_name, *real_tmpname, *tmpname;
 219
 220         *retidp = TXN_INVALID;
 221
 222         env = dbp->env;
 223         fhp = NULL;
 224         LOCK_INIT(elock);
 225         stxn = NULL;
 226         created_locker = tmp_created = truncating = was_inval = 0;
 227         real_name = real_tmpname = tmpname = NULL;
 228         dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
 229
 230         ret = 0;
 231         retries = 0;
 232         save_type = dbp->type;
 233
 234         /*
 235          * Get a lockerid for this handle.  There are paths through queue
 236          * rename and remove where this dbp already has a locker, so make
 237          * sure we don't clobber it and conflict.
 238          */
 239         if (LOCKING_ON(env) &&
 240             !F_ISSET(dbp, DB_AM_COMPENSATE) &&
 241             !F_ISSET(dbp, DB_AM_RECOVER) &&
 242             dbp->locker == DB_LOCK_INVALIDID) {
 243                 if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
 244                         goto err;
 245                 created_locker = 1;
 246         }
 247         LOCK_INIT(dbp->handle_lock);
 248
 249         locker = txn == NULL ? dbp->locker : txn->locker;
 250
 251         oflags = 0;
 252         if (F_ISSET(dbp, DB_AM_INMEM))
 253                 real_name = (char *)name;
 254         else {
 255                 /* Get the real backing file name. */
 256                 if ((ret = __db_appname(env,
 257                     DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
 258                         goto err;
 259
 260                 /* Fill in the default file mode. */
 261                 if (mode == 0)
 262                         mode = DB_MODE_660;
 263
 264                 if (LF_ISSET(DB_RDONLY))
 265                         oflags |= DB_OSO_RDONLY;
 266                 if (LF_ISSET(DB_TRUNCATE))
 267                         oflags |= DB_OSO_TRUNC;
 268         }
 269
 270         retries = 0;
 271         create_ok = LF_ISSET(DB_CREATE);
 272         LF_CLR(DB_CREATE);
 273
 274 retry:
 275         /*
 276          * If we cannot create the file, only retry a few times.  We
 277          * think we might be in a race with another create, but it could
 278          * be that the backup filename exists (that is, is left over from
 279          * a previous crash).
 280          */
 281         if (++retries > DB_RETRY) {
 282                 __db_errx(env, "__fop_file_setup:  Retry limit (%d) exceeded",
 283                     DB_RETRY);
 284                 goto err;
 285         }
 286         if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
 287                 GET_ENVLOCK(env, locker, &elock);
 288         if (name == NULL)
 289                 ret = ENOENT;
 290         else if (F_ISSET(dbp, DB_AM_INMEM)) {
 291                 ret = __env_mpool(dbp, name, flags);
 292                 /*
 293                  * We are using __env_open as a check for existence.
 294                  * However, __env_mpool does an actual open and there
 295                  * are scenarios where the object exists, but cannot be
 296                  * opened, because our settings don't match those internally.
 297                  * We need to check for that explicitly.  We'll need the
 298                  * mpool open to read the meta-data page, so we're going to
 299                  * have to temporarily turn this dbp into an UNKNOWN one.
 300                  */
 301                 if (ret == EINVAL) {
 302                         was_inval = 1;
 303                         save_type = dbp->type;
 304                         dbp->type = DB_UNKNOWN;
 305                         ret = __env_mpool(dbp, name, flags);
 306                         dbp->type = save_type;
 307                 }
 308         } else
 309                 ret = __os_exists(env, real_name, NULL);
 310
 311         if (ret == 0) {
 312                 /*
 313                  * If the file exists, there are 5 possible cases:
 314                  * 1. DB_EXCL was specified so this is an error, unless
 315                  *      this is a file left around after a rename and we
 316                  *      are in the same transaction.  This gets decomposed
 317                  *      into several subcases, because we check for various
 318                  *      errors before we know we're in rename.
 319                  * 2. We are truncating, and it doesn't matter what kind
 320                  *      of file it is, we should open/create it.
 321                  * 3. It is 0-length, we are not doing transactions (i.e.,
 322                  *      we are sendmail), we should open/create into it.
 323                  *      -- on-disk files only!
 324                  * 4. Is it a Berkeley DB file and we should simply open it.
 325                  * 5. It is not a BDB file and we should return an error.
 326                  */
 327
 328                 /* Open file (if there is one). */
 329 reopen:         if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
 330                     __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
 331                         goto err;
 332
 333                 /* Case 2: DB_TRUNCATE: we must do the creation in place. */
 334                 if (LF_ISSET(DB_TRUNCATE)) {
 335                         if (LF_ISSET(DB_EXCL)) {
 336                                 /* Case 1a: DB_EXCL and DB_TRUNCATE. */
 337                                 ret = EEXIST;
 338                                 goto err;
 339                         }
 340                         tmpname = (char *)name;
 341                         goto creat2;
 342                 }
 343
 344                 /* Cases 1,3-5: we need to read the meta-data page. */
 345                 if (F_ISSET(dbp, DB_AM_INMEM))
 346                         ret = __fop_inmem_read_meta(dbp, txn, name, flags);
 347                 else {
 348                         ret = __fop_read_meta(env, real_name, mbuf,
 349                             sizeof(mbuf), fhp,
 350                             LF_ISSET(DB_NOERROR) ||
 351                             (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
 352                             &len);
 353
 354                         /* Case 3: 0-length, no txns. */
 355                         if (ret != 0 && len == 0 && txn == NULL) {
 356                                 if (LF_ISSET(DB_EXCL)) {
 357                                         /*
 358                                          * Case 1b: DB_EXCL and
 359                                          * 0-length file exists.
 360                                          */
 361                                         ret = EEXIST;
 362                                         goto err;
 363                                 }
 364                                 tmpname = (char *)name;
 365                                 if (create_ok)
 366                                         goto creat2;
 367                                 goto done;
 368                         }
 369
 370                         /* Case 4: This is a valid file. */
 371                         if (ret == 0)
 372                                 ret = __db_meta_setup(env, dbp, real_name,
 373                                     (DBMETA *)mbuf, flags, DB_CHK_META);
 374
 375                 }
 376
 377                 /* Case 5: Invalid file. */
 378                 if (ret != 0)
 379                         goto err;
 380
 381                 /* Now, get our handle lock. */
 382                 if ((ret = __fop_lock_handle(env,
 383                     dbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) == 0) {
 384                         if ((ret = __ENV_LPUT(env, elock)) != 0)
 385                                 goto err;
 386                 } else if (ret != DB_LOCK_NOTGRANTED ||
 387                     (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
 388                         goto err;
 389                 else {
 390                         /*
 391                          * We were unable to acquire the handle lock without
 392                          * blocking.  The fact that we are blocking might mean
 393                          * that someone else is trying to delete the file.
 394                          * Since some platforms cannot delete files while they
 395                          * are open (Windows), we are going to have to close
 396                          * the file.  This would be a problem if we were doing
 397                          * FCNTL locking, because our closing the handle would
 398                          * release the FCNTL locks.  Fortunately, if we are
 399                          * doing FCNTL locking, then we should never fail to
 400                          * acquire our handle lock, so we should never get here.
 401                          * We assert it here to make sure we aren't destroying
 402                          * any application level FCNTL semantics.
 403                          */
 404                         DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
 405                         if (!F_ISSET(dbp, DB_AM_INMEM)) {
 406                                 if ((ret = __os_closehandle(env, fhp)) != 0)
 407                                         goto err;
 408                                 fhp = NULL;
 409                         }
 410                         if ((ret = __fop_lock_handle(env,
 411                             dbp, locker, DB_LOCK_READ, &elock, 0)) != 0) {
 412                                 if (F_ISSET(dbp, DB_AM_INMEM))
 413                                         RESET_MPF(dbp, 0);
 414                                 goto err;
 415                         }
 416
 417                         /*
 418                          * If we had to wait, we might be waiting on a
 419                          * dummy file used in create/destroy of a database.
 420                          * To be sure we have the correct information we
 421                          * try again.
 422                          */
 423                         if ((ret = __db_refresh(dbp,
 424                             txn, DB_NOSYNC, NULL, 1)) != 0)
 425                                 goto err;
 426                         if ((ret =
 427                             __ENV_LPUT(env, dbp->handle_lock)) != 0) {
 428                                 LOCK_INIT(dbp->handle_lock);
 429                                 goto err;
 430                         }
 431                         goto retry;
 432
 433                 }
 434
 435                 /* If we got here, then we have the handle lock. */
 436
 437                 /*
 438                  * Check for a file in the midst of a rename.  If we find that
 439                  * the file is in the midst of a rename, it must be the case
 440                  * that it is in our current transaction (else we would still
 441                  * be blocking), so we can continue along and create a new file
 442                  * with the same name.  In that case, we have to close the file
 443                  * handle because we reuse it below.  This is a case where
 444                  * a 'was_inval' above is OK.
 445                  */
 446                 if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
 447                         was_inval = 0;
 448                         if (create_ok) {
 449                                 if (F_ISSET(dbp, DB_AM_INMEM)) {
 450                                         RESET_MPF(dbp, DB_MPOOL_DISCARD);
 451                                 } else if ((ret =
 452                                     __os_closehandle(env, fhp)) != 0)
 453                                         goto err;
 454                                 LF_SET(DB_CREATE);
 455                                 goto create;
 456                         } else {
 457                                 ret = ENOENT;
 458                                 goto err;
 459                         }
 460                 }
 461
 462                 /* If we get here, a was_inval is bad. */
 463                 if (was_inval) {
 464                         ret = EINVAL;
 465                         goto err;
 466                 }
 467
 468                 /*
 469                  * Now, case 1: check for DB_EXCL, because the file that exists
 470                  * is not in the middle of a rename, so we have an error.  This
 471                  * is a weird case, but we need to make sure that we don't
 472                  * continue to hold the handle lock, since technically, we
 473                  * should not have been allowed to open it.
 474                  */
 475                 if (LF_ISSET(DB_EXCL)) {
 476                         ret = __ENV_LPUT(env, dbp->handle_lock);
 477                         LOCK_INIT(dbp->handle_lock);
 478                         if (ret == 0)
 479                                 ret = EEXIST;
 480                         goto err;
 481                 }
 482                 goto done;
 483         }
 484
 485         /* File does not exist. */
 486 #ifdef  HAVE_VXWORKS
 487         /*
 488          * VxWorks can return file-system specific error codes if the
 489          * file does not exist, not ENOENT.
 490          */
 491         if (!create_ok)
 492 #else
 493         if (!create_ok || ret != ENOENT)
 494 #endif
 495                 goto err;
 496         LF_SET(DB_CREATE);
 497         ret = 0;
 498
 499         /*
 500          * We need to create file, which means that we need to set up the file,
 501          * the fileid and the locks.  Then we need to call the appropriate
 502          * routines to create meta-data pages.  For in-memory files, we retain
 503          * the environment lock, while for on-disk files, we drop the env lock
 504          * and create into a temporary.
 505          */
 506         if (!F_ISSET(dbp, DB_AM_INMEM) &&
 507             (ret = __ENV_LPUT(env, elock)) != 0)
 508                 goto err;
 509
 510 create: if (txn != NULL && IS_REP_CLIENT(env) &&
 511             !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
 512                 __db_errx(env,
 513                     "Transactional create on replication client disallowed");
 514                 ret = EINVAL;
 515                 goto err;
 516         }
 517
 518         if (F_ISSET(dbp, DB_AM_INMEM))
 519                 ret = __fop_inmem_create(dbp, name, txn, flags);
 520         else {
 521                 if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
 522                         goto err;
 523                 if (TXN_ON(env) && txn != NULL &&
 524                     (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
 525                         goto err;
 526                 if ((ret = __fop_create(env, stxn, &fhp,
 527                     tmpname, &dbp->dirname, DB_APP_DATA, mode, dflags)) != 0) {
 528                         /*
 529                          * If no transactions, there is a race on creating the
 530                          * backup file, as the backup file name is the same for
 531                          * all processes.  Wait for the other process to finish
 532                          * with the name.
 533                          */
 534                         if (!TXN_ON(env) && ret == EEXIST) {
 535                                 __os_free(env, tmpname);
 536                                 tmpname = NULL;
 537                                 __os_yield(env, 1, 0);
 538                                 goto retry;
 539                         }
 540                         goto err;
 541                 }
 542                 tmp_created = 1;
 543         }
 544
 545 creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) {
 546                 if ((ret = __db_appname(env,
 547                     DB_APP_DATA, tmpname, &dbp->dirname, &real_tmpname)) != 0)
 548                         goto err;
 549
 550                 /* Set the pagesize if it isn't yet set. */
 551                 if (dbp->pgsize == 0 &&
 552                     (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
 553                         goto errmsg;
 554
 555                 /* Construct a file_id. */
 556                 if ((ret =
 557                     __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
 558                         goto errmsg;
 559         }
 560
 561         if ((ret = __db_new_file(dbp, ip,
 562             F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
 563                 goto err;
 564
 565         /*
 566          * We need to close the handle here on platforms where remove and
 567          * rename fail if a handle is open (including Windows).
 568          */
 569         CLOSE_HANDLE(dbp, fhp);
 570
 571         /*
 572          * Now move the file into place unless we are creating in place (because
 573          * we created a database in a file that started out 0-length).  If
 574          * this is an in-memory file, we may or may not hold the environment
 575          * lock depending on how we got here.
 576          */
 577         if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
 578             !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
 579                 GET_ENVLOCK(env, locker, &elock);
 580
 581         if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
 582                 F_CLR(dbp, DB_AM_IN_RENAME);
 583                 __txn_remrem(env, txn, real_name);
 584         } else if (name == tmpname) {
 585                 /* We created it in place. */
 586         } else if (!F_ISSET(dbp, DB_AM_INMEM) &&
 587             __os_exists(env, real_name, NULL) == 0) {
 588                 /*
 589                  * Someone managed to create the file; remove our temp
 590                  * and try to open the file that now exists.
 591                  */
 592                 (void)__fop_remove(env, NULL,
 593                     dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA, dflags);
 594                 (void)__ENV_LPUT(env, dbp->handle_lock);
 595                 LOCK_INIT(dbp->handle_lock);
 596
 597                 if (stxn != NULL) {
 598                         ret = __txn_abort(stxn);
 599                         stxn = NULL;
 600                 }
 601                 if (ret != 0)
 602                         goto err;
 603                 goto reopen;
 604         }
 605
 606         if (name != NULL && (ret = __fop_lock_handle(env,
 607             dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
 608                 goto err;
 609         if (tmpname != NULL &&
 610             tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
 611             name, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
 612                 goto err;
 613         if ((ret = __ENV_LPUT(env, elock)) != 0)
 614                 goto err;
 615
 616         if (stxn != NULL) {
 617                 *retidp = stxn->txnid;
 618                 ret = __txn_commit(stxn, 0);
 619                 stxn = NULL;
 620         } else
 621                 *retidp = TXN_INVALID;
 622
 623         if (ret != 0)
 624                 goto err;
 625
 626         F_SET(dbp, DB_AM_CREATED);
 627
 628         if (0) {
 629 errmsg:         __db_err(env, ret, "%s", name);
 630
 631 err:            CLOSE_HANDLE(dbp, fhp);
 632                 if (stxn != NULL)
 633                         (void)__txn_abort(stxn);
 634                 if (tmp_created && txn == NULL)
 635                         (void)__fop_remove(env,
 636                             NULL, NULL, tmpname, NULL, DB_APP_DATA, dflags);
 637                 if (txn == NULL)
 638                         (void)__ENV_LPUT(env, dbp->handle_lock);
 639                 (void)__ENV_LPUT(env, elock);
 640                 if (created_locker) {
 641                         (void)__lock_id_free(env, dbp->locker);
 642                         dbp->locker = NULL;
 643                 }
 644         }
 645
 646 done:   /*
 647          * There are cases where real_name and tmpname take on the
 648          * exact same string, so we need to make sure that we do not
 649          * free twice.
 650          */
 651         if (!truncating && tmpname != NULL && tmpname != name)
 652                 __os_free(env, tmpname);
 653         if (real_name != name && real_name != NULL)
 654                 __os_free(env, real_name);
 655         if (real_tmpname != NULL)
 656                 __os_free(env, real_tmpname);
 657         CLOSE_HANDLE(dbp, fhp);
 658
 659         return (ret);
 660 }
 661
 662 /*
 663  * __fop_set_pgsize --
 664  *      Choose a page size for dbp from the file's preferred I/O size.
 665  */
 666 static int
 667 __fop_set_pgsize(dbp, fhp, name)
 668         DB *dbp;
 669         DB_FH *fhp;
 670         const char *name;
 671 {
 672         u_int32_t pgsz;
 673         int ret;
 674
 675         /*
 676          * With no pagesize specified, start from the filesystem's optimum
 677          * I/O size as reported by the OS layer.
 678          */
 679         ret = __os_ioinfo(dbp->env, name, fhp, NULL, NULL, &pgsz);
 680         if (ret != 0) {
 681                 __db_err(dbp->env, ret, "%s", name);
 682                 return (ret);
 683         }
 684
 685         /*
 686          * Keep the value within 512 bytes .. 16K.  Some filesystems report
 687          * 64K, which would result in fairly large default caches.
 688          */
 689         if (pgsz < 512)
 690                 pgsz = 512;
 691         else if (pgsz > 16 * 1024)
 692                 pgsz = 16 * 1024;
 693
 694         /*
 695          * Paranoia: the page size has to be a power-of-2 (alignment of the
 696          * types stored on pages relies on it) and a multiple of the sector
 697          * size.  If the value still looks bad, use the default instead.
 698          */
 699         if (!IS_VALID_PAGESIZE(pgsz))
 700                 pgsz = DB_DEF_IOSIZE;
 701
 702         /* Record the computed size and set the DB_AM_PGDEF flag. */
 703         F_SET(dbp, DB_AM_PGDEF);
 704         dbp->pgsize = pgsz;
 705         return (0);
 706 }
 707
 708 /*
 709  * __fop_subdb_setup --
 710  *
 711  * Subdb setup is significantly simpler than file setup.  In terms of
 712  * locking, for the duration of the operation/transaction, the locks on
 713  * the meta-data page will suffice to protect us from simultaneous operations
 714  * on the sub-database.  Before we complete the operation though, we'll get a
 715  * handle lock on the subdatabase so that on one else can try to remove it
 716  * while we've got it open.  We use an object that looks like the meta-data
 717  * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle.
 718  * locks.
 719  *
 720  * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 721  * PUBLIC:     const char *, const char *, int, u_int32_t));
 722  */
 723 int
 724 __fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
 725         DB *dbp;
 726         DB_THREAD_INFO *ip;
 727         DB_TXN *txn;
 728         const char *mname, *name;
 729         int mode;
 730         u_int32_t flags;
 731 {
 732         DB *mdbp;
 733         ENV *env;
 734         db_lockmode_t lkmode;
 735         int ret, t_ret;
 736
 737         mdbp = NULL;
 738         env = dbp->env;
 739
 740         if ((ret = __db_master_open(dbp,
 741             ip, txn, mname, flags, mode, &mdbp)) != 0)
 742                 return (ret);
 743         /*
 744          * If we created this file, then we need to set the DISCARD flag so
 745          * that if we fail in the middle of this routine, we discard from the
 746          * mpool any pages that we just created.
 747          */
 748         if (F_ISSET(mdbp, DB_AM_CREATED))
 749                 F_SET(mdbp, DB_AM_DISCARD);
 750
 751         /*
 752          * We are going to close this instance of the master, so we can
 753          * steal its handle instead of reopening a handle on the database.
 754          */
 755         if (LF_ISSET(DB_FCNTL_LOCKING)) {
 756                 dbp->saved_open_fhp = mdbp->saved_open_fhp;
 757                 mdbp->saved_open_fhp = NULL;
 758         }
 759
 760         /* Copy the pagesize and set the sub-database flag. */
 761         dbp->pgsize = mdbp->pgsize;
 762         F_SET(dbp, DB_AM_SUBDB);
 763
 764         if (name != NULL && (ret = __db_master_update(mdbp, dbp,
 765             ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0)
 766                 goto err;
 767
 768         /*
 769          * Hijack the master's locker ID as well, so that our locks don't
 770          * conflict with the master's.  Since we're closing the master,
 771          * that locker would just have been freed anyway.  Once we've gotten
 772          * the locker id, we need to acquire the handle lock for this
 773          * subdatabase.
 774          */
 775         dbp->locker = mdbp->locker;
 776         mdbp->locker = NULL;
 777
 778         DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
 779
 780         /*
 781          * We copy our fileid from our master so that we all open
 782          * the same file in mpool.  We'll use the meta-pgno to lock
 783          * so that we end up with different handle locks.
 784          */
 785
 786         memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
 787         lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ?
 788             DB_LOCK_WRITE : DB_LOCK_READ;
 789         if ((ret = __fop_lock_handle(env, dbp,
 790             txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
 791             NOWAIT_FLAG(txn))) != 0)
 792                 goto err;
 793
 794         if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
 795                 /*
 796                  * If there was no transaction and we created this database,
 797                  * then we need to undo the update of the master database.
 798                  */
 799                 if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
 800                         (void)__db_master_update(mdbp, dbp,
 801                             ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
 802                 F_CLR(dbp, DB_AM_CREATED);
 803                 goto err;
 804         }
 805
 806         /*
 807          * XXX
 808          * This should have been done at the top of this routine.  The problem
 809          * is that __db_init_subdb() uses "standard" routines to process the
 810          * meta-data page and set information in the DB handle based on it.
 811          * Those routines have to deal with swapped pages and will normally set
 812          * the DB_AM_SWAP flag.  However, we use the master's metadata page and
 813          * that has already been swapped, so they get the is-swapped test wrong.
 814          */
 815         F_CLR(dbp, DB_AM_SWAP);
 816         F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));
 817
 818         /*
 819          * In the file create case, these happen in separate places so we have
 820          * two different tests.  They end up in the same place for subdbs, but
 821          * for compatibility with file testing, we put them both here anyway.
 822          */
 823         DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
 824         DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);
 825
 826         /*
 827          * File exists and we have the appropriate locks; we should now
 828          * process a normal open.
 829          */
 830         if (F_ISSET(mdbp, DB_AM_CREATED)) {
 831                 F_SET(dbp, DB_AM_CREATED_MSTR);
 832                 F_CLR(mdbp, DB_AM_DISCARD);
 833         }
 834
 835         if (0) {
 836 err:
 837 DB_TEST_RECOVERY_LABEL
 838                 if (txn == NULL)
 839                         (void)__ENV_LPUT(env, dbp->handle_lock);
 840         }
 841
 842         /*
 843          * The master's handle lock is under the control of the
 844          * subdb (it acquired the master's locker).  We want to
 845          * keep the master's handle lock so that no one can remove
 846          * the file while the subdb is open.  If we register the
 847          * trade event and then invalidate the copy of the lock
 848          * in the master's handle, that will accomplish this.  However,
 849          * before we register this event, we'd better remove any
 850          * events that we've already registered for the master.
 851          */
 852         if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
 853                 /* Unregister old master events. */
 854                  __txn_remlock(env,
 855                     txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);
 856
 857                 /* Now register the new event. */
 858                 if ((t_ret = __txn_lockevent(env, txn, dbp,
 859                     &mdbp->handle_lock, dbp->locker == NULL ?
 860                     mdbp->locker : dbp->locker)) != 0 && ret == 0)
 861                         ret = t_ret;
 862         }
 863         LOCK_INIT(mdbp->handle_lock);
 864
 865         /*
 866          * If the master was created, we need to sync so that the metadata
 867          * page is correct on disk for recovery, since it isn't read through
 868          * mpool.  If we're opening a subdb in an existing file, we can skip
 869          * the sync.
 870          */
 871         if ((t_ret = __db_close(mdbp, txn,
 872             F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
 873                 ret = t_ret;
 874
 875         return (ret);
 876 }
 877
 878 /*
 879  * __fop_remove_setup --
 880  *      Open handle appropriately and lock for removal of a database file.
 881  *
 882  * PUBLIC: int __fop_remove_setup __P((DB *,
 883  * PUBLIC:      DB_TXN *, const char *, u_int32_t));
 884  */
 885 int
 886 __fop_remove_setup(dbp, txn, name, flags)
 887         DB *dbp;
 888         DB_TXN *txn;
 889         const char *name;
 890         u_int32_t flags;
 891 {
 892         DB_FH *fhp;
 893         DB_LOCK elock;
 894         ENV *env;
 895         u_int8_t mbuf[DBMETASIZE];
 896         int ret;
 897
 898         COMPQUIET(flags, 0);
 899
 900         env = dbp->env;
 901
 902         LOCK_INIT(elock);
 903         fhp = NULL;
 904         ret = 0;
 905
 906         /* Create locker if necessary. */
 907 retry:  if (LOCKING_ON(env)) {
 908                 if (txn != NULL)
 909                         dbp->locker = txn->locker;
 910                 else if (dbp->locker == DB_LOCK_INVALIDID) {
 911                         if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
 912                                 goto err;
 913                 }
 914         }
 915
 916         /*
 917          * We are about to open a file handle and then possibly close it.
 918          * We cannot close handles if we are doing FCNTL locking.  However,
 919          * there is no way to pass the FCNTL flag into this routine via the
 920          * user API.  The only way we can get in here and be doing FCNTL
 921          * locking is if we are trying to clean up an open that was called
 922          * with FCNTL locking.  In that case, the save_fhp should already be
 923          * set.  So, we use that field to tell us if we need to make sure
 924          * that we shouldn't close the handle.
 925          */
 926         fhp = dbp->saved_open_fhp;
 927         DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
 928
 929         /*
 930          * Lock environment to protect file open.  That will enable us to
 931          * read the meta-data page and get the fileid so that we can lock
 932          * the handle.
 933          */
 934         GET_ENVLOCK(env, dbp->locker, &elock);
 935
 936         /* Open database. */
 937         if (F_ISSET(dbp, DB_AM_INMEM)) {
 938                 if ((ret = __env_mpool(dbp, name, flags)) == 0)
 939                         ret = __os_strdup(env, name, &dbp->dname);
 940         } else if (fhp == NULL)
 941                 ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
 942         if (ret != 0)
 943                 goto err;
 944
 945         /* Get meta-data */
 946         if (F_ISSET(dbp, DB_AM_INMEM))
 947                 ret = __fop_inmem_read_meta(dbp, txn, name, flags);
 948         else if ((ret = __fop_read_meta(env,
 949             name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
 950                 ret = __db_meta_setup(env, dbp,
 951                     name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
 952         if (ret != 0)
 953                 goto err;
 954
 955         /*
 956          * Now, get the handle lock.  We first try with NOWAIT, because if
 957          * we have to wait, we're going to have to close the file and reopen
 958          * it, so that if there is someone else removing it, our open doesn't
 959          * prevent that.
 960          */
 961         if ((ret = __fop_lock_handle(env,
 962             dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
 963                 /*
 964                  * Close the file, block on the lock, clean up the dbp, and
 965                  * then start all over again.
 966                  */
 967                 if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
 968                         (void)__os_closehandle(env, fhp);
 969                         fhp = NULL;
 970                 }
 971                 if (ret != DB_LOCK_NOTGRANTED ||
 972                     (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
 973                         goto err;
 974                 else if ((ret = __fop_lock_handle(env,
 975                     dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
 976                         goto err;
 977
 978                 if (F_ISSET(dbp, DB_AM_INMEM)) {
 979                         (void)__lock_put(env, &dbp->handle_lock);
 980                         (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
 981                 } else {
 982                         if (txn != NULL)
 983                                 dbp->locker = NULL;
 984                         (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
 985                 }
 986                 goto retry;
 987         } else if ((ret = __ENV_LPUT(env, elock)) != 0)
 988                 goto err;
 989         else if (F_ISSET(dbp, DB_AM_IN_RENAME))
 990                 ret = ENOENT;
 991
 992         if (0) {
 993 err:            (void)__ENV_LPUT(env, elock);
 994         }
 995         if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
 996                 (void)__os_closehandle(env, fhp);
 997         /*
 998          * If this is a real file and we are going to proceed with the removal,
 999          * then we need to make sure that we don't leave any pages around in the
1000          * mpool since the file is closed and will be reopened again before
1001          * access.  However, this might be an in-memory file, in which case
1002          * we will handle the discard from the mpool later as it's the "real"
1003          * removal of the database.
1004          */
1005         if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
1006                 F_SET(dbp, DB_AM_DISCARD);
1007         return (ret);
1008 }
1009
1010 /*
1011  * __fop_read_meta --
1012  *      Read the meta-data page from a file and return it in buf.
1013  *
1014  * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
1015  * PUBLIC:     u_int8_t *, size_t, DB_FH *, int, size_t *));
1016  */
1017 int
1018 __fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
1019         ENV *env;
1020         const char *name;
1021         u_int8_t *buf;
1022         size_t size;
1023         DB_FH *fhp;
1024         int errok;
1025         size_t *nbytesp;
1026 {
1027         size_t nr;
1028         int ret;
1029
1030         /*
1031          * Our caller wants to know the number of bytes read, even if we
1032          * return an error.
1033          */
1034         if (nbytesp != NULL)
1035                 *nbytesp = 0;
1036
1037         nr = 0;
1038         ret = __os_read(env, fhp, buf, size, &nr);
1039         if (nbytesp != NULL)
1040                 *nbytesp = nr;
1041
1042         if (ret != 0) {
1043                 if (!errok)
1044                         __db_err(env, ret, "%s", name);
1045                 goto err;
1046         }
1047
1048         if (nr != size) {
1049                 if (!errok)
1050                         __db_errx(env,
1051                             "fop_read_meta: %s: unexpected file type or format",
1052                             name);
1053                 ret = EINVAL;
1054         }
1055
1056 err:
1057         return (ret);
1058 }
1059
1060 /*
1061  * __fop_dummy --
1062  *      This implements the creation and name swapping of dummy files that
1063  * we use for remove and rename (remove is simply a rename with a delayed
1064  * remove).
1065  *
1066  * PUBLIC: int __fop_dummy __P((DB *,
1067  * PUBLIC:     DB_TXN *, const char *, const char *));
1068  */
1069 int
1070 __fop_dummy(dbp, txn, old, new)
1071         DB *dbp;
1072         DB_TXN *txn;
1073         const char *old, *new;
1074 {
1075         DB *tmpdbp;
1076         DB_TXN *stxn;
1077         ENV *env;
1078         char *back;
1079         int ret, t_ret;
1080         u_int8_t mbuf[DBMETASIZE];
1081
1082         env = dbp->env;
1083         back = NULL;
1084         stxn = NULL;
1085         tmpdbp = NULL;
1086
1087         DB_ASSERT(env, txn != NULL);
1088
1089         /*
1090          * Begin sub transaction to encapsulate the rename.  Note that we
1091          * expect the inmem_swap calls to complete the sub-transaction,
1092          * aborting on error and committing on success.
1093          */
1094         if (TXN_ON(env) &&
1095             (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
1096                 goto err;
1097
1098         /* We need to create a dummy file as a place holder. */
1099         if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
1100                 goto err;
1101         /* Create a dummy dbp handle. */
1102         if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1103                 goto err;
1104         if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
1105                 (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
1106                 goto err;
1107         memset(mbuf, 0, sizeof(mbuf));
1108         ret = F_ISSET(dbp, DB_AM_INMEM) ?
1109             __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
1110             __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
1111
1112         if (ret != 0)
1113                 goto err;
1114
1115         ret = F_ISSET(dbp, DB_AM_INMEM) ?
1116             __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
1117             __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
1118         stxn = NULL;
1119         if (ret != 0)
1120                 goto err;
1121
1122 err:    if (stxn != NULL)
1123                 (void)__txn_abort(stxn);
1124         if (tmpdbp != NULL &&
1125             (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1126                 ret = t_ret;
1127         if (back != NULL)
1128                 __os_free(env, back);
1129         return (ret);
1130 }
1131
1132 /*
1133  * __fop_dbrename --
1134  *      Do the appropriate file locking and file system operations
1135  * to effect a dbrename in the absence of transactions (__fop_dummy
1136  * and the subsequent calls in __db_rename do the work for the
1137  * transactional case).
1138  *
1139  * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
1140  */
1141 int
1142 __fop_dbrename(dbp, old, new)
1143         DB *dbp;
1144         const char *old, *new;
1145 {
1146         DB_LOCK elock;
1147         ENV *env;
1148         char *real_new, *real_old;
1149         int ret, t_ret;
1150
1151         env = dbp->env;
1152         real_new = NULL;
1153         real_old = NULL;
1154         LOCK_INIT(elock);
1155
1156         if (F_ISSET(dbp, DB_AM_INMEM)) {
1157                 real_new = (char *)new;
1158                 real_old = (char *)old;
1159         } else {
1160                 /* Get full names. */
1161                 if ((ret = __db_appname(env,
1162                     DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
1163                         goto err;
1164
1165                 if ((ret = __db_appname(env,
1166                     DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
1167                         goto err;
1168         }
1169
1170         /*
1171          * It is an error to rename a file over one that already exists,
1172          * as that wouldn't be transaction-safe.  We check explicitly
1173          * for ondisk files, but it's done memp_nameop for in-memory ones.
1174          */
1175         GET_ENVLOCK(env, dbp->locker, &elock);
1176         ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
1177             __os_exists(env, real_new, NULL);
1178
1179         if (ret == 0) {
1180                 ret = EEXIST;
1181                 __db_errx(env, "rename: file %s exists", real_new);
1182                 goto err;
1183         }
1184
1185         ret = __memp_nameop(env,
1186             dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
1187
1188 err:    if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
1189                 ret = t_ret;
1190         if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
1191                 __os_free(env, real_old);
1192         if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
1193                 __os_free(env, real_new);
1194         return (ret);
1195 }
1196
1197 static int
1198 __fop_inmem_create(dbp, name, txn, flags)
1199         DB *dbp;
1200         const char *name;
1201         DB_TXN *txn;
1202         u_int32_t flags;
1203 {
1204         DBT fid_dbt, name_dbt;
1205         DB_LSN lsn;
1206         ENV *env;
1207         int ret;
1208         int32_t lfid;
1209         u_int32_t *p32;
1210
1211         env = dbp->env;
1212
1213         MAKE_INMEM(dbp);
1214
1215         /* Set the pagesize if it isn't yet set. */
1216         if (dbp->pgsize == 0)
1217                 dbp->pgsize = DB_DEF_IOSIZE;
1218
1219         /*
1220          * Construct a file_id.
1221          *
1222          * If this file has no name, then we only need a fileid for locking.
1223          * If this file has a name, we need the fileid both for locking and
1224          * matching in the memory pool.  So, with unnamed in-memory databases,
1225          * use a lock_id.  For named in-memory files, we need to find a value
1226          * that we can use to uniquely identify a name/fid pair.  We use a
1227          * combination of a unique id (__os_unique_id) and a hash of the
1228          * original name.
1229          */
1230         if (name == NULL) {
1231                 if (LOCKING_ON(env) && (ret =
1232                     __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
1233                         goto err;
1234         }  else {
1235                 p32 = (u_int32_t *)(&dbp->fileid[0]);
1236                 __os_unique_id(env, p32);
1237                 p32++;
1238                 (void)strncpy(
1239                     (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
1240                 dbp->preserve_fid = 1;
1241
1242                 if (DBENV_LOGGING(env) &&
1243 #if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
1244                     txn != NULL &&
1245 #endif
1246                     dbp->log_filename != NULL)
1247                         memcpy(dbp->log_filename->ufid,
1248                             dbp->fileid, DB_FILE_ID_LEN);
1249         }
1250
1251         /* Now, set the fileid. */
1252         if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
1253                 goto err;
1254
1255         if ((ret = __env_mpool(dbp, name, flags)) != 0)
1256                 goto err;
1257
1258         if (DBENV_LOGGING(env) &&
1259 #if !defined(DEBUG_WOP)
1260             txn != NULL &&
1261 #endif
1262             name != NULL) {
1263                 DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
1264                 memset(&fid_dbt, 0, sizeof(fid_dbt));
1265                 fid_dbt.data = dbp->fileid;
1266                 fid_dbt.size = DB_FILE_ID_LEN;
1267                 lfid = dbp->log_filename == NULL ?
1268                     DB_LOGFILEID_INVALID : dbp->log_filename->id;
1269                 if ((ret = __crdel_inmem_create_log(env, txn,
1270                     &lsn, 0, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
1271                         goto err;
1272         }
1273
1274         F_SET(dbp, DB_AM_CREATED);
1275
1276 err:
1277         return (ret);
1278 }
1279
1280 static int
1281 __fop_inmem_read_meta(dbp, txn, name, flags)
1282         DB *dbp;
1283         DB_TXN *txn;
1284         const char *name;
1285         u_int32_t flags;
1286 {
1287         DBMETA *metap;
1288         DB_THREAD_INFO *ip;
1289         db_pgno_t pgno;
1290         int ret, t_ret;
1291
1292         if (txn == NULL)
1293                 ENV_GET_THREAD_INFO(dbp->env, ip);
1294         else
1295                 ip = txn->thread_info;
1296
1297         pgno  = PGNO_BASE_MD;
1298         if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
1299                 return (ret);
1300         ret = __db_meta_setup(dbp->env, dbp, name, metap, flags, DB_CHK_META);
1301
1302         if ((t_ret =
1303             __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
1304                 ret = t_ret;
1305
1306         return (ret);
1307 }
1308
1309 static int
1310 __fop_ondisk_dummy(dbp, txn, name, mbuf)
1311         DB *dbp;
1312         DB_TXN *txn;
1313         const char *name;
1314         u_int8_t *mbuf;
1315 {
1316         ENV *env;
1317         int ret;
1318         char *realname;
1319         u_int32_t dflags;
1320
1321         realname = NULL;
1322         env = dbp->env;
1323         dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1324
1325         if ((ret = __db_appname(env,
1326             DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
1327                 goto err;
1328
1329         if ((ret = __fop_create(env,
1330             txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
1331                 goto err;
1332
1333         if ((ret =
1334             __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
1335                 goto err;
1336
1337         ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1338         if ((ret = __fop_write(env, txn, name, dbp->dirname,
1339             DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
1340                 goto err;
1341
1342         memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
1343
1344 err:    if (realname != NULL)
1345                 __os_free(env, realname);
1346
1347         return (ret);
1348 }
1349
1350 static int
1351 __fop_inmem_dummy(dbp, txn, name, mbuf)
1352         DB *dbp;
1353         DB_TXN *txn;
1354         const char *name;
1355         u_int8_t *mbuf;
1356 {
1357         DBMETA *metap;
1358         DB_THREAD_INFO *ip;
1359         db_pgno_t pgno;
1360         int ret, t_ret;
1361
1362         if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
1363                 return (ret);
1364         if (txn == NULL)
1365                 ENV_GET_THREAD_INFO(dbp->env, ip);
1366         else
1367                 ip = txn->thread_info;
1368
1369         pgno  = PGNO_BASE_MD;
1370         if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
1371             DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
1372                 return (ret);
1373         /* Check file existed. */
1374         if (metap->magic != 0)
1375                 ret = EEXIST;
1376         else
1377                 metap->magic = DB_RENAMEMAGIC;
1378
1379         /* Copy the fileid onto the meta-data page. */
1380         memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
1381
1382         if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
1383             ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
1384                 ret = t_ret;
1385
1386         if (ret != 0)
1387                 goto err;
1388
1389         ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1390
1391 err:    return (ret);
1392 }
1393
1394 static int
1395 __fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
1396         DB *dbp, *tmpdbp;
1397         DB_TXN *txn;
1398         const char *old, *new, *back;
1399         DB_LOCKER *locker;
1400 {
1401         DBT fiddbt, namedbt, tmpdbt;
1402         DB_FH *fhp;
1403         DB_LOCK elock;
1404         DB_LSN lsn;
1405         DB_TXN *parent;
1406         ENV *env;
1407         u_int8_t mbuf[DBMETASIZE];
1408         u_int32_t child_txnid, dflags;
1409         int ret, t_ret;
1410         char *realold, *realnew;
1411
1412         env = dbp->env;
1413         DB_ASSERT(env, txn != NULL);
1414         DB_ASSERT(env, old != NULL);
1415
1416         realold = realnew = NULL;
1417         LOCK_INIT(elock);
1418         fhp = NULL;
1419         dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1420
1421         if ((ret = __db_appname(env,
1422             DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
1423                 goto err;
1424
1425         /* Now, lock the name space while we initialize this file. */
1426 retry:  GET_ENVLOCK(env, locker, &elock);
1427         if (__os_exists(env, realnew, NULL) == 0) {
1428                 /*
1429                  * It is possible that the only reason this file exists is
1430                  * because we've done a previous rename of it and we have
1431                  * left a placeholder here.  We need to check for that case
1432                  * and allow this rename to succeed if that's the case.
1433                  */
1434                 if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
1435                         goto err;
1436                 if ((ret = __fop_read_meta(env,
1437                     realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
1438                     (ret = __db_meta_setup(env,
1439                     tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
1440                         ret = EEXIST;
1441                         goto err;
1442                 }
1443                 ret = __os_closehandle(env, fhp);
1444                 fhp = NULL;
1445                 if (ret != 0)
1446                         goto err;
1447
1448                 /*
1449                  * Now, try to acquire the handle lock.  If the handle is locked
1450                  * by our current, transaction, then we'll get it and life is
1451                  * good.
1452                  *
1453                  * Alternately, it's not locked at all, we'll get the lock, but
1454                  * we will realize it exists and consider this an error.
1455                  *
1456                  * However, if it's held by another transaction, then there
1457                  * could be two different scenarios: 1) the file is in the
1458                  * midst of being created or deleted and when that transaction
1459                  * is over, we might be able to proceed. 2) the file is open
1460                  * and exists and we should report an error. In order to
1461                  * distinguish these two cases, we do the following. First, we
1462                  * try to acquire a READLOCK.  If the handle is in the midst of
1463                  * being created, then we'll block because a writelock is held.
1464                  * In that case, we should request a blocking write, and when we
1465                  * get the lock, we should then go back and check to see if the
1466                  * object exists and start all over again.
1467                  *
1468                  * If we got the READLOCK, then either no one is holding the
1469                  * lock or someone has an open handle and the fact that the file
1470                  * exists is problematic.  So, in this case, we request the
1471                  * WRITELOCK non-blocking -- if it succeeds, we're golden.  If
1472                  * it fails, then the file exists and we return EEXIST.
1473                  */
1474                 if ((ret = __fop_lock_handle(env,
1475                     tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1476                         /*
1477                          * Someone holds a write-lock.  Wait for the write-lock
1478                          * and after we get it, release it and start over.
1479                          */
1480                         if ((ret = __fop_lock_handle(env, tmpdbp,
1481                             locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1482                                 goto err;
1483                         if ((ret =
1484                             __lock_put(env, &tmpdbp->handle_lock)) != 0)
1485                                 goto err;
1486                         if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
1487                                 goto err;
1488                         goto retry;
1489                 }
1490
1491                 /* We got the read lock; try to upgrade it. */
1492                 ret = __fop_lock_handle(env,
1493                     tmpdbp, locker, DB_LOCK_WRITE,
1494                     NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
1495                 if (ret != 0) {
1496                         /*
1497                          * We did not get the writelock, so someone
1498                          * has the handle open.  This is an error.
1499                          */
1500                         (void)__lock_put(env, &tmpdbp->handle_lock);
1501                         ret = EEXIST;
1502                 } else  if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1503                         /* We got the lock and are renaming it. */
1504                         ret = 0;
1505                 else { /* We got the lock, but the file exists. */
1506                         (void)__lock_put(env, &tmpdbp->handle_lock);
1507                         ret = EEXIST;
1508                 }
1509                 if (ret != 0)
1510                         goto err;
1511         }
1512
1513         /*
1514          * While we have the namespace locked, do the renames and then
1515          * swap for the handle lock.
1516          */
1517         if ((ret = __fop_rename(env, txn,
1518             old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
1519                 goto err;
1520         if ((ret = __fop_rename(env, txn, back, old,
1521             &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
1522                 goto err;
1523         if ((ret = __fop_lock_handle(env,
1524             tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
1525                 goto err;
1526
1527         /*
1528          * We just acquired a transactional lock on the tmp handle.
1529          * We need to null out the tmp handle's lock so that it
1530          * doesn't create problems for us in the close path.
1531          */
1532         LOCK_INIT(tmpdbp->handle_lock);
1533
1534         /* Commit the child. */
1535         child_txnid = txn->txnid;
1536         parent = txn->parent;
1537         ret = __txn_commit(txn, 0);
1538         txn = NULL;
1539
1540         /*
1541          * If the new name is available because it was previously renamed
1542          * remove it from the remove list.
1543          */
1544         if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1545                 __txn_remrem(env, parent, realnew);
1546
1547         /* Now log the child information in the parent. */
1548         memset(&fiddbt, 0, sizeof(fiddbt));
1549         fiddbt.data = dbp->fileid;
1550         fiddbt.size = DB_FILE_ID_LEN;
1551         memset(&tmpdbt, 0, sizeof(fiddbt));
1552         tmpdbt.data = tmpdbp->fileid;
1553         tmpdbt.size = DB_FILE_ID_LEN;
1554         DB_INIT_DBT(namedbt, old, strlen(old) + 1);
1555         if ((t_ret = __fop_file_remove_log(env,
1556             parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
1557             (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
1558                 ret = t_ret;
1559
1560         /* This is a delayed delete of the dummy file. */
1561         if ((ret = __db_appname(env,
1562             DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
1563                 goto err;
1564
1565         if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
1566                 goto err;
1567
1568 err:    if (txn != NULL)        /* Ret must already be set, so void abort. */
1569                 (void)__txn_abort(txn);
1570
1571         (void)__ENV_LPUT(env, elock);
1572
1573         if (fhp != NULL &&
1574             (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
1575                 ret = t_ret;
1576
1577         if (realnew != NULL)
1578                 __os_free(env, realnew);
1579         if (realold != NULL)
1580                 __os_free(env, realold);
1581         return (ret);
1582 }
1583
1584 static int
1585 __fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
1586         DB *olddbp, *backdbp;
1587         DB_TXN *txn;
1588         const char *old, *new, *back;
1589         DB_LOCKER *locker;
1590 {
1591         DB *tmpdbp;
1592         DBT fid_dbt, n1_dbt, n2_dbt;
1593         DB_LOCK elock;
1594         DB_LSN lsn;
1595         DB_TXN *parent;
1596         ENV *env;
1597         int ret, t_ret;
1598
1599         env = olddbp->env;
1600         parent = txn->parent;
1601 retry:  LOCK_INIT(elock);
1602         if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1603                 return (ret);
1604         MAKE_INMEM(tmpdbp);
1605
1606         GET_ENVLOCK(env, locker, &elock);
1607         if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
1608                 /*
1609                  * It is possible that the only reason this database exists is
1610                  * because we've done a previous rename of it and we have
1611                  * left a placeholder here.  We need to check for that case
1612                  * and allow this rename to succeed if that's the case.
1613                  */
1614
1615                 if ((ret = __fop_inmem_read_meta(tmpdbp, txn, new, 0)) != 0) {
1616                         ret = EEXIST;
1617                         goto err;
1618                 }
1619
1620                 /*
1621                  * Now, try to acquire the handle lock.  If it's from our txn,
1622                  * then we'll get the lock.  If it's not, then someone else has
1623                  * it locked.  See the comments in __fop_ondisk_swap for
1624                  * details.
1625                  */
1626                 if ((ret = __fop_lock_handle(env,
1627                     tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1628                         /*
1629                          * Someone holds a writelock.  Try for the WRITELOCK
1630                          * and after we get it, retry.
1631                          */
1632                         if ((ret = __fop_lock_handle(env, tmpdbp,
1633                             locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1634                                 goto err;
1635
1636                         /* We have the write lock; release it and start over. */
1637                         (void)__lock_put(env, &tmpdbp->handle_lock);
1638                         (void)__db_close(tmpdbp, NULL, DB_NOSYNC);
1639                         (void)__ENV_LPUT(env, elock);
1640                         goto retry;
1641                 } else {
1642                         (void)__lock_put(env, &tmpdbp->handle_lock);
1643                         if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1644                                 ret = EEXIST;
1645                 }
1646                 if (ret != 0)
1647                         goto err;
1648         }
1649
1650         /* Log the renames. */
1651         if (LOGGING_ON(env)
1652 #ifndef DEBUG_WOP
1653             && txn != NULL
1654 #endif
1655         ) {
1656                 /* Rename old to new. */
1657                 DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
1658                 DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
1659                 DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
1660                 if ((ret = __crdel_inmem_rename_log(
1661                     env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
1662                         goto err;
1663
1664                 /* Rename back to old */
1665                 fid_dbt.data = backdbp->fileid;
1666                 DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
1667                 if ((ret = __crdel_inmem_rename_log(
1668                     env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
1669                         goto err;
1670         }
1671
1672         /*
1673          * While we have the namespace locked, do the renames and then
1674          * swap for the handle lock.   If we ran into a file in the midst
1675          * of rename, then we need to delete it first, else nameop is
1676          * going to consider it an error.
1677          */
1678         if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
1679                 if ((ret = __memp_nameop(env,
1680                     tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
1681                         goto err;
1682                 __txn_remrem(env, parent, new);
1683         }
1684
1685         if ((ret = __memp_nameop(
1686             env, olddbp->fileid, new, old, new, 1)) != 0)
1687                 goto err;
1688         if ((ret = __memp_nameop(
1689             env, backdbp->fileid, old, back, old, 1)) != 0)
1690                 goto err;
1691
1692         if ((ret = __fop_lock_handle(env,
1693             tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1694                 goto err;
1695
1696         /*
1697          * We just acquired a transactional lock on the tmp handle.
1698          * We need to null out the tmp handle's lock so that it
1699          * doesn't create problems for us in the close path.
1700          */
1701         LOCK_INIT(tmpdbp->handle_lock);
1702
1703         DB_ASSERT(env, txn != NULL);
1704
1705         /* Commit the child. */
1706         ret = __txn_commit(txn, 0);
1707         txn = NULL;
1708
1709         if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
1710                 goto err;
1711
1712 err:    (void)__ENV_LPUT(env, elock);
1713
1714         if (txn != NULL)
1715                 (void)__txn_abort(txn);
1716
1717         if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1718                 ret = t_ret;
1719
1720         return (ret);
1721 }