2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 2001-2009 Oracle. All rights reserved.
12 #include "dbinc/db_page.h"
13 #include "dbinc/fop.h"
14 #include "dbinc/log.h"
16 #include "dbinc/txn.h"
17 #include "dbinc/db_am.h"
20 * The transactional guarantees Berkeley DB provides for file
21 * system level operations (database physical file create, delete,
22 * rename) are based on our understanding of current file system
23 * semantics; a system that does not provide these semantics and
24 * guarantees could be in danger.
26 * First, as in standard database changes, fsync and fdatasync must
27 * work: when applied to the log file, the records written into the
28 * log must be transferred to stable storage.
30 * Second, it must not be possible for the log file to be removed
31 * without previous file system level operations being flushed to
32 * stable storage. Berkeley DB applications write log records
33 * describing file system operations into the log, then perform the
34 * file system operation, then commit the enclosing transaction
35 * (which flushes the log file to stable storage). Subsequently,
36 * a database environment checkpoint may make it possible for the
37 * application to remove the log file containing the record of the
38 * file system operation. DB's transactional guarantees for file
39 * system operations require the log file removal not succeed until
40 * all previous filesystem operations have been flushed to stable
41 * storage. In other words, the flush of the log file, or the
42 * removal of the log file, must block until all previous
43 * filesystem operations have been flushed to stable storage. This
44 * semantic is not, as far as we know, required by any existing
45 * standards document, but we have never seen a filesystem where
51 * Create a (transactionally protected) file system object. This is
52 * currently used to create DB files, and potentially blobs, queue extents,
53 * and anything else you wish to store in a file system object.
55 * PUBLIC: int __fop_create __P((ENV *, DB_TXN *,
56 * PUBLIC: DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
/*
 * __fop_create: build the path for "name", optionally write a create
 * log record, then open the file with DB_OSO_CREATE | DB_OSO_EXCL.
 * The handle is stored through fhpp.
 */
59 __fop_create(env, txn, fhpp, name, dirp, appname, mode, flags)
63 const char *name, **dirp;
/*
 * Build the path for this object from name, appname and dirp; the
 * result is stored in real_name.  A nonzero return is an error.
 */
77 if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
/*
 * When logging is enabled, the create is described in a log record
 * before the open below is attempted.  The name DBT covers the
 * string plus its terminating NUL (strlen + 1).
 */
83 if (DBENV_LOGGING(env)
84 #if !defined(DEBUG_WOP)
88 DB_INIT_DBT(data, name, strlen(name) + 1);
/*
 * Log the caller's directory (with its NUL) when one was supplied;
 * dirdata is zeroed otherwise.
 * NOTE(review): the "else" joining these two arms is not visible in
 * this excerpt -- confirm.
 */
89 if (dirp != NULL && *dirp != NULL)
90 DB_INIT_DBT(dirdata, *dirp, strlen(*dirp) + 1);
92 memset(&dirdata, 0, sizeof(dirdata));
/* appname and mode are narrowed to u_int32_t for the log record. */
93 if ((ret = __fop_create_log(env, txn, &lsn,
95 &data, &dirdata, (u_int32_t)appname, (u_int32_t)mode)) != 0)
/*
 * Test-recovery hook at the DB_TEST_POSTLOG point.  The open that
 * follows passes DB_OSO_CREATE | DB_OSO_EXCL and the caller's mode,
 * and stores the handle through fhpp.
 */
99 DB_ENV_TEST_RECOVERY(env, DB_TEST_POSTLOG, ret, name);
104 env, real_name, 0, DB_OSO_CREATE | DB_OSO_EXCL, mode, fhpp);
107 DB_TEST_RECOVERY_LABEL
/*
 * Close the handle only when fhpp refers to the local fhp and a
 * handle was actually obtained; the close result is deliberately
 * discarded.
 * NOTE(review): presumably fhpp is pointed at &fhp when the caller
 * passes NULL -- the assignment is not visible here, confirm.
 */
108 if (fhpp == &fhp && fhp != NULL)
109 (void)__os_closehandle(env, fhp);
/* Release the path produced by __db_appname. */
110 if (real_name != NULL)
111 __os_free(env, real_name);
117 * Remove a file system object.
119 * PUBLIC: int __fop_remove __P((ENV *, DB_TXN *,
120 * PUBLIC: u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
/*
 * __fop_remove: build the path for "name", then either act on the
 * file immediately (no real transaction) or log the removal and
 * register a transaction remove event.
 */
123 __fop_remove(env, txn, fileid, name, dirp, appname, flags)
127 const char *name, **dirp;
/*
 * Build the path for this object from name, appname and dirp; the
 * result is stored in real_name.  A nonzero return is an error.
 */
138 if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
/*
 * Without a real transaction there is nothing to defer.  When a file
 * id is supplied, the buffer pool is told about the operation through
 * __memp_nameop, with NULL passed for both new-name arguments.
 */
141 if (!IS_REAL_TXN(txn)) {
142 if (fileid != NULL && (ret = __memp_nameop(
143 env, fileid, NULL, real_name, NULL, 0)) != 0)
/*
 * Transactional path: when logging is enabled, write a remove record
 * carrying the file name (with its terminating NUL) and the file id.
 */
146 if (DBENV_LOGGING(env)
147 #if !defined(DEBUG_WOP)
/* Clear fdbt using its own size rather than that of a sibling object. */
151 memset(&fdbt, 0, sizeof(fdbt));
/* A missing file id is logged as a zero-length DBT. */
153 fdbt.size = fileid == NULL ? 0 : DB_FILE_ID_LEN;
154 DB_INIT_DBT(ndbt, name, strlen(name) + 1);
155 if ((ret = __fop_remove_log(env, txn, &lsn,
156 flags, &ndbt, &fdbt, (u_int32_t)appname)) != 0)
/*
 * Register a remove event on the transaction for real_name/fileid.
 * NOTE(review): presumably the file is then removed when the
 * transaction resolves -- confirm against __txn_remevent.
 */
159 ret = __txn_remevent(env, txn, real_name, fileid, 0);
/* Common exit: release the path produced by __db_appname. */
162 err: if (real_name != NULL)
163 __os_free(env, real_name);
170 * Write "size" bytes from "buf" to file "name" beginning at offset "off."
171 * If the file is open, supply a handle in fhp. Istmp indicates whether
172 * this operation may skip being undone in the face of failure (i.e., if
173 * this is a write to a temporary file, we're simply going to remove the
174 * file, so don't worry about undoing the write).
176 * Currently, we *only* use this with istmp true. If we need more general
177 * handling, then we'll have to zero out regions on abort (and possibly
178 * log the before image of the data in the log record).
180 * PUBLIC: int __fop_write __P((ENV *, DB_TXN *,
181 * PUBLIC: const char *, const char *, APPNAME, DB_FH *, u_int32_t,
182 * PUBLIC: db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
/*
 * __fop_write: optionally log the write, open the file if it is not
 * already open, seek to the position given by pageno/pgsize/off and
 * write "size" bytes from "buf".
 */
185 __fop_write(env, txn,
186 name, dirname, appname, fhp, pgsize, pageno, off, buf, size, istmp, flags)
189 const char *name, *dirname;
196 u_int32_t size, istmp, flags;
198 DBT data, namedbt, dirdbt;
201 int local_open, ret, t_ret;
/* The function asserts that it is only called with istmp nonzero. */
204 DB_ASSERT(env, istmp != 0);
206 ret = local_open = 0;
/*
 * When logging is enabled, write a record describing the operation:
 * file name and directory (each with its terminating NUL), appname,
 * page size, page number, offset, the data DBT and istmp.
 * NOTE(review): the lines that fill in the data DBT and choose between
 * the two dirdbt initializations are not visible in this excerpt --
 * confirm.
 */
209 if (DBENV_LOGGING(env)
210 #if !defined(DEBUG_WOP)
214 memset(&data, 0, sizeof(data));
217 DB_INIT_DBT(namedbt, name, strlen(name) + 1);
219 DB_INIT_DBT(dirdbt, dirname, strlen(dirname) + 1);
221 memset(&dirdbt, 0, sizeof(dirdbt));
222 if ((ret = __fop_write_log(env, txn,
223 &lsn, flags, &namedbt, &dirdbt, (u_int32_t)appname,
224 pgsize, pageno, off, &data, istmp)) != 0)
/*
 * The path is built with __db_appname and the file is opened with all
 * flag and mode arguments zero; fhp then holds the handle.
 * NOTE(review): presumably this runs only when the caller passed no
 * handle, and local_open is set here -- neither line is visible.
 */
229 /* File isn't open; we need to reopen it. */
230 if ((ret = __db_appname(env,
231 appname, name, &dirname, &real_name)) != 0)
234 if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0)
239 /* Seek to offset. */
240 if ((ret = __os_seek(env, fhp, pageno, pgsize, off)) != 0)
243 /* Now do the write. */
244 if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0)
/*
 * Common exit.  The handle is closed only when local_open is set.  The
 * close result is captured in t_ret, and the "ret == 0" test guards
 * whatever follows on a close failure.
 * NOTE(review): presumably that is "ret = t_ret", so an earlier error
 * is not overwritten -- the line is not visible in this excerpt.
 */
247 err: if (local_open &&
248 (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
/* Release the path produced by __db_appname, if one was built. */
251 if (real_name != NULL)
252 __os_free(env, real_name);
258 * Change a file's name.
260 * PUBLIC: int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *,
261 * PUBLIC: const char **, u_int8_t *, APPNAME, int, u_int32_t));
264 __fop_rename(env, txn, oldname, newname, dirp, fid, appname, with_undo, flags)
275 DBT fiddbt, dir, new, old;
281 if ((ret = __db_appname(env, appname, oldname, dirp, &o)) != 0)
283 if ((ret = __db_appname(env, appname, newname, dirp, &n)) != 0)
286 if (DBENV_LOGGING(env)
287 #if !defined(DEBUG_WOP)
291 DB_INIT_DBT(old, oldname, strlen(oldname) + 1);
292 DB_INIT_DBT(new, newname, strlen(newname) + 1);
293 if (dirp != NULL && *dirp != NULL)
294 DB_INIT_DBT(dir, *dirp, strlen(*dirp) + 1);
296 memset(&dir, 0, sizeof(dir));
297 memset(&fiddbt, 0, sizeof(fiddbt));
299 fiddbt.size = DB_FILE_ID_LEN;
301 ret = __fop_rename_log(env,
302 txn, &lsn, flags | DB_FLUSH,
303 &old, &new, &dir, &fiddbt, (u_int32_t)appname);
305 ret = __fop_rename_noundo_log(env,
306 txn, &lsn, flags | DB_FLUSH,
307 &old, &new, &dir, &fiddbt, (u_int32_t)appname);
312 ret = __memp_nameop(env, fid, newname, o, n, 0);