2 * @brief Lightning memory-mapped database library
4 * A Btree-based database management library modeled loosely on the
5 * BerkeleyDB API, but much simplified.
8 * Copyright 2011-2014 Howard Chu, Symas Corp.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted only as authorized by the OpenLDAP
15 * A copy of this license is available in the file LICENSE in the
16 * top-level directory of the distribution or, alternatively, at
17 * <http://www.OpenLDAP.org/license.html>.
19 * This code is derived from btree.c written by Martin Hedenfalk.
21 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
23 * Permission to use, copy, modify, and distribute this software for any
24 * purpose with or without fee is hereby granted, provided that the above
25 * copyright notice and this permission notice appear in all copies.
27 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
28 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
30 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
31 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
32 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
33 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
41 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
42 * as int64 which is wrong. MSVC doesn't define it at all, so just
46 #define MDB_THR_T DWORD
47 #include <sys/types.h>
50 # include <sys/param.h>
52 # define LITTLE_ENDIAN 1234
53 # define BIG_ENDIAN 4321
54 # define BYTE_ORDER LITTLE_ENDIAN
56 # define SSIZE_MAX INT_MAX
60 #include <sys/types.h>
62 #define MDB_PID_T pid_t
63 #define MDB_THR_T pthread_t
64 #include <sys/param.h>
67 #ifdef HAVE_SYS_FILE_H
73 #if defined(__mips) && defined(__linux)
74 /* MIPS has cache coherency issues, requires explicit cache control */
75 #include <asm/cachectl.h>
76 extern int cacheflush(char *addr, int nbytes, int cache);
77 #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
79 #define CACHEFLUSH(addr, bytes, cache)
93 #if defined(__sun) || defined(ANDROID)
94 /* Most platforms have posix_memalign, older may only have memalign */
95 #define HAVE_MEMALIGN 1
99 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
100 #include <netinet/in.h>
101 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
104 #if defined(__APPLE__) || defined (BSD)
105 # define MDB_USE_SYSV_SEM 1
106 # define MDB_FDATASYNC fsync
107 #elif defined(ANDROID)
108 # define MDB_FDATASYNC fsync
113 #ifdef MDB_USE_SYSV_SEM
116 #ifdef _SEM_SEMUN_UNDEFINED
119 struct semid_ds *buf;
120 unsigned short *array;
122 #endif /* _SEM_SEMUN_UNDEFINED */
123 #endif /* MDB_USE_SYSV_SEM */
127 #include <valgrind/memcheck.h>
128 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
129 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
130 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
131 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
132 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
134 #define VGMEMP_CREATE(h,r,z)
135 #define VGMEMP_ALLOC(h,a,s)
136 #define VGMEMP_FREE(h,a)
137 #define VGMEMP_DESTROY(h)
138 #define VGMEMP_DEFINED(a,s)
142 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
143 /* Solaris just defines one or the other */
144 # define LITTLE_ENDIAN 1234
145 # define BIG_ENDIAN 4321
146 # ifdef _LITTLE_ENDIAN
147 # define BYTE_ORDER LITTLE_ENDIAN
149 # define BYTE_ORDER BIG_ENDIAN
152 # define BYTE_ORDER __BYTE_ORDER
156 #ifndef LITTLE_ENDIAN
157 #define LITTLE_ENDIAN __LITTLE_ENDIAN
160 #define BIG_ENDIAN __BIG_ENDIAN
163 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
164 #define MISALIGNED_OK 1
170 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
171 # error "Unknown or unsupported endianness (BYTE_ORDER)"
172 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
173 # error "Two's complement, reasonably sized integer types, please"
177 /** Put infrequently used env functions in separate section */
179 # define ESECT __attribute__ ((section("__TEXT,text_env")))
181 # define ESECT __attribute__ ((section("text_env")))
187 /** @defgroup internal LMDB Internals
190 /** @defgroup compat Compatibility Macros
191 * A bunch of macros to minimize the amount of platform-specific ifdefs
192 * needed throughout the rest of the code. When the features this library
193 * needs are similar enough to POSIX to be hidden in a one-or-two line
194 * replacement, this macro approach is used.
198 /** Features under development */
203 #if defined(_WIN32) || (defined(EOWNERDEAD) && !defined(MDB_USE_SYSV_SEM))
204 #define MDB_ROBUST_SUPPORTED 1
207 /** Wrapper around __func__, which is a C99 feature */
208 #if __STDC_VERSION__ >= 199901L
209 # define mdb_func_ __func__
210 #elif __GNUC__ >= 2 || _MSC_VER >= 1300
211 # define mdb_func_ __FUNCTION__
213 /* If a debug message says <mdb_unknown>(), update the #if statements above */
214 # define mdb_func_ "<mdb_unknown>"
218 #define MDB_USE_HASH 1
219 #define MDB_PIDLOCK 0
220 #define THREAD_RET DWORD
221 #define pthread_t HANDLE
222 #define pthread_mutex_t HANDLE
223 #define pthread_cond_t HANDLE
224 typedef HANDLE mdb_mutex_t;
225 #define pthread_key_t DWORD
226 #define pthread_self() GetCurrentThreadId()
227 #define pthread_key_create(x,y) \
228 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
229 #define pthread_key_delete(x) TlsFree(x)
230 #define pthread_getspecific(x) TlsGetValue(x)
231 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
232 #define pthread_mutex_consistent(mutex) 0
233 #define pthread_mutex_unlock(x) ReleaseMutex(*x)
234 #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
235 #define pthread_cond_signal(x) SetEvent(*x)
236 #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
237 #define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL)
238 #define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE)
239 #define MDB_MUTEX(env, rw) ((env)->me_##rw##mutex)
240 #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
241 #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
242 #define getpid() GetCurrentProcessId()
243 #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
244 #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
245 #define ErrCode() GetLastError()
246 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
247 #define close(fd) (CloseHandle(fd) ? 0 : -1)
248 #define munmap(ptr,len) UnmapViewOfFile(ptr)
249 #ifdef PROCESS_QUERY_LIMITED_INFORMATION
250 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
252 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
256 #define THREAD_RET void *
257 #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
258 #define THREAD_FINISH(thr) pthread_join(thr,NULL)
259 #define Z "z" /**< printf format modifier for size_t */
261 /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
262 #define MDB_PIDLOCK 1
264 #ifdef MDB_USE_SYSV_SEM
266 typedef struct mdb_mutex {
271 #define MDB_MUTEX(env, rw) (&(env)->me_##rw##mutex)
272 #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex)
273 #define UNLOCK_MUTEX(mutex) do { \
274 struct sembuf sb = { 0, 1, SEM_UNDO }; \
275 sb.sem_num = (mutex)->semnum; \
276 semop((mutex)->semid, &sb, 1); \
280 mdb_sem_wait(mdb_mutex_t *sem)
283 struct sembuf sb = { 0, -1, SEM_UNDO };
284 sb.sem_num = sem->semnum;
285 while ((rc = semop(sem->semid, &sb, 1)) && (rc = errno) == EINTR) ;
290 /** Pointer/HANDLE type of shared mutex/semaphore.
292 typedef pthread_mutex_t mdb_mutex_t;
293 /** Mutex for the reader table (rw = r) or write transaction (rw = w).
295 #define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex)
296 /** Lock the reader or writer mutex.
297 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
299 #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
300 /** Unlock the reader or writer mutex.
302 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
303 #endif /* MDB_USE_SYSV_SEM */
305 /** Get the error code for the last failed system function.
307 #define ErrCode() errno
309 /** An abstraction for a file handle.
310 * On POSIX systems file handles are small integers. On Windows
311 * they're opaque pointers.
315 /** A value for an invalid file handle.
316 * Mainly used to initialize file variables and signify that they are
319 #define INVALID_HANDLE_VALUE (-1)
321 /** Get the size of a memory page for the system.
322 * This is the basic size that the platform's memory manager uses, and is
323 * fundamental to the use of memory-mapped files.
325 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
330 #elif defined(MDB_USE_SYSV_SEM)
333 #define MNAME_LEN (sizeof(pthread_mutex_t))
338 #ifdef MDB_ROBUST_SUPPORTED
339 /** Lock mutex, handle any error, set rc = result.
340 * Return 0 on success, nonzero (not rc) on error.
342 #define LOCK_MUTEX(rc, env, mutex) \
343 (((rc) = LOCK_MUTEX0(mutex)) && \
344 ((rc) = mdb_mutex_failed(env, mutex, rc)))
345 static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc);
347 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
348 #define mdb_mutex_failed(env, mutex, rc) (rc)
352 /** A flag for opening a file and requesting synchronous data writes.
353 * This is only used when writing a meta page. It's not strictly needed;
354 * we could just do a normal write and then immediately perform a flush.
355 * But if this flag is available it saves us an extra system call.
357 * @note If O_DSYNC is undefined but exists in /usr/include,
358 * preferably set some compiler flag to get the definition.
359 * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
362 # define MDB_DSYNC O_DSYNC
366 /** Function for flushing the data of a file. Define this to fsync
367 * if fdatasync() is not supported.
369 #ifndef MDB_FDATASYNC
370 # define MDB_FDATASYNC fdatasync
371 # define HAVE_FDATASYNC 1
375 # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
386 /** A page number in the database.
387 * Note that 64 bit page numbers are overkill, since pages themselves
388 * already represent 12-13 bits of addressable memory, and the OS will
389 * always limit applications to a maximum of 63 bits of address space.
391 * @note In the #MDB_node structure, we only store 48 bits of this value,
392 * which thus limits us to only 60 bits of addressable data.
394 typedef MDB_ID pgno_t;
396 /** A transaction ID.
397 * See struct MDB_txn.mt_txnid for details.
399 typedef MDB_ID txnid_t;
401 /** @defgroup debug Debug Macros
405 /** Enable debug output. Needs variable argument macros (a C99 feature).
406 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
407 * read from and written to the database (used for free space management).
413 static int mdb_debug;
414 static txnid_t mdb_debug_start;
416 /** Print a debug message with printf formatting.
417 * Requires double parenthesis around 2 or more args.
419 # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
420 # define DPRINTF0(fmt, ...) \
421 fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
423 # define DPRINTF(args) ((void) 0)
425 /** Print a debug string.
426 * The string is printed literally, with no format processing.
428 #define DPUTS(arg) DPRINTF(("%s", arg))
429 /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */
431 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
434 /** @brief The maximum size of a database page.
436 * It is 32k or 64k, since value-PAGEBASE must fit in
437 * #MDB_page.%mp_upper.
439 * LMDB will use database pages < OS pages if needed.
440 * That causes more I/O in write transactions: The OS must
441 * know (read) the whole page before writing a partial page.
443 * Note that we don't currently support Huge pages. On Linux,
444 * regular data files cannot use Huge pages, and in general
445 * Huge pages aren't actually pageable. We rely on the OS
446 * demand-pager to read our data and page it out when memory
447 * pressure from other processes is high. So until OSs have
448 * actual paging support for Huge pages, they're not viable.
450 #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
452 /** The minimum number of keys required in a database page.
453 * Setting this to a larger value will place a smaller bound on the
454 * maximum size of a data item. Data items larger than this size will
455 * be pushed into overflow pages instead of being stored directly in
456 * the B-tree node. This value used to default to 4. With a page size
457 * of 4096 bytes that meant that any item larger than 1024 bytes would
458 * go into an overflow page. That also meant that on average 2-3KB of
459 * each overflow page was wasted space. The value cannot be lower than
460 * 2 because then there would no longer be a tree structure. With this
461 * value, items larger than 2KB will go into overflow pages, and on
462 * average only 1KB will be wasted.
464 #define MDB_MINKEYS 2
466 /** A stamp that identifies a file as an LMDB file.
467 * There's nothing special about this value other than that it is easily
468 * recognizable, and it will reflect any byte order mismatches.
470 #define MDB_MAGIC 0xBEEFC0DE
472 /** The version number for a database's datafile format. */
473 #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
474 /** The version number for a database's lockfile format. */
475 #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1)
477 /** @brief The max size of a key we can write, or 0 for dynamic max.
479 * Define this as 0 to compute the max from the page size. 511
480 * is default for backwards compat: liblmdb <= 0.9.10 can break
481 * when modifying a DB with keys/dupsort data bigger than its max.
482 * #MDB_DEVEL sets the default to 0.
484 * Data items in an #MDB_DUPSORT database are also limited to
485 * this size, since they're actually keys of a sub-DB. Keys and
486 * #MDB_DUPSORT data items must fit on a node in a regular page.
488 #ifndef MDB_MAXKEYSIZE
489 #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
492 /** The maximum size of a key we can write to the environment. */
494 #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE)
496 #define ENV_MAXKEY(env) ((env)->me_maxkey)
499 /** @brief The maximum size of a data item.
501 * We only store a 32 bit value for node sizes.
503 #define MAXDATASIZE 0xffffffffUL
506 /** Key size which fits in a #DKBUF.
509 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
512 * This is used for printing a hex dump of a key's contents.
514 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
515 /** Display a key in hex.
517 * Invoke a function to display a key in hex.
519 #define DKEY(x) mdb_dkey(x, kbuf)
525 /** An invalid page number.
526 * Mainly used to denote an empty tree.
528 #define P_INVALID (~(pgno_t)0)
530 /** Test if the flags \b f are set in a flag word \b w. */
531 #define F_ISSET(w, f) (((w) & (f)) == (f))
533 /** Round \b n up to an even number. */
534 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
536 /** Used for offsets within a single page.
537 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
540 typedef uint16_t indx_t;
542 /** Default size of memory map.
543 * This is certainly too small for any actual applications. Apps should always set
544 * the size explicitly using #mdb_env_set_mapsize().
546 #define DEFAULT_MAPSIZE 1048576
548 /** @defgroup readers Reader Lock Table
549 * Readers don't acquire any locks for their data access. Instead, they
550 * simply record their transaction ID in the reader table. The reader
551 * mutex is needed just to find an empty slot in the reader table. The
552 * slot's address is saved in thread-specific data so that subsequent read
553 * transactions started by the same thread need no further locking to proceed.
555 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
557 * No reader table is used if the database is on a read-only filesystem, or
558 * if #MDB_NOLOCK is set.
560 * Since the database uses multi-version concurrency control, readers don't
561 * actually need any locking. This table is used to keep track of which
562 * readers are using data from which old transactions, so that we'll know
563 * when a particular old transaction is no longer in use. Old transactions
564 * that have discarded any data pages can then have those pages reclaimed
565 * for use by a later write transaction.
567 * The lock table is constructed such that reader slots are aligned with the
568 * processor's cache line size. Any slot is only ever used by one thread.
569 * This alignment guarantees that there will be no contention or cache
570 * thrashing as threads update their own slot info, and also eliminates
571 * any need for locking when accessing a slot.
573 * A writer thread will scan every slot in the table to determine the oldest
574 * outstanding reader transaction. Any freed pages older than this will be
575 * reclaimed by the writer. The writer doesn't use any locks when scanning
576 * this table. This means that there's no guarantee that the writer will
577 * see the most up-to-date reader info, but that's not required for correct
578 * operation - all we need is to know the upper bound on the oldest reader,
579 * we don't care at all about the newest reader. So the only consequence of
580 * reading stale information here is that old pages might hang around a
581 * while longer before being reclaimed. That's actually good anyway, because
582 * the longer we delay reclaiming old pages, the more likely it is that a
583 * string of contiguous pages can be found after coalescing old pages from
584 * many old transactions together.
587 /** Number of slots in the reader table.
588 * This value was chosen somewhat arbitrarily. 126 readers plus a
589 * couple mutexes fit exactly into 8KB on my development machine.
590 * Applications should set the table size using #mdb_env_set_maxreaders().
592 #define DEFAULT_READERS 126
594 /** The size of a CPU cache line in bytes. We want our lock structures
595 * aligned to this size to avoid false cache line sharing in the
597 * This value works for most CPUs. For Itanium this should be 128.
603 /** The information we store in a single slot of the reader table.
604 * In addition to a transaction ID, we also record the process and
605 * thread ID that owns a slot, so that we can detect stale information,
606 * e.g. threads or processes that went away without cleaning up.
607 * @note We currently don't check for stale records. We simply re-init
608 * the table when we know that we're the only process opening the
611 typedef struct MDB_rxbody {
612 /** Current Transaction ID when this transaction began, or (txnid_t)-1.
613 * Multiple readers that start at the same time will probably have the
614 * same ID here. Again, it's not important to exclude them from
615 * anything; all we need to know is which version of the DB they
616 * started from so we can avoid overwriting any data used in that
617 * particular version.
619 volatile txnid_t mrb_txnid;
620 /** The process ID of the process owning this reader txn. */
621 volatile MDB_PID_T mrb_pid;
622 /** The thread ID of the thread owning this txn. */
623 volatile MDB_THR_T mrb_tid;
626 /** The actual reader record, with cacheline padding. */
627 typedef struct MDB_reader {
630 /** shorthand for mrb_txnid */
631 #define mr_txnid mru.mrx.mrb_txnid
632 #define mr_pid mru.mrx.mrb_pid
633 #define mr_tid mru.mrx.mrb_tid
634 /** cache line alignment */
635 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
639 /** The header for the reader table.
640 * The table resides in a memory-mapped file. (This is a different file
641 * than is used for the main database.)
643 * For POSIX the actual mutexes reside in the shared memory of this
644 * mapped file. On Windows, mutexes are named objects allocated by the
645 * kernel; we store the mutex names in this mapped file so that other
646 * processes can grab them. This same approach is also used on
647 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
648 * process-shared POSIX mutexes. For these cases where a named object
649 * is used, the object name is derived from a 64 bit FNV hash of the
650 * environment pathname. As such, naming collisions are extremely
651 * unlikely. If a collision occurs, the results are unpredictable.
653 typedef struct MDB_txbody {
654 /** Stamp identifying this as an LMDB file. It must be set
657 /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
660 char mtb_rmname[MNAME_LEN];
661 #elif defined(MDB_USE_SYSV_SEM)
664 /** Mutex protecting access to this table.
665 * This is the #MDB_MUTEX(env,r) reader table lock.
667 pthread_mutex_t mtb_rmutex;
669 /** The ID of the last transaction committed to the database.
670 * This is recorded here only for convenience; the value can always
671 * be determined by reading the main database meta pages.
673 volatile txnid_t mtb_txnid;
674 /** The number of slots that have been used in the reader table.
675 * This always records the maximum count, it is not decremented
676 * when readers release their slots.
678 volatile unsigned mtb_numreaders;
681 /** The actual reader table definition. */
682 typedef struct MDB_txninfo {
685 #define mti_magic mt1.mtb.mtb_magic
686 #define mti_format mt1.mtb.mtb_format
687 #define mti_rmutex mt1.mtb.mtb_rmutex
688 #define mti_rmname mt1.mtb.mtb_rmname
689 #define mti_txnid mt1.mtb.mtb_txnid
690 #define mti_numreaders mt1.mtb.mtb_numreaders
691 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
693 #ifdef MDB_USE_SYSV_SEM
694 #define mti_semid mt1.mtb.mtb_semid
698 char mt2_wmname[MNAME_LEN];
699 #define mti_wmname mt2.mt2_wmname
701 pthread_mutex_t mt2_wmutex;
702 #define mti_wmutex mt2.mt2_wmutex
704 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
707 MDB_reader mti_readers[1];
710 /** Lockfile format signature: version, features and field layout */
711 #define MDB_LOCK_FORMAT \
713 ((MDB_LOCK_VERSION) \
714 /* Flags which describe functionality */ \
715 + (((MNAME_LEN) == 0) << 18) /* MDB_USE_SYSV_SEM */ \
716 + (((MDB_PIDLOCK) != 0) << 16)))
719 /** Common header for all page types.
720 * Overflow records occupy a number of contiguous pages with no
721 * headers on any page after the first.
723 typedef struct MDB_page {
724 #define mp_pgno mp_p.p_pgno
725 #define mp_next mp_p.p_next
727 pgno_t p_pgno; /**< page number */
728 struct MDB_page *p_next; /**< for in-memory list of freed pages */
731 /** @defgroup mdb_page Page Flags
733 * Flags for the page headers.
736 #define P_BRANCH 0x01 /**< branch page */
737 #define P_LEAF 0x02 /**< leaf page */
738 #define P_OVERFLOW 0x04 /**< overflow page */
739 #define P_META 0x08 /**< meta page */
740 #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
741 #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
742 #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
743 #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
744 #define P_KEEP 0x8000 /**< leave this page alone during spill */
746 uint16_t mp_flags; /**< @ref mdb_page */
747 #define mp_lower mp_pb.pb.pb_lower
748 #define mp_upper mp_pb.pb.pb_upper
749 #define mp_pages mp_pb.pb_pages
752 indx_t pb_lower; /**< lower bound of free space */
753 indx_t pb_upper; /**< upper bound of free space */
755 uint32_t pb_pages; /**< number of overflow pages */
757 indx_t mp_ptrs[1]; /**< dynamic size */
760 /** Size of the page header, excluding dynamic data at the end */
761 #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))
763 /** Address of first usable data byte in a page, after the header */
764 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
766 /** ITS#7713, change PAGEBASE to handle 65536 byte pages */
767 #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)
769 /** Number of nodes on a page */
770 #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
772 /** The amount of space remaining in the page */
773 #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
775 /** The percentage of space used in the page, in tenths of a percent. */
776 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
777 ((env)->me_psize - PAGEHDRSZ))
778 /** The minimum page fill factor, in tenths of a percent.
779 * Pages emptier than this are candidates for merging.
781 #define FILL_THRESHOLD 250
783 /** Test if a page is a leaf page */
784 #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
785 /** Test if a page is a LEAF2 page */
786 #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
787 /** Test if a page is a branch page */
788 #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
789 /** Test if a page is an overflow page */
790 #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
791 /** Test if a page is a sub page */
792 #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
794 /** The number of overflow pages needed to store the given size. */
795 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
797 /** Link in #MDB_txn.%mt_loose_pgs list */
798 #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
800 /** Header for a single key/data pair within a page.
801 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
802 * We guarantee 2-byte alignment for 'MDB_node's.
804 typedef struct MDB_node {
805 /** lo and hi are used for data size on leaf nodes and for
806 * child pgno on branch nodes. On 64 bit platforms, flags
807 * is also used for pgno. (Branch nodes have no flags).
808 * They are in host byte order in case that lets some
809 * accesses be optimized into a 32-bit word access.
811 #if BYTE_ORDER == LITTLE_ENDIAN
812 unsigned short mn_lo, mn_hi; /**< part of data size or pgno */
814 unsigned short mn_hi, mn_lo;
816 /** @defgroup mdb_node Node Flags
818 * Flags for node headers.
821 #define F_BIGDATA 0x01 /**< data put on overflow page */
822 #define F_SUBDATA 0x02 /**< data is a sub-database */
823 #define F_DUPDATA 0x04 /**< data has duplicates */
825 /** valid flags for #mdb_node_add() */
826 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
829 unsigned short mn_flags; /**< @ref mdb_node */
830 unsigned short mn_ksize; /**< key size */
831 char mn_data[1]; /**< key and data are appended here */
834 /** Size of the node header, excluding dynamic data at the end */
835 #define NODESIZE offsetof(MDB_node, mn_data)
837 /** Bit position of top word in page number, for shifting mn_flags */
838 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
840 /** Size of a node in a branch page with a given key.
841 * This is just the node header plus the key, there is no data.
843 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
845 /** Size of a node in a leaf page with a given key and data.
846 * This is node header plus key plus data size.
848 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
850 /** Address of node \b i in page \b p */
851 #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
853 /** Address of the key for the node */
854 #define NODEKEY(node) (void *)((node)->mn_data)
856 /** Address of the data for a node */
857 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
859 /** Get the page number pointed to by a branch node */
860 #define NODEPGNO(node) \
861 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
862 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
863 /** Set the page number in a branch node */
864 #define SETPGNO(node,pgno) do { \
865 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
866 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
868 /** Get the size of the data in a leaf node */
869 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
870 /** Set the size of the data for a leaf node */
871 #define SETDSZ(node,size) do { \
872 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
873 /** The size of a key in a node */
874 #define NODEKSZ(node) ((node)->mn_ksize)
876 /** Copy a page number from src to dst */
878 #define COPY_PGNO(dst,src) dst = src
880 #if SIZE_MAX > 4294967295UL
881 #define COPY_PGNO(dst,src) do { \
882 unsigned short *s, *d; \
883 s = (unsigned short *)&(src); \
884 d = (unsigned short *)&(dst); \
891 #define COPY_PGNO(dst,src) do { \
892 unsigned short *s, *d; \
893 s = (unsigned short *)&(src); \
894 d = (unsigned short *)&(dst); \
900 /** The address of a key in a LEAF2 page.
901 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
902 * There are no node headers, keys are stored contiguously.
904 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
906 /** Set the \b node's key into \b keyptr, if requested. */
907 #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
908 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
910 /** Set the \b node's key into \b key. */
911 #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
913 /** Information about a single database in the environment. */
914 typedef struct MDB_db {
915 uint32_t md_pad; /**< also ksize for LEAF2 pages */
916 uint16_t md_flags; /**< @ref mdb_dbi_open */
917 uint16_t md_depth; /**< depth of this tree */
918 pgno_t md_branch_pages; /**< number of internal pages */
919 pgno_t md_leaf_pages; /**< number of leaf pages */
920 pgno_t md_overflow_pages; /**< number of overflow pages */
921 size_t md_entries; /**< number of data items */
922 pgno_t md_root; /**< the root page of this tree */
925 /** mdb_dbi_open flags */
926 #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
927 #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
928 #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
929 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
931 /** Handle for the DB used to track free pages. */
933 /** Handle for the default DB. */
936 /** Meta page content.
937 * A meta page is the start point for accessing a database snapshot.
938 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
940 typedef struct MDB_meta {
941 /** Stamp identifying this as an LMDB file. It must be set
944 /** Version number of this file. Must be set to #MDB_DATA_VERSION. */
946 void *mm_address; /**< address for fixed mapping */
947 size_t mm_mapsize; /**< size of mmap region */
948 MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */
949 /** The size of pages used in this DB */
950 #define mm_psize mm_dbs[0].md_pad
951 /** Any persistent environment flags. @ref mdb_env */
952 #define mm_flags mm_dbs[0].md_flags
953 pgno_t mm_last_pg; /**< last used page in file */
954 volatile txnid_t mm_txnid; /**< txnid that committed this page */
957 /** Buffer for a stack-allocated meta page.
958 * The members define size and alignment, and silence type
959 * aliasing warnings. They are not used directly; that could
960 * mean incorrectly using several union members in parallel.
962 typedef union MDB_metabuf {
965 char mm_pad[PAGEHDRSZ];
970 /** Auxiliary DB info.
971 * The information here is mostly static/read-only. There is
972 * only a single copy of this record in the environment.
974 typedef struct MDB_dbx {
975 MDB_val md_name; /**< name of the database */
976 MDB_cmp_func *md_cmp; /**< function for comparing keys */
977 MDB_cmp_func *md_dcmp; /**< function for comparing data items */
978 MDB_rel_func *md_rel; /**< user relocate function */
979 void *md_relctx; /**< user-provided context for md_rel */
982 /** A database transaction.
983 * Every operation requires a transaction handle.
986 MDB_txn *mt_parent; /**< parent of a nested txn */
987 MDB_txn *mt_child; /**< nested txn under this txn */
988 pgno_t mt_next_pgno; /**< next unallocated page */
989 /** The ID of this transaction. IDs are integers incrementing from 1.
990 * Only committed write transactions increment the ID. If a transaction
991 * aborts, the ID may be re-used by the next writer.
994 MDB_env *mt_env; /**< the DB environment */
995 /** The list of pages that became unused during this transaction.
998 /** The list of loose pages that became unused and may be reused
999 * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
1001 MDB_page *mt_loose_pgs;
1002 /* #Number of loose pages (#mt_loose_pgs) */
1004 /** The sorted list of dirty pages we temporarily wrote to disk
1005 * because the dirty list was full. page numbers in here are
1006 * shifted left by 1, deleted slots have the LSB set.
1008 MDB_IDL mt_spill_pgs;
1010 /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
1011 MDB_ID2L dirty_list;
1012 /** For read txns: This thread/txn's reader table slot, or NULL. */
1015 /** Array of records for each DB known in the environment. */
1017 /** Array of MDB_db records for each known DB */
1019 /** Array of sequence numbers for each DB handle */
1020 unsigned int *mt_dbiseqs;
1021 /** @defgroup mt_dbflag Transaction DB Flags
1025 #define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */
1026 #define DB_STALE 0x02 /**< Named-DB record is older than txnID */
1027 #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
1028 #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
1030 /** In write txns, array of cursors for each DB */
1031 MDB_cursor **mt_cursors;
1032 /** Array of flags for each DB */
1033 unsigned char *mt_dbflags;
1034 /** Number of DB records in use. This number only ever increments;
1035 * we don't decrement it when individual DB handles are closed.
1039 /** @defgroup mdb_txn Transaction Flags
1043 #define MDB_TXN_RDONLY 0x01 /**< read-only transaction */
1044 #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
1045 #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
1046 #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
1048 unsigned int mt_flags; /**< @ref mdb_txn */
1049 /** #dirty_list room: Array size - \#dirty pages visible to this txn.
1050 * Includes ancestor txns' dirty pages not hidden by other txns'
1051 * dirty/spilled pages. Thus commit(nested txn) has room to merge
1052 * dirty_list into mt_parent after freeing hidden mt_parent pages.
1054 unsigned int mt_dirty_room;
1057 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
1058 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
1059 * raise this on a 64 bit machine.
1061 #define CURSOR_STACK 32
1065 /** Cursors are used for all DB operations.
1066 * A cursor holds a path of (page pointer, key index) from the DB
1067 * root to a position in the DB, plus other state. #MDB_DUPSORT
1068 * cursors include an xcursor to the current data item. Write txns
1069 * track their cursors and keep them up to date when data moves.
1070 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
1071 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
1074 /** Next cursor on this DB in this txn */
1075 MDB_cursor *mc_next;
1076 /** Backup of the original cursor if this cursor is a shadow */
1077 MDB_cursor *mc_backup;
1078 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
1079 struct MDB_xcursor *mc_xcursor;
1080 /** The transaction that owns this cursor */
1082 /** The database handle this cursor operates on */
1084 /** The database record for this cursor */
1086 /** The database auxiliary record for this cursor */
1088 /** The @ref mt_dbflag for this database */
1089 unsigned char *mc_dbflag;
1090 unsigned short mc_snum; /**< number of pushed pages */
1091 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
1092 /** @defgroup mdb_cursor Cursor Flags
1094 * Cursor state flags.
1097 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
1098 #define C_EOF 0x02 /**< No more data */
1099 #define C_SUB 0x04 /**< Cursor is a sub-cursor */
1100 #define C_DEL 0x08 /**< last op was a cursor_del */
1101 #define C_SPLITTING 0x20 /**< Cursor is in page_split */
1102 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */
1104 unsigned int mc_flags; /**< @ref mdb_cursor */
1105 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
1106 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
1109 /** Context for sorted-dup records.
1110 * We could have gone to a fully recursive design, with arbitrarily
1111 * deep nesting of sub-databases. But for now we only handle these
1112 * levels - main DB, optional sub-DB, sorted-duplicate DB.
1114 typedef struct MDB_xcursor {
1115 /** A sub-cursor for traversing the Dup DB */
1116 MDB_cursor mx_cursor;
1117 /** The database record for this Dup DB */
1119 /** The auxiliary DB record for this Dup DB */
1121 /** The @ref mt_dbflag for this Dup DB */
1122 unsigned char mx_dbflag;
1125 /** State of FreeDB old pages, stored in the MDB_env */
1126 typedef struct MDB_pgstate {
1127 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
1128 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
1131 /** The database environment. */
1133 HANDLE me_fd; /**< The main data file */
1134 HANDLE me_lfd; /**< The lock file */
1135 HANDLE me_mfd; /**< just for writing the meta pages */
1136 /** Failed to update the meta page. Probably an I/O error. */
1137 #define MDB_FATAL_ERROR 0x80000000U
1138 /** Some fields are initialized. */
1139 #define MDB_ENV_ACTIVE 0x20000000U
1140 /** me_txkey is set */
1141 #define MDB_ENV_TXKEY 0x10000000U
1142 uint32_t me_flags; /**< @ref mdb_env */
1143 unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1144 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
1145 unsigned int me_maxreaders; /**< size of the reader table */
1146 unsigned int me_numreaders; /**< max numreaders set by this env */
1147 MDB_dbi me_numdbs; /**< number of DBs opened */
1148 MDB_dbi me_maxdbs; /**< size of the DB table */
1149 MDB_PID_T me_pid; /**< process ID of this env */
1150 char *me_path; /**< path to the DB files */
1151 char *me_map; /**< the memory map of the data file */
1152 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
1153 MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1154 void *me_pbuf; /**< scratch area for DUPSORT put() */
1155 MDB_txn *me_txn; /**< current write transaction */
1156 MDB_txn *me_txn0; /**< prealloc'd write transaction */
1157 size_t me_mapsize; /**< size of the data memory map */
1158 size_t me_size; /**< current file size */
1159 pgno_t me_maxpg; /**< me_mapsize / me_psize */
1160 MDB_dbx *me_dbxs; /**< array of static DB info */
1161 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
1162 unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
1163 pthread_key_t me_txkey; /**< thread-key for readers */
1164 txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
1165 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
1166 # define me_pglast me_pgstate.mf_pglast
1167 # define me_pghead me_pgstate.mf_pghead
1168 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
1169 /** IDL of pages that became unused in a write txn */
1170 MDB_IDL me_free_pgs;
1171 /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
1172 MDB_ID2L me_dirty_list;
1173 /** Max number of freelist items that can fit in a single overflow page */
1175 /** Max size of a node on a page */
1176 unsigned int me_nodemax;
1177 #if !(MDB_MAXKEYSIZE)
1178 unsigned int me_maxkey; /**< max size of a key */
1180 int me_live_reader; /**< have liveness lock in reader table */
1182 int me_pidquery; /**< Used in OpenProcess */
1184 #if defined(_WIN32) || defined(MDB_USE_SYSV_SEM)
1185 /* Windows mutexes/SysV semaphores do not reside in shared mem */
1186 mdb_mutex_t me_rmutex;
1187 mdb_mutex_t me_wmutex;
1189 void *me_userctx; /**< User-settable context */
1190 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
1193 /** Nested transaction */
1194 typedef struct MDB_ntxn {
1195 MDB_txn mnt_txn; /**< the transaction */
1196 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
1199 /** max number of pages to commit in one writev() call */
1200 #define MDB_COMMIT_PAGES 64
1201 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
1202 #undef MDB_COMMIT_PAGES
1203 #define MDB_COMMIT_PAGES IOV_MAX
1206 /** max bytes to write in one call */
1207 #define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4))
1209 /** Check \b txn and \b dbi arguments to a function */
1210 #define TXN_DBI_EXIST(txn, dbi) \
1211 ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID))
1213 /** Check for misused \b dbi handles */
1214 #define TXN_DBI_CHANGED(txn, dbi) \
1215 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1217 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1218 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1219 static int mdb_page_touch(MDB_cursor *mc);
1221 static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl);
1222 static int mdb_page_search_root(MDB_cursor *mc,
1223 MDB_val *key, int modify);
1224 #define MDB_PS_MODIFY 1
1225 #define MDB_PS_ROOTONLY 2
1226 #define MDB_PS_FIRST 4
1227 #define MDB_PS_LAST 8
1228 static int mdb_page_search(MDB_cursor *mc,
1229 MDB_val *key, int flags);
1230 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
1232 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
1233 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
1234 pgno_t newpgno, unsigned int nflags);
1236 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
1237 static int mdb_env_pick_meta(const MDB_env *env);
1238 static int mdb_env_write_meta(MDB_txn *txn);
1239 #if !(defined(_WIN32) || defined(MDB_USE_SYSV_SEM)) /* Drop unused excl arg */
1240 # define mdb_env_close0(env, excl) mdb_env_close1(env)
1242 static void mdb_env_close0(MDB_env *env, int excl);
1244 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
1245 static int mdb_node_add(MDB_cursor *mc, indx_t indx,
1246 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
1247 static void mdb_node_del(MDB_cursor *mc, int ksize);
1248 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
1249 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
1250 static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
1251 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
1252 static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
1254 static int mdb_rebalance(MDB_cursor *mc);
1255 static int mdb_update_key(MDB_cursor *mc, MDB_val *key);
1257 static void mdb_cursor_pop(MDB_cursor *mc);
1258 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
1260 static int mdb_cursor_del0(MDB_cursor *mc);
1261 static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
1262 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
1263 static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1264 static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1265 static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
1267 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1268 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1270 static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
1271 static void mdb_xcursor_init0(MDB_cursor *mc);
1272 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
1274 static int mdb_drop0(MDB_cursor *mc, int subs);
1275 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
1276 static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);
1279 static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
1283 static SECURITY_DESCRIPTOR mdb_null_sd;
1284 static SECURITY_ATTRIBUTES mdb_all_sa;
1285 static int mdb_sec_inited;
1288 /** Return the library version info. */
1290 mdb_version(int *major, int *minor, int *patch)
1292 if (major) *major = MDB_VERSION_MAJOR;
1293 if (minor) *minor = MDB_VERSION_MINOR;
1294 if (patch) *patch = MDB_VERSION_PATCH;
1295 return MDB_VERSION_STRING;
1298 /** Table of descriptions for LMDB @ref errors */
1299 static char *const mdb_errstr[] = {
1300 "MDB_KEYEXIST: Key/data pair already exists",
1301 "MDB_NOTFOUND: No matching key/data pair found",
1302 "MDB_PAGE_NOTFOUND: Requested page not found",
1303 "MDB_CORRUPTED: Located page was wrong type",
1304 "MDB_PANIC: Update of meta page failed or environment had fatal error",
1305 "MDB_VERSION_MISMATCH: Database environment version mismatch",
1306 "MDB_INVALID: File is not an LMDB file",
1307 "MDB_MAP_FULL: Environment mapsize limit reached",
1308 "MDB_DBS_FULL: Environment maxdbs limit reached",
1309 "MDB_READERS_FULL: Environment maxreaders limit reached",
1310 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
1311 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
1312 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
1313 "MDB_PAGE_FULL: Internal error - page has no more space",
1314 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
1315 "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
1316 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1317 "MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
1318 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1319 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1323 mdb_strerror(int err)
1326 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1327 * This works as long as no function between the call to mdb_strerror
1328 * and the actual use of the message uses more than 4K of stack.
1331 char buf[1024], *ptr = buf;
1335 return ("Successful return: 0");
1337 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
1338 i = err - MDB_KEYEXIST;
1339 return mdb_errstr[i];
1343 /* These are the C-runtime error codes we use. The comment indicates
1344 * their numeric value, and the Win32 error they would correspond to
1345 * if the error actually came from a Win32 API. A major mess, we should
1346 * have used LMDB-specific error codes for everything.
1349 case ENOENT: /* 2, FILE_NOT_FOUND */
1350 case EIO: /* 5, ACCESS_DENIED */
1351 case ENOMEM: /* 12, INVALID_ACCESS */
1352 case EACCES: /* 13, INVALID_DATA */
1353 case EBUSY: /* 16, CURRENT_DIRECTORY */
1354 case EINVAL: /* 22, BAD_COMMAND */
1355 case ENOSPC: /* 28, OUT_OF_PAPER */
1356 return strerror(err);
1361 FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM |
1362 FORMAT_MESSAGE_IGNORE_INSERTS,
1363 NULL, err, 0, ptr, sizeof(buf), (va_list *)pad);
1366 return strerror(err);
1370 /** assert(3) variant in cursor context */
1371 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1372 /** assert(3) variant in transaction context */
1373 #define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr)
1374 /** assert(3) variant in environment context */
1375 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr)
1378 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1379 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
1382 mdb_assert_fail(MDB_env *env, const char *expr_txt,
1383 const char *func, const char *file, int line)
1386 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1387 file, line, expr_txt, func);
1388 if (env->me_assert_func)
1389 env->me_assert_func(env, buf);
1390 fprintf(stderr, "%s\n", buf);
1394 # define mdb_assert0(env, expr, expr_txt) ((void) 0)
1398 /** Return the page number of \b mp which may be sub-page, for debug output */
1400 mdb_dbg_pgno(MDB_page *mp)
1403 COPY_PGNO(ret, mp->mp_pgno);
1407 /** Display a key in hexadecimal and return the address of the result.
1408 * @param[in] key the key to display
1409 * @param[in] buf the buffer to write into. Should always be #DKBUF.
1410 * @return The key in hexadecimal form.
1413 mdb_dkey(MDB_val *key, char *buf)
1416 unsigned char *c = key->mv_data;
1422 if (key->mv_size > DKBUF_MAXKEYSIZE)
1423 return "MDB_MAXKEYSIZE";
1424 /* may want to make this a dynamic check: if the key is mostly
1425 * printable characters, print it as-is instead of converting to hex.
1429 for (i=0; i<key->mv_size; i++)
1430 ptr += sprintf(ptr, "%02x", *c++);
1432 sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1438 mdb_leafnode_type(MDB_node *n)
1440 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1441 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1442 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1445 /** Display all the keys in the page. */
1447 mdb_page_list(MDB_page *mp)
1449 pgno_t pgno = mdb_dbg_pgno(mp);
1450 const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1452 unsigned int i, nkeys, nsize, total = 0;
1456 switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
1457 case P_BRANCH: type = "Branch page"; break;
1458 case P_LEAF: type = "Leaf page"; break;
1459 case P_LEAF|P_SUBP: type = "Sub-page"; break;
1460 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
1461 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
1463 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n",
1464 pgno, mp->mp_pages, state);
1467 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n",
1468 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid);
1471 fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags);
1475 nkeys = NUMKEYS(mp);
1476 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state);
1478 for (i=0; i<nkeys; i++) {
1479 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
1480 key.mv_size = nsize = mp->mp_pad;
1481 key.mv_data = LEAF2KEY(mp, i, nsize);
1483 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1486 node = NODEPTR(mp, i);
1487 key.mv_size = node->mn_ksize;
1488 key.mv_data = node->mn_data;
1489 nsize = NODESIZE + key.mv_size;
1490 if (IS_BRANCH(mp)) {
1491 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
1495 if (F_ISSET(node->mn_flags, F_BIGDATA))
1496 nsize += sizeof(pgno_t);
1498 nsize += NODEDSZ(node);
1500 nsize += sizeof(indx_t);
1501 fprintf(stderr, "key %d: nsize %d, %s%s\n",
1502 i, nsize, DKEY(&key), mdb_leafnode_type(node));
1504 total = EVEN(total);
1506 fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1507 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1511 mdb_cursor_chk(MDB_cursor *mc)
1517 if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
1518 for (i=0; i<mc->mc_top; i++) {
1520 node = NODEPTR(mp, mc->mc_ki[i]);
1521 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1524 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1530 /** Count all the pages in each DB and in the freelist
1531 * and make sure it matches the actual number of pages
1533 * All named DBs must be open for a correct count.
1535 static void mdb_audit(MDB_txn *txn)
1539 MDB_ID freecount, count;
1544 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
1545 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
1546 freecount += *(MDB_ID *)data.mv_data;
1547 mdb_tassert(txn, rc == MDB_NOTFOUND);
1550 for (i = 0; i<txn->mt_numdbs; i++) {
1552 if (!(txn->mt_dbflags[i] & DB_VALID))
1554 mdb_cursor_init(&mc, txn, i, &mx);
1555 if (txn->mt_dbs[i].md_root == P_INVALID)
1557 count += txn->mt_dbs[i].md_branch_pages +
1558 txn->mt_dbs[i].md_leaf_pages +
1559 txn->mt_dbs[i].md_overflow_pages;
1560 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1561 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
1562 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) {
1565 mp = mc.mc_pg[mc.mc_top];
1566 for (j=0; j<NUMKEYS(mp); j++) {
1567 MDB_node *leaf = NODEPTR(mp, j);
1568 if (leaf->mn_flags & F_SUBDATA) {
1570 memcpy(&db, NODEDATA(leaf), sizeof(db));
1571 count += db.md_branch_pages + db.md_leaf_pages +
1572 db.md_overflow_pages;
1576 mdb_tassert(txn, rc == MDB_NOTFOUND);
1579 if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) {
1580 fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n",
1581 txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno);
1587 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1589 return txn->mt_dbxs[dbi].md_cmp(a, b);
1593 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1595 return txn->mt_dbxs[dbi].md_dcmp(a, b);
1598 /** Allocate memory for a page.
1599 * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1602 mdb_page_malloc(MDB_txn *txn, unsigned num)
1604 MDB_env *env = txn->mt_env;
1605 MDB_page *ret = env->me_dpages;
1606 size_t psize = env->me_psize, sz = psize, off;
1607 /* For ! #MDB_NOMEMINIT, psize counts how much to init.
1608 * For a single page alloc, we init everything after the page header.
1609 * For multi-page, we init the final page; if the caller needed that
1610 * many pages they will be filling in at least up to the last page.
1614 VGMEMP_ALLOC(env, ret, sz);
1615 VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1616 env->me_dpages = ret->mp_next;
1619 psize -= off = PAGEHDRSZ;
1624 if ((ret = malloc(sz)) != NULL) {
1625 VGMEMP_ALLOC(env, ret, sz);
1626 if (!(env->me_flags & MDB_NOMEMINIT)) {
1627 memset((char *)ret + off, 0, psize);
1631 txn->mt_flags |= MDB_TXN_ERROR;
1635 /** Free a single page.
1636 * Saves single pages to a list, for future reuse.
1637 * (This is not used for multi-page overflow pages.)
1640 mdb_page_free(MDB_env *env, MDB_page *mp)
1642 mp->mp_next = env->me_dpages;
1643 VGMEMP_FREE(env, mp);
1644 env->me_dpages = mp;
1647 /** Free a dirty page */
1649 mdb_dpage_free(MDB_env *env, MDB_page *dp)
1651 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
1652 mdb_page_free(env, dp);
1654 /* large pages just get freed directly */
1655 VGMEMP_FREE(env, dp);
1660 /** Return all dirty pages to dpage list */
1662 mdb_dlist_free(MDB_txn *txn)
1664 MDB_env *env = txn->mt_env;
1665 MDB_ID2L dl = txn->mt_u.dirty_list;
1666 unsigned i, n = dl[0].mid;
1668 for (i = 1; i <= n; i++) {
1669 mdb_dpage_free(env, dl[i].mptr);
1674 /** Loosen or free a single page.
1675 * Saves single pages to a list for future reuse
1676 * in this same txn. It has been pulled from the freeDB
1677 * and already resides on the dirty list, but has been
1678 * deleted. Use these pages first before pulling again
1681 * If the page wasn't dirtied in this txn, just add it
1682 * to this txn's free list.
1685 mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
1688 pgno_t pgno = mp->mp_pgno;
1689 MDB_txn *txn = mc->mc_txn;
1691 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
1692 if (txn->mt_parent) {
1693 MDB_ID2 *dl = txn->mt_u.dirty_list;
1694 /* If txn has a parent, make sure the page is in our
1698 unsigned x = mdb_mid2l_search(dl, pgno);
1699 if (x <= dl[0].mid && dl[x].mid == pgno) {
1700 if (mp != dl[x].mptr) { /* bad cursor? */
1701 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
1702 txn->mt_flags |= MDB_TXN_ERROR;
1703 return MDB_CORRUPTED;
1710 /* no parent txn, so it's just ours */
1715 DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
1717 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
1718 txn->mt_loose_pgs = mp;
1719 txn->mt_loose_count++;
1720 mp->mp_flags |= P_LOOSE;
1722 int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
1730 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1731 * @param[in] mc A cursor handle for the current operation.
1732 * @param[in] pflags Flags of the pages to update:
1733 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
1734 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
1735 * @return 0 on success, non-zero on failure.
1738 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1740 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
1741 MDB_txn *txn = mc->mc_txn;
1747 int rc = MDB_SUCCESS, level;
1749 /* Mark pages seen by cursors */
1750 if (mc->mc_flags & C_UNTRACK)
1751 mc = NULL; /* will find mc in mt_cursors */
1752 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
1753 for (; mc; mc=mc->mc_next) {
1754 if (!(mc->mc_flags & C_INITIALIZED))
1756 for (m3 = mc;; m3 = &mx->mx_cursor) {
1758 for (j=0; j<m3->mc_snum; j++) {
1760 if ((mp->mp_flags & Mask) == pflags)
1761 mp->mp_flags ^= P_KEEP;
1763 mx = m3->mc_xcursor;
1764 /* Proceed to mx if it is at a sub-database */
1765 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
1767 if (! (mp && (mp->mp_flags & P_LEAF)))
1769 leaf = NODEPTR(mp, m3->mc_ki[j-1]);
1770 if (!(leaf->mn_flags & F_SUBDATA))
1779 /* Mark dirty root pages */
1780 for (i=0; i<txn->mt_numdbs; i++) {
1781 if (txn->mt_dbflags[i] & DB_DIRTY) {
1782 pgno_t pgno = txn->mt_dbs[i].md_root;
1783 if (pgno == P_INVALID)
1785 if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
1787 if ((dp->mp_flags & Mask) == pflags && level <= 1)
1788 dp->mp_flags ^= P_KEEP;
1796 static int mdb_page_flush(MDB_txn *txn, int keep);
1798 /** Spill pages from the dirty list back to disk.
1799 * This is intended to prevent running into #MDB_TXN_FULL situations,
1800 * but note that they may still occur in a few cases:
1801 * 1) our estimate of the txn size could be too small. Currently this
1802 * seems unlikely, except with a large number of #MDB_MULTIPLE items.
1803 * 2) child txns may run out of space if their parents dirtied a
1804 * lot of pages and never spilled them. TODO: we probably should do
1805 * a preemptive spill during #mdb_txn_begin() of a child txn, if
1806 * the parent's dirty_room is below a given threshold.
1808 * Otherwise, if not using nested txns, it is expected that apps will
1809 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
1810 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
1811 * If the txn never references them again, they can be left alone.
1812 * If the txn only reads them, they can be used without any fuss.
1813 * If the txn writes them again, they can be dirtied immediately without
1814 * going thru all of the work of #mdb_page_touch(). Such references are
1815 * handled by #mdb_page_unspill().
1817 * Also note, we never spill DB root pages, nor pages of active cursors,
1818 * because we'll need these back again soon anyway. And in nested txns,
1819 * we can't spill a page in a child txn if it was already spilled in a
1820 * parent txn. That would alter the parent txns' data even though
1821 * the child hasn't committed yet, and we'd have no way to undo it if
1822 * the child aborted.
1824 * @param[in] m0 cursor A cursor handle identifying the transaction and
1825 * database for which we are checking space.
1826 * @param[in] key For a put operation, the key being stored.
1827 * @param[in] data For a put operation, the data being stored.
1828 * @return 0 on success, non-zero on failure.
1831 mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1833 MDB_txn *txn = m0->mc_txn;
1835 MDB_ID2L dl = txn->mt_u.dirty_list;
1836 unsigned int i, j, need;
1839 if (m0->mc_flags & C_SUB)
1842 /* Estimate how much space this op will take */
1843 i = m0->mc_db->md_depth;
1844 /* Named DBs also dirty the main DB */
1845 if (m0->mc_dbi > MAIN_DBI)
1846 i += txn->mt_dbs[MAIN_DBI].md_depth;
1847 /* For puts, roughly factor in the key+data size */
1849 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
1850 i += i; /* double it for good measure */
1853 if (txn->mt_dirty_room > i)
1856 if (!txn->mt_spill_pgs) {
1857 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
1858 if (!txn->mt_spill_pgs)
1861 /* purge deleted slots */
1862 MDB_IDL sl = txn->mt_spill_pgs;
1863 unsigned int num = sl[0];
1865 for (i=1; i<=num; i++) {
1872 /* Preserve pages which may soon be dirtied again */
1873 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
1876 /* Less aggressive spill - we originally spilled the entire dirty list,
1877 * with a few exceptions for cursor pages and DB root pages. But this
1878 * turns out to be a lot of wasted effort because in a large txn many
1879 * of those pages will need to be used again. So now we spill only 1/8th
1880 * of the dirty pages. Testing revealed this to be a good tradeoff,
1881 * better than 1/2, 1/4, or 1/10.
1883 if (need < MDB_IDL_UM_MAX / 8)
1884 need = MDB_IDL_UM_MAX / 8;
1886 /* Save the page IDs of all the pages we're flushing */
1887 /* flush from the tail forward, this saves a lot of shifting later on. */
1888 for (i=dl[0].mid; i && need; i--) {
1889 MDB_ID pn = dl[i].mid << 1;
1891 if (dp->mp_flags & (P_LOOSE|P_KEEP))
1893 /* Can't spill twice, make sure it's not already in a parent's
1896 if (txn->mt_parent) {
1898 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1899 if (tx2->mt_spill_pgs) {
1900 j = mdb_midl_search(tx2->mt_spill_pgs, pn);
1901 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
1902 dp->mp_flags |= P_KEEP;
1910 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
1914 mdb_midl_sort(txn->mt_spill_pgs);
1916 /* Flush the spilled part of dirty list */
1917 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
1920 /* Reset any dirty pages we kept that page_flush didn't see */
1921 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
1924 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
1928 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */
1930 mdb_find_oldest(MDB_txn *txn)
1933 txnid_t mr, oldest = txn->mt_txnid - 1;
1934 if (txn->mt_env->me_txns) {
1935 MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1936 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1947 /** Add a page to the txn's dirty list */
1949 mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
1952 int rc, (*insert)(MDB_ID2L, MDB_ID2 *);
1954 if (txn->mt_env->me_flags & MDB_WRITEMAP) {
1955 insert = mdb_mid2l_append;
1957 insert = mdb_mid2l_insert;
1959 mid.mid = mp->mp_pgno;
1961 rc = insert(txn->mt_u.dirty_list, &mid);
1962 mdb_tassert(txn, rc == 0);
1963 txn->mt_dirty_room--;
1966 /** Allocate page numbers and memory for writing. Maintain me_pglast,
1967 * me_pghead and mt_next_pgno.
1969 * If there are free pages available from older transactions, they
1970 * are re-used first. Otherwise allocate a new page at mt_next_pgno.
1971 * Do not modify the freedB, just merge freeDB records into me_pghead[]
1972 * and move me_pglast to say which records were consumed. Only this
1973 * function can create me_pghead and move me_pglast/mt_next_pgno.
1974 * @param[in] mc cursor A cursor handle identifying the transaction and
1975 * database for which we are allocating.
1976 * @param[in] num the number of pages to allocate.
1977 * @param[out] mp Address of the allocated page(s). Requests for multiple pages
1978 * will always be satisfied by a single contiguous chunk of memory.
1979 * @return 0 on success, non-zero on failure.
1982 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1984 #ifdef MDB_PARANOID /* Seems like we can ignore this now */
1985 /* Get at most <Max_retries> more freeDB records once me_pghead
1986 * has enough pages. If not enough, use new pages from the map.
1987 * If <Paranoid> and mc is updating the freeDB, only get new
1988 * records if me_pghead is empty. Then the freelist cannot play
1989 * catch-up with itself by growing while trying to save it.
1991 enum { Paranoid = 1, Max_retries = 500 };
1993 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
1995 int rc, retry = num * 60;
1996 MDB_txn *txn = mc->mc_txn;
1997 MDB_env *env = txn->mt_env;
1998 pgno_t pgno, *mop = env->me_pghead;
1999 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
2001 txnid_t oldest = 0, last;
2006 /* If there are any loose pages, just use them */
2007 if (num == 1 && txn->mt_loose_pgs) {
2008 np = txn->mt_loose_pgs;
2009 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
2010 txn->mt_loose_count--;
2011 DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
2019 /* If our dirty list is already full, we can't do anything */
2020 if (txn->mt_dirty_room == 0) {
2025 for (op = MDB_FIRST;; op = MDB_NEXT) {
2030 /* Seek a big enough contiguous page range. Prefer
2031 * pages at the tail, just truncating the list.
2037 if (mop[i-n2] == pgno+n2)
2044 if (op == MDB_FIRST) { /* 1st iteration */
2045 /* Prepare to fetch more and coalesce */
2046 last = env->me_pglast;
2047 oldest = env->me_pgoldest;
2048 mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
2051 key.mv_data = &last; /* will look up last+1 */
2052 key.mv_size = sizeof(last);
2054 if (Paranoid && mc->mc_dbi == FREE_DBI)
2057 if (Paranoid && retry < 0 && mop_len)
2061 /* Do not fetch more if the record will be too recent */
2062 if (oldest <= last) {
2064 oldest = mdb_find_oldest(txn);
2065 env->me_pgoldest = oldest;
2071 rc = mdb_cursor_get(&m2, &key, NULL, op);
2073 if (rc == MDB_NOTFOUND)
2077 last = *(txnid_t*)key.mv_data;
2078 if (oldest <= last) {
2080 oldest = mdb_find_oldest(txn);
2081 env->me_pgoldest = oldest;
2087 np = m2.mc_pg[m2.mc_top];
2088 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
2089 if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
2092 idl = (MDB_ID *) data.mv_data;
2095 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
2100 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
2102 mop = env->me_pghead;
2104 env->me_pglast = last;
2106 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
2107 last, txn->mt_dbs[FREE_DBI].md_root, i));
2109 DPRINTF(("IDL %"Z"u", idl[j]));
2111 /* Merge in descending sorted order */
2112 mdb_midl_xmerge(mop, idl);
2116 /* Use new pages from the map when nothing suitable in the freeDB */
2118 pgno = txn->mt_next_pgno;
2119 if (pgno + num >= env->me_maxpg) {
2120 DPUTS("DB size maxed out");
2126 if (env->me_flags & MDB_WRITEMAP) {
2127 np = (MDB_page *)(env->me_map + env->me_psize * pgno);
2129 if (!(np = mdb_page_malloc(txn, num))) {
2135 mop[0] = mop_len -= num;
2136 /* Move any stragglers down */
2137 for (j = i-num; j < mop_len; )
2138 mop[++j] = mop[++i];
2140 txn->mt_next_pgno = pgno + num;
2143 mdb_page_dirty(txn, np);
2149 txn->mt_flags |= MDB_TXN_ERROR;
2153 /** Copy the used portions of a non-overflow page.
2154 * @param[in] dst page to copy into
2155 * @param[in] src page to copy from
2156 * @param[in] psize size of a page
2159 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
2161 enum { Align = sizeof(pgno_t) };
2162 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
2164 /* If page isn't full, just copy the used portion. Adjust
2165 * alignment so memcpy may copy words instead of bytes.
2167 if ((unused &= -Align) && !IS_LEAF2(src)) {
2168 upper = (upper + PAGEBASE) & -Align;
2169 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
2170 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
2173 memcpy(dst, src, psize - unused);
2177 /** Pull a page off the txn's spill list, if present.
2178 * If a page being referenced was spilled to disk in this txn, bring
2179 * it back and make it dirty/writable again.
2180 * @param[in] txn the transaction handle.
2181 * @param[in] mp the page being referenced. It must not be dirty.
2182 * @param[out] ret the writable page, if any. ret is unchanged if
2183 * mp wasn't spilled.
2186 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
2188 MDB_env *env = txn->mt_env;
2191 pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
2193 for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
2194 if (!tx2->mt_spill_pgs)
2196 x = mdb_midl_search(tx2->mt_spill_pgs, pn);
2197 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
2200 if (txn->mt_dirty_room == 0)
2201 return MDB_TXN_FULL;
2202 if (IS_OVERFLOW(mp))
2206 if (env->me_flags & MDB_WRITEMAP) {
2209 np = mdb_page_malloc(txn, num);
2213 memcpy(np, mp, num * env->me_psize);
2215 mdb_page_copy(np, mp, env->me_psize);
2218 /* If in current txn, this page is no longer spilled.
2219 * If it happens to be the last page, truncate the spill list.
2220 * Otherwise mark it as deleted by setting the LSB.
2222 if (x == txn->mt_spill_pgs[0])
2223 txn->mt_spill_pgs[0]--;
2225 txn->mt_spill_pgs[x] |= 1;
2226 } /* otherwise, if belonging to a parent txn, the
2227 * page remains spilled until child commits
2230 mdb_page_dirty(txn, np);
2231 np->mp_flags |= P_DIRTY;
2239 /** Touch a page: make it dirty and re-insert into tree with updated pgno.
2240 * @param[in] mc cursor pointing to the page to be touched
2241 * @return 0 on success, non-zero on failure.
2244 mdb_page_touch(MDB_cursor *mc)
2246 MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
2247 MDB_txn *txn = mc->mc_txn;
2248 MDB_cursor *m2, *m3;
2252 if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
2253 if (txn->mt_flags & MDB_TXN_SPILLS) {
2255 rc = mdb_page_unspill(txn, mp, &np);
2261 if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
2262 (rc = mdb_page_alloc(mc, 1, &np)))
2265 DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
2266 mp->mp_pgno, pgno));
2267 mdb_cassert(mc, mp->mp_pgno != pgno);
2268 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2269 /* Update the parent page, if any, to point to the new page */
2271 MDB_page *parent = mc->mc_pg[mc->mc_top-1];
2272 MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2273 SETPGNO(node, pgno);
2275 mc->mc_db->md_root = pgno;
2277 } else if (txn->mt_parent && !IS_SUBP(mp)) {
2278 MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2280 /* If txn has a parent, make sure the page is in our
2284 unsigned x = mdb_mid2l_search(dl, pgno);
2285 if (x <= dl[0].mid && dl[x].mid == pgno) {
2286 if (mp != dl[x].mptr) { /* bad cursor? */
2287 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2288 txn->mt_flags |= MDB_TXN_ERROR;
2289 return MDB_CORRUPTED;
2294 mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
2296 np = mdb_page_malloc(txn, 1);
2301 rc = mdb_mid2l_insert(dl, &mid);
2302 mdb_cassert(mc, rc == 0);
2307 mdb_page_copy(np, mp, txn->mt_env->me_psize);
2309 np->mp_flags |= P_DIRTY;
2312 /* Adjust cursors pointing to mp */
2313 mc->mc_pg[mc->mc_top] = np;
2314 m2 = txn->mt_cursors[mc->mc_dbi];
2315 if (mc->mc_flags & C_SUB) {
2316 for (; m2; m2=m2->mc_next) {
2317 m3 = &m2->mc_xcursor->mx_cursor;
2318 if (m3->mc_snum < mc->mc_snum) continue;
2319 if (m3->mc_pg[mc->mc_top] == mp)
2320 m3->mc_pg[mc->mc_top] = np;
2323 for (; m2; m2=m2->mc_next) {
2324 if (m2->mc_snum < mc->mc_snum) continue;
2325 if (m2->mc_pg[mc->mc_top] == mp) {
2326 m2->mc_pg[mc->mc_top] = np;
2327 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
2329 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
2331 MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]);
2332 if (!(leaf->mn_flags & F_SUBDATA))
2333 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
2341 txn->mt_flags |= MDB_TXN_ERROR;
2345 /* internal env_sync flags: */
2346 #define FORCE 1 /* as before, force a flush */
2347 #define FGREW 0x8000 /* file has grown, do a full fsync instead of just
2348 fdatasync. We shouldn't have to do this, according to the POSIX spec.
2349 But common Linux FSs violate the spec and won't sync required metadata
2350 correctly when the file grows. This only makes a difference if the
2351 platform actually distinguishes fdatasync from fsync.
2352 http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */
2355 mdb_env_sync0(MDB_env *env, int flag)
2357 int rc = 0, force = flag & FORCE;
2358 if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
2359 if (env->me_flags & MDB_WRITEMAP) {
2360 int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
2361 ? MS_ASYNC : MS_SYNC;
2362 if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
2365 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
2369 #ifdef HAVE_FDATASYNC
2371 if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */
2375 if (MDB_FDATASYNC(env->me_fd))
2383 mdb_env_sync(MDB_env *env, int force)
2385 return mdb_env_sync0(env, force != 0);
2388 /** Back up parent txn's cursors, then grab the originals for tracking */
2390 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
2392 MDB_cursor *mc, *bk;
2397 for (i = src->mt_numdbs; --i >= 0; ) {
2398 if ((mc = src->mt_cursors[i]) != NULL) {
2399 size = sizeof(MDB_cursor);
2401 size += sizeof(MDB_xcursor);
2402 for (; mc; mc = bk->mc_next) {
2408 mc->mc_db = &dst->mt_dbs[i];
2409 /* Kill pointers into src - and dst to reduce abuse: The
2410 * user may not use mc until dst ends. Otherwise we'd...
2412 mc->mc_txn = NULL; /* ...set this to dst */
2413 mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */
2414 if ((mx = mc->mc_xcursor) != NULL) {
2415 *(MDB_xcursor *)(bk+1) = *mx;
2416 mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
2418 mc->mc_next = dst->mt_cursors[i];
2419 dst->mt_cursors[i] = mc;
2426 /** Close this write txn's cursors, give parent txn's cursors back to parent.
2427 * @param[in] txn the transaction handle.
2428 * @param[in] merge true to keep changes to parent cursors, false to revert.
2429 * @return 0 on success, non-zero on failure.
2432 mdb_cursors_close(MDB_txn *txn, unsigned merge)
2434 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2438 for (i = txn->mt_numdbs; --i >= 0; ) {
2439 for (mc = cursors[i]; mc; mc = next) {
2441 if ((bk = mc->mc_backup) != NULL) {
2443 /* Commit changes to parent txn */
2444 mc->mc_next = bk->mc_next;
2445 mc->mc_backup = bk->mc_backup;
2446 mc->mc_txn = bk->mc_txn;
2447 mc->mc_db = bk->mc_db;
2448 mc->mc_dbflag = bk->mc_dbflag;
2449 if ((mx = mc->mc_xcursor) != NULL)
2450 mx->mx_cursor.mc_txn = bk->mc_txn;
2452 /* Abort nested txn */
2454 if ((mx = mc->mc_xcursor) != NULL)
2455 *mx = *(MDB_xcursor *)(bk+1);
2459 /* Only malloced cursors are permanently tracked. */
2467 #define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn)
2470 mdb_txn_reset0(MDB_txn *txn, const char *act);
2472 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2478 Pidset = F_SETLK, Pidcheck = F_GETLK
2482 /** Set or check a pid lock. Set returns 0 on success.
2483 * Check returns 0 if the process is certainly dead, nonzero if it may
2484 * be alive (the lock exists or an error happened so we do not know).
2486 * On Windows Pidset is a no-op, we merely check for the existence
2487 * of the process with the given pid. On POSIX we use a single byte
2488 * lock on the lockfile, set at an offset equal to the pid.
2491 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2493 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2496 if (op == Pidcheck) {
2497 h = OpenProcess(env->me_pidquery, FALSE, pid);
2498 /* No documented "no such process" code, but other program use this: */
2500 return ErrCode() != ERROR_INVALID_PARAMETER;
2501 /* A process exists until all handles to it close. Has it exited? */
2502 ret = WaitForSingleObject(h, 0) != 0;
2509 struct flock lock_info;
2510 memset(&lock_info, 0, sizeof(lock_info));
2511 lock_info.l_type = F_WRLCK;
2512 lock_info.l_whence = SEEK_SET;
2513 lock_info.l_start = pid;
2514 lock_info.l_len = 1;
2515 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
2516 if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2518 } else if ((rc = ErrCode()) == EINTR) {
2526 /** Common code for #mdb_txn_begin() and #mdb_txn_renew().
2527 * @param[in] txn the transaction handle to initialize
2528 * @return 0 on success, non-zero on failure.
2531 mdb_txn_renew0(MDB_txn *txn)
2533 MDB_env *env = txn->mt_env;
2534 MDB_txninfo *ti = env->me_txns;
2538 int rc, new_notls = 0;
2540 if (txn->mt_flags & MDB_TXN_RDONLY) {
2542 txn->mt_numdbs = env->me_numdbs;
2543 txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
2545 meta = env->me_metas[ mdb_env_pick_meta(env) ];
2546 txn->mt_txnid = meta->mm_txnid;
2547 txn->mt_u.reader = NULL;
2549 MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
2550 pthread_getspecific(env->me_txkey);
2552 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2553 return MDB_BAD_RSLOT;
2555 MDB_PID_T pid = env->me_pid;
2556 MDB_THR_T tid = pthread_self();
2557 mdb_mutex_t *rmutex = MDB_MUTEX(env, r);
2559 if (!env->me_live_reader) {
2560 rc = mdb_reader_pid(env, Pidset, pid);
2563 env->me_live_reader = 1;
2566 if (LOCK_MUTEX(rc, env, rmutex))
2568 nr = ti->mti_numreaders;
2569 for (i=0; i<nr; i++)
2570 if (ti->mti_readers[i].mr_pid == 0)
2572 if (i == env->me_maxreaders) {
2573 UNLOCK_MUTEX(rmutex);
2574 return MDB_READERS_FULL;
2576 r = &ti->mti_readers[i];
2577 r->mr_txnid = (txnid_t)-1;
2579 r->mr_pid = pid; /* should be written last, see ITS#7971. */
2581 ti->mti_numreaders = ++nr;
2582 /* Save numreaders for un-mutexed mdb_env_close() */
2583 env->me_numreaders = nr;
2584 UNLOCK_MUTEX(rmutex);
2586 new_notls = (env->me_flags & MDB_NOTLS);
2587 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
2592 do /* LY: Retry on a race, ITS#7970. */
2593 r->mr_txnid = ti->mti_txnid;
2594 while(r->mr_txnid != ti->mti_txnid);
2595 txn->mt_txnid = r->mr_txnid;
2596 txn->mt_u.reader = r;
2597 meta = env->me_metas[txn->mt_txnid & 1];
2601 if (LOCK_MUTEX(rc, env, MDB_MUTEX(env, w)))
2603 #ifdef MDB_USE_SYSV_SEM
2604 meta = env->me_metas[ mdb_env_pick_meta(env) ];
2605 txn->mt_txnid = meta->mm_txnid;
2606 /* Update mti_txnid like mdb_mutex_failed() would,
2607 * in case last writer crashed before updating it.
2609 ti->mti_txnid = txn->mt_txnid;
2611 txn->mt_txnid = ti->mti_txnid;
2612 meta = env->me_metas[txn->mt_txnid & 1];
2615 meta = env->me_metas[ mdb_env_pick_meta(env) ];
2616 txn->mt_txnid = meta->mm_txnid;
2619 txn->mt_numdbs = env->me_numdbs;
2622 if (txn->mt_txnid == mdb_debug_start)
2626 txn->mt_child = NULL;
2627 txn->mt_loose_pgs = NULL;
2628 txn->mt_loose_count = 0;
2629 txn->mt_dirty_room = MDB_IDL_UM_MAX;
2630 txn->mt_u.dirty_list = env->me_dirty_list;
2631 txn->mt_u.dirty_list[0].mid = 0;
2632 txn->mt_free_pgs = env->me_free_pgs;
2633 txn->mt_free_pgs[0] = 0;
2634 txn->mt_spill_pgs = NULL;
2636 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
2639 /* Copy the DB info and flags */
2640 memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
2642 /* Moved to here to avoid a data race in read TXNs */
2643 txn->mt_next_pgno = meta->mm_last_pg+1;
2645 for (i=2; i<txn->mt_numdbs; i++) {
2646 x = env->me_dbflags[i];
2647 txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
2648 txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
2650 txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
2652 if (env->me_maxpg < txn->mt_next_pgno) {
2653 mdb_txn_reset0(txn, "renew0-mapfail");
2655 txn->mt_u.reader->mr_pid = 0;
2656 txn->mt_u.reader = NULL;
2658 return MDB_MAP_RESIZED;
2665 mdb_txn_renew(MDB_txn *txn)
2669 if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */
2672 if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
2673 DPUTS("environment had fatal error, must shutdown!");
2677 rc = mdb_txn_renew0(txn);
2678 if (rc == MDB_SUCCESS) {
2679 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2680 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2681 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
2687 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2691 int rc, size, tsize = sizeof(MDB_txn);
2693 if (env->me_flags & MDB_FATAL_ERROR) {
2694 DPUTS("environment had fatal error, must shutdown!");
2697 if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
2700 /* Nested transactions: Max 1 child, write txns only, no writemap */
2701 if (parent->mt_child ||
2702 (flags & MDB_RDONLY) ||
2703 (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) ||
2704 (env->me_flags & MDB_WRITEMAP))
2706 return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
2708 tsize = sizeof(MDB_ntxn);
2711 if (!(flags & MDB_RDONLY)) {
2713 txn = env->me_txn0; /* just reuse preallocated write txn */
2716 /* child txns use own copy of cursors */
2717 size += env->me_maxdbs * sizeof(MDB_cursor *);
2719 size += env->me_maxdbs * (sizeof(MDB_db)+1);
2721 if ((txn = calloc(1, size)) == NULL) {
2722 DPRINTF(("calloc: %s", strerror(errno)));
2725 txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2726 if (flags & MDB_RDONLY) {
2727 txn->mt_flags |= MDB_TXN_RDONLY;
2728 txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
2729 txn->mt_dbiseqs = env->me_dbiseqs;
2731 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2733 txn->mt_dbiseqs = parent->mt_dbiseqs;
2734 txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2736 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
2737 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
2745 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
2746 if (!txn->mt_u.dirty_list ||
2747 !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
2749 free(txn->mt_u.dirty_list);
2753 txn->mt_txnid = parent->mt_txnid;
2754 txn->mt_dirty_room = parent->mt_dirty_room;
2755 txn->mt_u.dirty_list[0].mid = 0;
2756 txn->mt_spill_pgs = NULL;
2757 txn->mt_next_pgno = parent->mt_next_pgno;
2758 parent->mt_child = txn;
2759 txn->mt_parent = parent;
2760 txn->mt_numdbs = parent->mt_numdbs;
2761 txn->mt_flags = parent->mt_flags;
2762 txn->mt_dbxs = parent->mt_dbxs;
2763 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
2764 /* Copy parent's mt_dbflags, but clear DB_NEW */
2765 for (i=0; i<txn->mt_numdbs; i++)
2766 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
2768 ntxn = (MDB_ntxn *)txn;
2769 ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
2770 if (env->me_pghead) {
2771 size = MDB_IDL_SIZEOF(env->me_pghead);
2772 env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
2774 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
2779 rc = mdb_cursor_shadow(parent, txn);
2781 mdb_txn_reset0(txn, "beginchild-fail");
2783 rc = mdb_txn_renew0(txn);
2786 if (txn != env->me_txn0)
2790 DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2791 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2792 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
2799 mdb_txn_env(MDB_txn *txn)
2801 if(!txn) return NULL;
2806 mdb_txn_id(MDB_txn *txn)
2809 return txn->mt_txnid;
2812 /** Export or close DBI handles opened in this txn. */
2814 mdb_dbis_update(MDB_txn *txn, int keep)
2817 MDB_dbi n = txn->mt_numdbs;
2818 MDB_env *env = txn->mt_env;
2819 unsigned char *tdbflags = txn->mt_dbflags;
2821 for (i = n; --i >= 2;) {
2822 if (tdbflags[i] & DB_NEW) {
2824 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2826 char *ptr = env->me_dbxs[i].md_name.mv_data;
2828 env->me_dbxs[i].md_name.mv_data = NULL;
2829 env->me_dbxs[i].md_name.mv_size = 0;
2830 env->me_dbflags[i] = 0;
2831 env->me_dbiseqs[i]++;
2837 if (keep && env->me_numdbs < n)
2841 /** Common code for #mdb_txn_reset() and #mdb_txn_abort().
2842 * May be called twice for readonly txns: First reset it, then abort.
2843 * @param[in] txn the transaction handle to reset
2844 * @param[in] act why the transaction is being reset
2847 mdb_txn_reset0(MDB_txn *txn, const char *act)
2849 MDB_env *env = txn->mt_env;
2851 /* Close any DBI handles opened in this txn */
2852 mdb_dbis_update(txn, 0);
2854 DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2855 act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2856 (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
2858 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
2859 if (txn->mt_u.reader) {
2860 txn->mt_u.reader->mr_txnid = (txnid_t)-1;
2861 if (!(env->me_flags & MDB_NOTLS))
2862 txn->mt_u.reader = NULL; /* txn does not own reader */
2864 txn->mt_numdbs = 0; /* close nothing if called again */
2865 txn->mt_dbxs = NULL; /* mark txn as reset */
2867 pgno_t *pghead = env->me_pghead;
2869 mdb_cursors_close(txn, 0);
2870 if (!(env->me_flags & MDB_WRITEMAP)) {
2871 mdb_dlist_free(txn);
2874 if (!txn->mt_parent) {
2875 if (mdb_midl_shrink(&txn->mt_free_pgs))
2876 env->me_free_pgs = txn->mt_free_pgs;
2878 env->me_pghead = NULL;
2882 /* The writer mutex was locked in mdb_txn_begin. */
2884 UNLOCK_MUTEX(MDB_MUTEX(env, w));
2886 txn->mt_parent->mt_child = NULL;
2887 env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
2888 mdb_midl_free(txn->mt_free_pgs);
2889 mdb_midl_free(txn->mt_spill_pgs);
2890 free(txn->mt_u.dirty_list);
2893 mdb_midl_free(pghead);
2898 mdb_txn_reset(MDB_txn *txn)
2903 /* This call is only valid for read-only txns */
2904 if (!(txn->mt_flags & MDB_TXN_RDONLY))
2907 mdb_txn_reset0(txn, "reset");
2911 mdb_txn_abort(MDB_txn *txn)
2917 mdb_txn_abort(txn->mt_child);
2919 mdb_txn_reset0(txn, "abort");
2920 /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */
2921 if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
2922 txn->mt_u.reader->mr_pid = 0;
2924 if (txn != txn->mt_env->me_txn0)
2928 /** Save the freelist as of this transaction to the freeDB.
2929 * This changes the freelist. Keep trying until it stabilizes.
2932 mdb_freelist_save(MDB_txn *txn)
2934 /* env->me_pghead[] can grow and shrink during this call.
2935 * env->me_pglast and txn->mt_free_pgs[] can only grow.
2936 * Page numbers cannot disappear from txn->mt_free_pgs[].
2939 MDB_env *env = txn->mt_env;
2940 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
2941 txnid_t pglast = 0, head_id = 0;
2942 pgno_t freecnt = 0, *free_pgs, *mop;
2943 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
2945 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
2947 if (env->me_pghead) {
2948 /* Make sure first page of freeDB is touched and on freelist */
2949 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
2950 if (rc && rc != MDB_NOTFOUND)
2954 if (!env->me_pghead && txn->mt_loose_pgs) {
2955 /* Put loose page numbers in mt_free_pgs, since
2956 * we may be unable to return them to me_pghead.
2958 MDB_page *mp = txn->mt_loose_pgs;
2959 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
2961 for (; mp; mp = NEXT_LOOSE_PAGE(mp))
2962 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2963 txn->mt_loose_pgs = NULL;
2964 txn->mt_loose_count = 0;
2967 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2968 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2969 ? SSIZE_MAX : maxfree_1pg;
2972 /* Come back here after each Put() in case freelist changed */
2977 /* If using records from freeDB which we have not yet
2978 * deleted, delete them and any we reserved for me_pghead.
2980 while (pglast < env->me_pglast) {
2981 rc = mdb_cursor_first(&mc, &key, NULL);
2984 pglast = head_id = *(txnid_t *)key.mv_data;
2985 total_room = head_room = 0;
2986 mdb_tassert(txn, pglast <= env->me_pglast);
2987 rc = mdb_cursor_del(&mc, 0);
2992 /* Save the IDL of pages freed by this txn, to a single record */
2993 if (freecnt < txn->mt_free_pgs[0]) {
2995 /* Make sure last page of freeDB is touched and on freelist */
2996 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
2997 if (rc && rc != MDB_NOTFOUND)
3000 free_pgs = txn->mt_free_pgs;
3001 /* Write to last page of freeDB */
3002 key.mv_size = sizeof(txn->mt_txnid);
3003 key.mv_data = &txn->mt_txnid;
3005 freecnt = free_pgs[0];
3006 data.mv_size = MDB_IDL_SIZEOF(free_pgs);
3007 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3010 /* Retry if mt_free_pgs[] grew during the Put() */
3011 free_pgs = txn->mt_free_pgs;
3012 } while (freecnt < free_pgs[0]);
3013 mdb_midl_sort(free_pgs);
3014 memcpy(data.mv_data, free_pgs, data.mv_size);
3017 unsigned int i = free_pgs[0];
3018 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
3019 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
3021 DPRINTF(("IDL %"Z"u", free_pgs[i]));
3027 mop = env->me_pghead;
3028 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
3030 /* Reserve records for me_pghead[]. Split it if multi-page,
3031 * to avoid searching freeDB for a page range. Use keys in
3032 * range [1,me_pglast]: Smaller than txnid of oldest reader.
3034 if (total_room >= mop_len) {
3035 if (total_room == mop_len || --more < 0)
3037 } else if (head_room >= maxfree_1pg && head_id > 1) {
3038 /* Keep current record (overflow page), add a new one */
3042 /* (Re)write {key = head_id, IDL length = head_room} */
3043 total_room -= head_room;
3044 head_room = mop_len - total_room;
3045 if (head_room > maxfree_1pg && head_id > 1) {
3046 /* Overflow multi-page for part of me_pghead */
3047 head_room /= head_id; /* amortize page sizes */
3048 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
3049 } else if (head_room < 0) {
3050 /* Rare case, not bothering to delete this record */
3053 key.mv_size = sizeof(head_id);
3054 key.mv_data = &head_id;
3055 data.mv_size = (head_room + 1) * sizeof(pgno_t);
3056 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3059 /* IDL is initially empty, zero out at least the length */
3060 pgs = (pgno_t *)data.mv_data;
3061 j = head_room > clean_limit ? head_room : 0;
3065 total_room += head_room;
3068 /* Return loose page numbers to me_pghead, though usually none are
3069 * left at this point. The pages themselves remain in dirty_list.
3071 if (txn->mt_loose_pgs) {
3072 MDB_page *mp = txn->mt_loose_pgs;
3073 unsigned count = txn->mt_loose_count;
3075 /* Room for loose pages + temp IDL with same */
3076 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
3078 mop = env->me_pghead;
3079 loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
3080 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
3081 loose[ ++count ] = mp->mp_pgno;
3083 mdb_midl_sort(loose);
3084 mdb_midl_xmerge(mop, loose);
3085 txn->mt_loose_pgs = NULL;
3086 txn->mt_loose_count = 0;
3090 /* Fill in the reserved me_pghead records */
3096 rc = mdb_cursor_first(&mc, &key, &data);
3097 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
3098 txnid_t id = *(txnid_t *)key.mv_data;
3099 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
3102 mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
3104 if (len > mop_len) {
3106 data.mv_size = (len + 1) * sizeof(MDB_ID);
3108 data.mv_data = mop -= len;
3111 rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
3113 if (rc || !(mop_len -= len))
3120 /** Flush (some) dirty pages to the map, after clearing their dirty flag.
3121 * @param[in] txn the transaction that's being committed
3122 * @param[in] keep number of initial pages in dirty_list to keep dirty.
3123 * @return 0 on success, non-zero on failure.
3126 mdb_page_flush(MDB_txn *txn, int keep)
3128 MDB_env *env = txn->mt_env;
3129 MDB_ID2L dl = txn->mt_u.dirty_list;
3130 unsigned psize = env->me_psize, j;
3131 int i, pagecount = dl[0].mid, rc;
3132 size_t size = 0, pos = 0;
3134 MDB_page *dp = NULL;
3138 struct iovec iov[MDB_COMMIT_PAGES];
3139 ssize_t wpos = 0, wsize = 0, wres;
3140 size_t next_pos = 1; /* impossible pos, so pos != next_pos */
3146 if (env->me_flags & MDB_WRITEMAP) {
3147 /* Clear dirty flags */
3148 while (++i <= pagecount) {
3150 /* Don't flush this page yet */
3151 if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3152 dp->mp_flags &= ~P_KEEP;
3156 dp->mp_flags &= ~P_DIRTY;
3161 /* Write the pages */
3163 if (++i <= pagecount) {
3165 /* Don't flush this page yet */
3166 if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3167 dp->mp_flags &= ~P_KEEP;
3172 /* clear dirty flag */
3173 dp->mp_flags &= ~P_DIRTY;
3176 if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
3181 /* Windows actually supports scatter/gather I/O, but only on
3182 * unbuffered file handles. Since we're relying on the OS page
3183 * cache for all our data, that's self-defeating. So we just
3184 * write pages one at a time. We use the ov structure to set
3185 * the write offset, to at least save the overhead of a Seek
3188 DPRINTF(("committing page %"Z"u", pgno));
3189 memset(&ov, 0, sizeof(ov));
3190 ov.Offset = pos & 0xffffffff;
3191 ov.OffsetHigh = pos >> 16 >> 16;
3192 if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
3194 DPRINTF(("WriteFile: %d", rc));
3198 /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
3199 if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
3201 /* Write previous page(s) */
3202 #ifdef MDB_USE_PWRITEV
3203 wres = pwritev(env->me_fd, iov, n, wpos);
3206 wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
3208 if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
3210 DPRINTF(("lseek: %s", strerror(rc)));
3213 wres = writev(env->me_fd, iov, n);
3216 if (wres != wsize) {
3219 DPRINTF(("Write error: %s", strerror(rc)));
3221 rc = EIO; /* TODO: Use which error code? */
3222 DPUTS("short write, filesystem full?");
3233 DPRINTF(("committing page %"Z"u", pgno));
3234 next_pos = pos + size;
3235 iov[n].iov_len = size;
3236 iov[n].iov_base = (char *)dp;
3242 /* MIPS has cache coherency issues, this is a no-op everywhere else
3243 * Note: for any size >= on-chip cache size, entire on-chip cache is
3246 CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
3248 for (i = keep; ++i <= pagecount; ) {
3250 /* This is a page we skipped above */
3253 dl[j].mid = dp->mp_pgno;
3256 mdb_dpage_free(env, dp);
3261 txn->mt_dirty_room += i - j;
3267 mdb_txn_commit(MDB_txn *txn)
3273 if (txn == NULL || txn->mt_env == NULL)
3276 if (txn->mt_child) {
3277 rc = mdb_txn_commit(txn->mt_child);
3278 txn->mt_child = NULL;
3285 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
3286 mdb_dbis_update(txn, 1);
3287 txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
3292 if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
3293 DPUTS("error flag is set, can't commit");
3295 txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
3300 if (txn->mt_parent) {
3301 MDB_txn *parent = txn->mt_parent;
3305 unsigned x, y, len, ps_len;
3307 /* Append our free list to parent's */
3308 rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
3311 mdb_midl_free(txn->mt_free_pgs);
3312 /* Failures after this must either undo the changes
3313 * to the parent or set MDB_TXN_ERROR in the parent.
3316 parent->mt_next_pgno = txn->mt_next_pgno;
3317 parent->mt_flags = txn->mt_flags;
3319 /* Merge our cursors into parent's and close them */
3320 mdb_cursors_close(txn, 1);
3322 /* Update parent's DB table. */
3323 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
3324 parent->mt_numdbs = txn->mt_numdbs;
3325 parent->mt_dbflags[0] = txn->mt_dbflags[0];
3326 parent->mt_dbflags[1] = txn->mt_dbflags[1];
3327 for (i=2; i<txn->mt_numdbs; i++) {
3328 /* preserve parent's DB_NEW status */
3329 x = parent->mt_dbflags[i] & DB_NEW;
3330 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
3333 dst = parent->mt_u.dirty_list;
3334 src = txn->mt_u.dirty_list;
3335 /* Remove anything in our dirty list from parent's spill list */
3336 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
3338 pspill[0] = (pgno_t)-1;
3339 /* Mark our dirty pages as deleted in parent spill list */
3340 for (i=0, len=src[0].mid; ++i <= len; ) {
3341 MDB_ID pn = src[i].mid << 1;
3342 while (pn > pspill[x])
3344 if (pn == pspill[x]) {
3349 /* Squash deleted pagenums if we deleted any */
3350 for (x=y; ++x <= ps_len; )
3351 if (!(pspill[x] & 1))
3352 pspill[++y] = pspill[x];
3356 /* Find len = length of merging our dirty list with parent's */
3358 dst[0].mid = 0; /* simplify loops */
3359 if (parent->mt_parent) {
3360 len = x + src[0].mid;
3361 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
3362 for (i = x; y && i; y--) {
3363 pgno_t yp = src[y].mid;
3364 while (yp < dst[i].mid)
3366 if (yp == dst[i].mid) {
3371 } else { /* Simplify the above for single-ancestor case */
3372 len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
3374 /* Merge our dirty list with parent's */
3376 for (i = len; y; dst[i--] = src[y--]) {
3377 pgno_t yp = src[y].mid;
3378 while (yp < dst[x].mid)
3379 dst[i--] = dst[x--];
3380 if (yp == dst[x].mid)
3381 free(dst[x--].mptr);
3383 mdb_tassert(txn, i == x);
3385 free(txn->mt_u.dirty_list);
3386 parent->mt_dirty_room = txn->mt_dirty_room;
3387 if (txn->mt_spill_pgs) {
3388 if (parent->mt_spill_pgs) {
3389 /* TODO: Prevent failure here, so parent does not fail */
3390 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
3392 parent->mt_flags |= MDB_TXN_ERROR;
3393 mdb_midl_free(txn->mt_spill_pgs);
3394 mdb_midl_sort(parent->mt_spill_pgs);
3396 parent->mt_spill_pgs = txn->mt_spill_pgs;
3400 /* Append our loose page list to parent's */
3401 for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
3403 *lp = txn->mt_loose_pgs;
3404 parent->mt_loose_count += txn->mt_loose_count;
3406 parent->mt_child = NULL;
3407 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3412 if (txn != env->me_txn) {
3413 DPUTS("attempt to commit unknown transaction");
3418 mdb_cursors_close(txn, 0);
3420 if (!txn->mt_u.dirty_list[0].mid &&
3421 !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
3424 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
3425 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
3427 /* Update DB root pointers */
3428 if (txn->mt_numdbs > 2) {
3432 data.mv_size = sizeof(MDB_db);
3434 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3435 for (i = 2; i < txn->mt_numdbs; i++) {
3436 if (txn->mt_dbflags[i] & DB_DIRTY) {
3437 if (TXN_DBI_CHANGED(txn, i)) {
3441 data.mv_data = &txn->mt_dbs[i];
3442 rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
3449 rc = mdb_freelist_save(txn);
3453 mdb_midl_free(env->me_pghead);
3454 env->me_pghead = NULL;
3455 if (mdb_midl_shrink(&txn->mt_free_pgs))
3456 env->me_free_pgs = txn->mt_free_pgs;
3463 #ifdef HAVE_FDATASYNC
3464 if (txn->mt_next_pgno * env->me_psize > env->me_size) {
3466 env->me_size = txn->mt_next_pgno * env->me_psize;
3469 if ((rc = mdb_page_flush(txn, 0)) ||
3470 (rc = mdb_env_sync(env, i)) ||
3471 (rc = mdb_env_write_meta(txn)))
3474 /* Free P_LOOSE pages left behind in dirty_list */
3475 if (!(env->me_flags & MDB_WRITEMAP))
3476 mdb_dlist_free(txn);
3481 mdb_dbis_update(txn, 1);
3484 UNLOCK_MUTEX(MDB_MUTEX(env, w));
3485 if (txn != env->me_txn0)
3495 /** Read the environment parameters of a DB environment before
3496 * mapping it into memory.
3497 * @param[in] env the environment handle
3498 * @param[out] meta address of where to store the meta information
3499 * @return 0 on success, non-zero on failure.
3502 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3508 enum { Size = sizeof(pbuf) };
3510 /* We don't know the page size yet, so use a minimum value.
3511 * Read both meta pages so we can use the latest one.
3514 for (i=off=0; i<2; i++, off = meta->mm_psize) {
3518 memset(&ov, 0, sizeof(ov));
3520 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
3521 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
3524 rc = pread(env->me_fd, &pbuf, Size, off);
3527 if (rc == 0 && off == 0)
3529 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
3530 DPRINTF(("read: %s", mdb_strerror(rc)));
3534 p = (MDB_page *)&pbuf;
3536 if (!F_ISSET(p->mp_flags, P_META)) {
3537 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno));
3542 if (m->mm_magic != MDB_MAGIC) {
3543 DPUTS("meta has invalid magic");
3547 if (m->mm_version != MDB_DATA_VERSION) {
3548 DPRINTF(("database is version %u, expected version %u",
3549 m->mm_version, MDB_DATA_VERSION));
3550 return MDB_VERSION_MISMATCH;
3553 if (off == 0 || m->mm_txnid > meta->mm_txnid)
3559 /** Fill in most of the zeroed #MDB_meta for an empty database environment */
3561 mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
3563 meta->mm_magic = MDB_MAGIC;
3564 meta->mm_version = MDB_DATA_VERSION;
3565 meta->mm_mapsize = env->me_mapsize;
3566 meta->mm_psize = env->me_psize;
3567 meta->mm_last_pg = 1;
3568 meta->mm_flags = env->me_flags & 0xffff;
3569 meta->mm_flags |= MDB_INTEGERKEY;
3570 meta->mm_dbs[0].md_root = P_INVALID;
3571 meta->mm_dbs[1].md_root = P_INVALID;
3574 /** Write the environment parameters of a freshly created DB environment.
3575 * @param[in] env the environment handle
3576 * @param[in] meta the #MDB_meta to write
3577 * @return 0 on success, non-zero on failure.
3580 mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3588 memset(&ov, 0, sizeof(ov));
3589 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3591 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0)
3594 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3595 len = pwrite(fd, ptr, size, pos); \
3596 rc = (len >= 0); } while(0)
3599 DPUTS("writing new meta page");
3601 psize = env->me_psize;
3603 p = calloc(2, psize);
3605 p->mp_flags = P_META;
3606 *(MDB_meta *)METADATA(p) = *meta;
3608 q = (MDB_page *)((char *)p + psize);
3610 q->mp_flags = P_META;
3611 *(MDB_meta *)METADATA(q) = *meta;
3613 DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0);
3616 else if ((unsigned) len == psize * 2)
3624 /** Update the environment info to commit a transaction.
3625 * @param[in] txn the transaction that's being committed
3626 * @return 0 on success, non-zero on failure.
3629 mdb_env_write_meta(MDB_txn *txn)
3632 MDB_meta meta, metab, *mp;
3635 int rc, len, toggle;
3644 toggle = txn->mt_txnid & 1;
3645 DPRINTF(("writing meta page %d for root page %"Z"u",
3646 toggle, txn->mt_dbs[MAIN_DBI].md_root));
3649 mp = env->me_metas[toggle];
3650 mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
3651 /* Persist any increases of mapsize config */
3652 if (mapsize < env->me_mapsize)
3653 mapsize = env->me_mapsize;
3655 if (env->me_flags & MDB_WRITEMAP) {
3656 mp->mm_mapsize = mapsize;
3657 mp->mm_dbs[0] = txn->mt_dbs[0];
3658 mp->mm_dbs[1] = txn->mt_dbs[1];
3659 mp->mm_last_pg = txn->mt_next_pgno - 1;
3660 #if !(defined(_MSC_VER) || defined(__i386__) || defined(__x86_64__))
3661 /* LY: issue a memory barrier, if not x86. ITS#7969 */
3662 __sync_synchronize();
3664 mp->mm_txnid = txn->mt_txnid;
3665 if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3666 unsigned meta_size = env->me_psize;
3667 rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
3670 #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
3671 if (meta_size < env->me_os_psize)
3672 meta_size += meta_size;
3677 if (MDB_MSYNC(ptr, meta_size, rc)) {
3684 metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
3685 metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
3687 meta.mm_mapsize = mapsize;
3688 meta.mm_dbs[0] = txn->mt_dbs[0];
3689 meta.mm_dbs[1] = txn->mt_dbs[1];
3690 meta.mm_last_pg = txn->mt_next_pgno - 1;
3691 meta.mm_txnid = txn->mt_txnid;
3693 off = offsetof(MDB_meta, mm_mapsize);
3694 ptr = (char *)&meta + off;
3695 len = sizeof(MDB_meta) - off;
3697 off += env->me_psize;
3700 /* Write to the SYNC fd */
3701 mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ?
3702 env->me_fd : env->me_mfd;
3705 memset(&ov, 0, sizeof(ov));
3707 if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
3711 rc = pwrite(mfd, ptr, len, off);
3714 rc = rc < 0 ? ErrCode() : EIO;
3715 DPUTS("write failed, disk error?");
3716 /* On a failure, the pagecache still contains the new data.
3717 * Write some old data back, to prevent it from being used.
3718 * Use the non-SYNC fd; we know it will fail anyway.
3720 meta.mm_last_pg = metab.mm_last_pg;
3721 meta.mm_txnid = metab.mm_txnid;
3723 memset(&ov, 0, sizeof(ov));
3725 WriteFile(env->me_fd, ptr, len, NULL, &ov);
3727 r2 = pwrite(env->me_fd, ptr, len, off);
3728 (void)r2; /* Silence warnings. We don't care about pwrite's return value */
3731 env->me_flags |= MDB_FATAL_ERROR;
3734 /* MIPS has cache coherency issues, this is a no-op everywhere else */
3735 CACHEFLUSH(env->me_map + off, len, DCACHE);
3737 /* Memory ordering issues are irrelevant; since the entire writer
3738 * is wrapped by wmutex, all of these changes will become visible
3739 * after the wmutex is unlocked. Since the DB is multi-version,
3740 * readers will get consistent data regardless of how fresh or
3741 * how stale their view of these values is.
3744 env->me_txns->mti_txnid = txn->mt_txnid;
3749 /** Check both meta pages to see which one is newer.
3750 * @param[in] env the environment handle
3751 * @return meta toggle (0 or 1).
3754 mdb_env_pick_meta(const MDB_env *env)
3756 return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
3760 mdb_env_create(MDB_env **env)
3764 e = calloc(1, sizeof(MDB_env));
3768 e->me_maxreaders = DEFAULT_READERS;
3769 e->me_maxdbs = e->me_numdbs = 2;
3770 e->me_fd = INVALID_HANDLE_VALUE;
3771 e->me_lfd = INVALID_HANDLE_VALUE;
3772 e->me_mfd = INVALID_HANDLE_VALUE;
3773 #ifdef MDB_USE_SYSV_SEM
3774 e->me_rmutex.semid = -1;
3775 e->me_wmutex.semid = -1;
3777 e->me_pid = getpid();
3778 GET_PAGESIZE(e->me_os_psize);
3779 VGMEMP_CREATE(e,0,0);
3785 mdb_env_map(MDB_env *env, void *addr)
3788 unsigned int flags = env->me_flags;
3792 LONG sizelo, sizehi;
3795 if (flags & MDB_RDONLY) {
3796 /* Don't set explicit map size, use whatever exists */
3801 msize = env->me_mapsize;
3802 sizelo = msize & 0xffffffff;
3803 sizehi = msize >> 16 >> 16; /* only needed on Win64 */
3805 /* Windows won't create mappings for zero length files.
3806 * and won't map more than the file size.
3807 * Just set the maxsize right now.
3809 if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
3810 || !SetEndOfFile(env->me_fd)
3811 || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)
3815 mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
3816 PAGE_READWRITE : PAGE_READONLY,
3817 sizehi, sizelo, NULL);
3820 env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
3821 FILE_MAP_WRITE : FILE_MAP_READ,
3823 rc = env->me_map ? 0 : ErrCode();
3828 int prot = PROT_READ;
3829 if (flags & MDB_WRITEMAP) {
3831 if (ftruncate(env->me_fd, env->me_mapsize) < 0)
3834 env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
3836 if (env->me_map == MAP_FAILED) {
3841 if (flags & MDB_NORDAHEAD) {
3842 /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3844 madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3846 #ifdef POSIX_MADV_RANDOM
3847 posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3848 #endif /* POSIX_MADV_RANDOM */
3849 #endif /* MADV_RANDOM */
3853 /* Can happen because the address argument to mmap() is just a
3854 * hint. mmap() can pick another, e.g. if the range is in use.
3855 * The MAP_FIXED flag would prevent that, but then mmap could
3856 * instead unmap existing pages to make room for the new map.
3858 if (addr && env->me_map != addr)
3859 return EBUSY; /* TODO: Make a new MDB_* error code? */
3861 p = (MDB_page *)env->me_map;
3862 env->me_metas[0] = METADATA(p);
3863 env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);
3869 mdb_env_set_mapsize(MDB_env *env, size_t size)
3871 /* If env is already open, caller is responsible for making
3872 * sure there are no active txns.
3880 meta = env->me_metas[mdb_env_pick_meta(env)];
3882 size = meta->mm_mapsize;
3884 /* Silently round up to minimum if the size is too small */
3885 size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
3889 munmap(env->me_map, env->me_mapsize);
3890 env->me_mapsize = size;
3891 old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
3892 rc = mdb_env_map(env, old);
3896 env->me_mapsize = size;
3898 env->me_maxpg = env->me_mapsize / env->me_psize;
3903 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3907 env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */
3912 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3914 if (env->me_map || readers < 1)
3916 env->me_maxreaders = readers;
3921 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3923 if (!env || !readers)
3925 *readers = env->me_maxreaders;
3930 mdb_fsize(HANDLE fd, size_t *size)
3933 LARGE_INTEGER fsize;
3935 if (!GetFileSizeEx(fd, &fsize))
3938 *size = fsize.QuadPart;
3950 /** Further setup required for opening an LMDB environment
3953 mdb_env_open2(MDB_env *env)
3955 unsigned int flags = env->me_flags;
3956 int i, newenv = 0, rc;
3960 /* See if we should use QueryLimited */
3962 if ((rc & 0xff) > 5)
3963 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
3965 env->me_pidquery = PROCESS_QUERY_INFORMATION;
3968 if ((i = mdb_env_read_header(env, &meta)) != 0) {
3971 DPUTS("new mdbenv");
3973 env->me_psize = env->me_os_psize;
3974 if (env->me_psize > MAX_PAGESIZE)
3975 env->me_psize = MAX_PAGESIZE;
3976 memset(&meta, 0, sizeof(meta));
3977 mdb_env_init_meta0(env, &meta);
3978 meta.mm_mapsize = DEFAULT_MAPSIZE;
3980 env->me_psize = meta.mm_psize;
3983 /* Was a mapsize configured? */
3984 if (!env->me_mapsize) {
3985 env->me_mapsize = meta.mm_mapsize;
3988 /* Make sure mapsize >= committed data size. Even when using
3989 * mm_mapsize, which could be broken in old files (ITS#7789).
3991 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
3992 if (env->me_mapsize < minsize)
3993 env->me_mapsize = minsize;
3995 meta.mm_mapsize = env->me_mapsize;
3997 if (newenv && !(flags & MDB_FIXEDMAP)) {
3998 /* mdb_env_map() may grow the datafile. Write the metapages
3999 * first, so the file will be valid if initialization fails.
4000 * Except with FIXEDMAP, since we do not yet know mm_address.
4001 * We could fill in mm_address later, but then a different
4002 * program might end up doing that - one with a memory layout
4003 * and map address which does not suit the main program.
4005 rc = mdb_env_init_meta(env, &meta);
4011 rc = mdb_fsize(env->me_fd, &env->me_size);
4015 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
4020 if (flags & MDB_FIXEDMAP)
4021 meta.mm_address = env->me_map;
4022 i = mdb_env_init_meta(env, &meta);
4023 if (i != MDB_SUCCESS) {
4028 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
4029 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2)
4031 #if !(MDB_MAXKEYSIZE)
4032 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
4034 env->me_maxpg = env->me_mapsize / env->me_psize;
4038 int toggle = mdb_env_pick_meta(env);
4039 MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI];
4041 DPRINTF(("opened database version %u, pagesize %u",
4042 env->me_metas[0]->mm_version, env->me_psize));
4043 DPRINTF(("using meta page %d", toggle));
4044 DPRINTF(("depth: %u", db->md_depth));
4045 DPRINTF(("entries: %"Z"u", db->md_entries));
4046 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages));
4047 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages));
4048 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages));
4049 DPRINTF(("root: %"Z"u", db->md_root));
4057 /** Release a reader thread's slot in the reader lock table.
4058 * This function is called automatically when a thread exits.
4059 * @param[in] ptr This points to the slot in the reader lock table.
4062 mdb_env_reader_dest(void *ptr)
4064 MDB_reader *reader = ptr;
4070 /** Junk for arranging thread-specific callbacks on Windows. This is
4071 * necessarily platform and compiler-specific. Windows supports up
4072 * to 1088 keys. Let's assume nobody opens more than 64 environments
4073 * in a single process, for now. They can override this if needed.
4075 #ifndef MAX_TLS_KEYS
4076 #define MAX_TLS_KEYS 64
4078 static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
4079 static int mdb_tls_nkeys;
4081 static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
4085 case DLL_PROCESS_ATTACH: break;
4086 case DLL_THREAD_ATTACH: break;
4087 case DLL_THREAD_DETACH:
4088 for (i=0; i<mdb_tls_nkeys; i++) {
4089 MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
4091 mdb_env_reader_dest(r);
4095 case DLL_PROCESS_DETACH: break;
4100 const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4102 PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4106 /* Force some symbol references.
4107 * _tls_used forces the linker to create the TLS directory if not already done
4108 * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
4110 #pragma comment(linker, "/INCLUDE:_tls_used")
4111 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
4112 #pragma const_seg(".CRT$XLB")
4113 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
4114 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4117 #pragma comment(linker, "/INCLUDE:__tls_used")
4118 #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
4119 #pragma data_seg(".CRT$XLB")
4120 PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4122 #endif /* WIN 32/64 */
4123 #endif /* !__GNUC__ */
4126 /** Downgrade the exclusive lock on the region back to shared */
4128 mdb_env_share_locks(MDB_env *env, int *excl)
4130 int rc = 0, toggle = mdb_env_pick_meta(env);
4132 env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
4137 /* First acquire a shared lock. The Unlock will
4138 * then release the existing exclusive lock.
4140 memset(&ov, 0, sizeof(ov));
4141 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
4144 UnlockFile(env->me_lfd, 0, 0, 1, 0);
4150 struct flock lock_info;
4151 /* The shared lock replaces the existing lock */
4152 memset((void *)&lock_info, 0, sizeof(lock_info));
4153 lock_info.l_type = F_RDLCK;
4154 lock_info.l_whence = SEEK_SET;
4155 lock_info.l_start = 0;
4156 lock_info.l_len = 1;
4157 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4158 (rc = ErrCode()) == EINTR) ;
4159 *excl = rc ? -1 : 0; /* error may mean we lost the lock */
4166 /** Try to get exclusive lock, otherwise shared.
4167 * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4170 mdb_env_excl_lock(MDB_env *env, int *excl)
4174 if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
4178 memset(&ov, 0, sizeof(ov));
4179 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
4186 struct flock lock_info;
4187 memset((void *)&lock_info, 0, sizeof(lock_info));
4188 lock_info.l_type = F_WRLCK;
4189 lock_info.l_whence = SEEK_SET;
4190 lock_info.l_start = 0;
4191 lock_info.l_len = 1;
4192 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4193 (rc = ErrCode()) == EINTR) ;
4197 # ifdef MDB_USE_SYSV_SEM
4198 if (*excl < 0) /* always true when !MDB_USE_SYSV_SEM */
4201 lock_info.l_type = F_RDLCK;
4202 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
4203 (rc = ErrCode()) == EINTR) ;
4213 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
4215 * @(#) $Revision: 5.1 $
4216 * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
4217 * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
4219 * http://www.isthe.com/chongo/tech/comp/fnv/index.html
4223 * Please do not copyright this code. This code is in the public domain.
4225 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
4226 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
4227 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
4228 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
4229 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
4230 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
4231 * PERFORMANCE OF THIS SOFTWARE.
4234 * chongo <Landon Curt Noll> /\oo/\
4235 * http://www.isthe.com/chongo/
4237 * Share and Enjoy! :-)
4240 typedef unsigned long long mdb_hash_t;
4241 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
4243 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
4244 * @param[in] val value to hash
4245 * @param[in] hval initial value for hash
4246 * @return 64 bit hash
4248 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
4249 * hval arg on the first call.
4252 mdb_hash_val(MDB_val *val, mdb_hash_t hval)
4254 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */
4255 unsigned char *end = s + val->mv_size;
4257 * FNV-1a hash each octet of the string
4260 /* xor the bottom with the current octet */
4261 hval ^= (mdb_hash_t)*s++;
4263 /* multiply by the 64 bit FNV magic prime mod 2^64 */
4264 hval += (hval << 1) + (hval << 4) + (hval << 5) +
4265 (hval << 7) + (hval << 8) + (hval << 40);
4267 /* return our new hash value */
4271 /** Hash the string and output the encoded hash.
4272 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
4273 * very short name limits. We don't care about the encoding being reversible,
4274 * we just want to preserve as many bits of the input as possible in a
4275 * small printable string.
4276 * @param[in] str string to hash
4277 * @param[out] encbuf an array of 11 chars to hold the hash
4279 static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
4282 mdb_pack85(unsigned long l, char *out)
4286 for (i=0; i<5; i++) {
4287 *out++ = mdb_a85[l % 85];
4293 mdb_hash_enc(MDB_val *val, char *encbuf)
4295 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
4297 mdb_pack85(h, encbuf);
4298 mdb_pack85(h>>32, encbuf+5);
4303 /** Open and/or initialize the lock region for the environment.
4304 * @param[in] env The LMDB environment.
4305 * @param[in] lpath The pathname of the file used for the lock region.
4306 * @param[in] mode The Unix permissions for the file, if we create it.
4307 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
4308 * @return 0 on success, non-zero on failure.
4311 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
4314 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
4316 # define MDB_ERRCODE_ROFS EROFS
4317 #ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */
4318 # define MDB_CLOEXEC O_CLOEXEC
4321 # define MDB_CLOEXEC 0
4328 env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
4329 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
4330 FILE_ATTRIBUTE_NORMAL, NULL);
4332 env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode);
4334 if (env->me_lfd == INVALID_HANDLE_VALUE) {
4336 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
4341 #if ! ((MDB_CLOEXEC) || defined(_WIN32))
4342 /* Lose record locks when exec*() */
4343 if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
4344 fcntl(env->me_lfd, F_SETFD, fdflags);
4347 if (!(env->me_flags & MDB_NOTLS)) {
4348 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
4351 env->me_flags |= MDB_ENV_TXKEY;
4353 /* Windows TLS callbacks need help finding their TLS info. */
4354 if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
4358 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
4362 /* Try to get exclusive lock. If we succeed, then
4363 * nobody is using the lock region and we should initialize it.
4365 if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
4368 size = GetFileSize(env->me_lfd, NULL);
4370 size = lseek(env->me_lfd, 0, SEEK_END);
4371 if (size == -1) goto fail_errno;
4373 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
4374 if (size < rsize && *excl > 0) {
4376 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
4377 || !SetEndOfFile(env->me_lfd))
4380 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
4384 size = rsize - sizeof(MDB_txninfo);
4385 env->me_maxreaders = size/sizeof(MDB_reader) + 1;
4390 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
4392 if (!mh) goto fail_errno;
4393 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
4395 if (!env->me_txns) goto fail_errno;
4397 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
4399 if (m == MAP_FAILED) goto fail_errno;
4405 BY_HANDLE_FILE_INFORMATION stbuf;
4414 if (!mdb_sec_inited) {
4415 InitializeSecurityDescriptor(&mdb_null_sd,
4416 SECURITY_DESCRIPTOR_REVISION);
4417 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
4418 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
4419 mdb_all_sa.bInheritHandle = FALSE;
4420 mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
4423 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
4424 idbuf.volume = stbuf.dwVolumeSerialNumber;
4425 idbuf.nhigh = stbuf.nFileIndexHigh;
4426 idbuf.nlow = stbuf.nFileIndexLow;
4427 val.mv_data = &idbuf;
4428 val.mv_size = sizeof(idbuf);
4429 mdb_hash_enc(&val, encbuf);
4430 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
4431 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
4432 env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
4433 if (!env->me_rmutex) goto fail_errno;
4434 env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
4435 if (!env->me_wmutex) goto fail_errno;
4436 #elif defined(MDB_USE_SYSV_SEM)
4438 unsigned short vals[2] = {1, 1};
4439 int semid = semget(IPC_PRIVATE, 2, mode);
4443 env->me_rmutex.semid = semid;
4444 env->me_wmutex.semid = semid;
4445 env->me_rmutex.semnum = 0;
4446 env->me_wmutex.semnum = 1;
4449 if (semctl(semid, 0, SETALL, semu) < 0)
4451 env->me_txns->mti_semid = semid;
4452 #else /* MDB_USE_SYSV_SEM */
4453 pthread_mutexattr_t mattr;
4455 if ((rc = pthread_mutexattr_init(&mattr))
4456 || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED))
4457 #ifdef MDB_ROBUST_SUPPORTED
4458 || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST))
4460 || (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr))
4461 || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
4463 pthread_mutexattr_destroy(&mattr);
4464 #endif /* _WIN32 || MDB_USE_SYSV_SEM */
4466 env->me_txns->mti_magic = MDB_MAGIC;
4467 env->me_txns->mti_format = MDB_LOCK_FORMAT;
4468 env->me_txns->mti_txnid = 0;
4469 env->me_txns->mti_numreaders = 0;
4472 #ifdef MDB_USE_SYSV_SEM
4473 struct semid_ds buf;
4477 if (env->me_txns->mti_magic != MDB_MAGIC) {
4478 DPUTS("lock region has invalid magic");
4482 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
4483 DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
4484 env->me_txns->mti_format, MDB_LOCK_FORMAT));
4485 rc = MDB_VERSION_MISMATCH;
4489 if (rc && rc != EACCES && rc != EAGAIN) {
4493 env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
4494 if (!env->me_rmutex) goto fail_errno;
4495 env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
4496 if (!env->me_wmutex) goto fail_errno;
4497 #elif defined(MDB_USE_SYSV_SEM)
4498 semid = env->me_txns->mti_semid;
4501 /* check for read access */
4502 if (semctl(semid, 0, IPC_STAT, semu) < 0)
4504 /* check for write access */
4505 if (semctl(semid, 0, IPC_SET, semu) < 0)
4508 env->me_rmutex.semid = semid;
4509 env->me_wmutex.semid = semid;
4510 env->me_rmutex.semnum = 0;
4511 env->me_wmutex.semnum = 1;
4522 /** The name of the lock file in the DB environment */
4523 #define LOCKNAME "/lock.mdb"
4524 /** The name of the data file in the DB environment */
4525 #define DATANAME "/data.mdb"
4526 /** The suffix of the lock file when no subdir is used */
4527 #define LOCKSUFF "-lock"
4528 /** Only a subset of the @ref mdb_env flags can be changed
4529 * at runtime. Changing other flags requires closing the
4530 * environment and re-opening it with the new flags.
4532 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
4533 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
4534 MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
4536 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
4537 # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
4541 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
4543 int oflags, rc, len, excl = -1;
4544 char *lpath, *dpath;
4546 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
4550 if (flags & MDB_NOSUBDIR) {
4551 rc = len + sizeof(LOCKSUFF) + len + 1;
4553 rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
4558 if (flags & MDB_NOSUBDIR) {
4559 dpath = lpath + len + sizeof(LOCKSUFF);
4560 sprintf(lpath, "%s" LOCKSUFF, path);
4561 strcpy(dpath, path);
4563 dpath = lpath + len + sizeof(LOCKNAME);
4564 sprintf(lpath, "%s" LOCKNAME, path);
4565 sprintf(dpath, "%s" DATANAME, path);
4569 flags |= env->me_flags;
4570 if (flags & MDB_RDONLY) {
4571 /* silently ignore WRITEMAP when we're only getting read access */
4572 flags &= ~MDB_WRITEMAP;
4574 if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
4575 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
4578 env->me_flags = flags |= MDB_ENV_ACTIVE;
4582 env->me_path = strdup(path);
4583 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
4584 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
4585 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
4586 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
4591 /* For RDONLY, get lockfile after we know datafile exists */
4592 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
4593 rc = mdb_env_setup_locks(env, lpath, mode, &excl);
4599 if (F_ISSET(flags, MDB_RDONLY)) {
4600 oflags = GENERIC_READ;
4601 len = OPEN_EXISTING;
4603 oflags = GENERIC_READ|GENERIC_WRITE;
4606 mode = FILE_ATTRIBUTE_NORMAL;
4607 env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
4608 NULL, len, mode, NULL);
4610 if (F_ISSET(flags, MDB_RDONLY))
4613 oflags = O_RDWR | O_CREAT;
4615 env->me_fd = open(dpath, oflags, mode);
4617 if (env->me_fd == INVALID_HANDLE_VALUE) {
4622 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
4623 rc = mdb_env_setup_locks(env, lpath, mode, &excl);
4628 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
4629 if (flags & (MDB_RDONLY|MDB_WRITEMAP)) {
4630 env->me_mfd = env->me_fd;
4632 /* Synchronous fd for meta writes. Needed even with
4633 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
4636 len = OPEN_EXISTING;
4637 env->me_mfd = CreateFile(dpath, oflags,
4638 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
4639 mode | FILE_FLAG_WRITE_THROUGH, NULL);
4642 env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
4644 if (env->me_mfd == INVALID_HANDLE_VALUE) {
4649 DPRINTF(("opened dbenv %p", (void *) env));
4651 rc = mdb_env_share_locks(env, &excl);
4655 if (!((flags & MDB_RDONLY) ||
4656 (env->me_pbuf = calloc(1, env->me_psize))))
4658 if (!(flags & MDB_RDONLY)) {
4660 int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
4661 (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1);
4662 txn = calloc(1, size);
4664 txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
4665 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
4666 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
4667 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
4669 txn->mt_dbxs = env->me_dbxs;
4679 mdb_env_close0(env, excl);
4685 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */
4687 mdb_env_close0(MDB_env *env, int excl)
4691 if (!(env->me_flags & MDB_ENV_ACTIVE))
4694 /* Doing this here since me_dbxs may not exist during mdb_env_close */
4695 for (i = env->me_maxdbs; --i > MAIN_DBI; )
4696 free(env->me_dbxs[i].md_name.mv_data);
4699 free(env->me_dbiseqs);
4700 free(env->me_dbflags);
4703 free(env->me_dirty_list);
4705 mdb_midl_free(env->me_free_pgs);
4707 if (env->me_flags & MDB_ENV_TXKEY) {
4708 pthread_key_delete(env->me_txkey);
4710 /* Delete our key from the global list */
4711 for (i=0; i<mdb_tls_nkeys; i++)
4712 if (mdb_tls_keys[i] == env->me_txkey) {
4713 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
4721 munmap(env->me_map, env->me_mapsize);
4723 if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE)
4724 (void) close(env->me_mfd);
4725 if (env->me_fd != INVALID_HANDLE_VALUE)
4726 (void) close(env->me_fd);
4728 MDB_PID_T pid = env->me_pid;
4729 /* Clearing readers is done in this function because
4730 * me_txkey with its destructor must be disabled first.
4732 for (i = env->me_numreaders; --i >= 0; )
4733 if (env->me_txns->mti_readers[i].mr_pid == pid)
4734 env->me_txns->mti_readers[i].mr_pid = 0;
4736 if (env->me_rmutex) {
4737 CloseHandle(env->me_rmutex);
4738 if (env->me_wmutex) CloseHandle(env->me_wmutex);
4740 /* Windows automatically destroys the mutexes when
4741 * the last handle closes.
4743 #elif defined(MDB_USE_SYSV_SEM)
4744 if (env->me_rmutex.semid != -1) {
4745 /* If we have the filelock: If we are the
4746 * only remaining user, clean up semaphores.
4749 mdb_env_excl_lock(env, &excl);
4751 semctl(env->me_rmutex.semid, 0, IPC_RMID);
4754 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
4756 if (env->me_lfd != INVALID_HANDLE_VALUE) {
4759 /* Unlock the lockfile. Windows would have unlocked it
4760 * after closing anyway, but not necessarily at once.
4762 UnlockFile(env->me_lfd, 0, 0, 1, 0);
4765 (void) close(env->me_lfd);
4768 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
4772 mdb_env_close(MDB_env *env)
4779 VGMEMP_DESTROY(env);
4780 while ((dp = env->me_dpages) != NULL) {
4781 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4782 env->me_dpages = dp->mp_next;
4786 mdb_env_close0(env, 0);
4790 /** Compare two items pointing at aligned size_t's */
4792 mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4794 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4795 *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4798 /** Compare two items pointing at aligned unsigned int's */
4800 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4802 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4803 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4806 /** Compare two items pointing at unsigned ints of unknown alignment.
4807 * Nodes and keys are guaranteed to be 2-byte aligned.
4810 mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
4812 #if BYTE_ORDER == LITTLE_ENDIAN
4813 unsigned short *u, *c;
4816 u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4817 c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
4820 } while(!x && u > (unsigned short *)a->mv_data);
4823 unsigned short *u, *c, *end;
4826 end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4827 u = (unsigned short *)a->mv_data;
4828 c = (unsigned short *)b->mv_data;
4831 } while(!x && u < end);
4836 /** Compare two items pointing at size_t's of unknown alignment. */
4837 #ifdef MISALIGNED_OK
4838 # define mdb_cmp_clong mdb_cmp_long
4840 # define mdb_cmp_clong mdb_cmp_cint
4843 /** Compare two items lexically */
4845 mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
4852 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4858 diff = memcmp(a->mv_data, b->mv_data, len);
4859 return diff ? diff : len_diff<0 ? -1 : len_diff;
4862 /** Compare two items in reverse byte order */
4864 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
4866 const unsigned char *p1, *p2, *p1_lim;
4870 p1_lim = (const unsigned char *)a->mv_data;
4871 p1 = (const unsigned char *)a->mv_data + a->mv_size;
4872 p2 = (const unsigned char *)b->mv_data + b->mv_size;
4874 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4880 while (p1 > p1_lim) {
4881 diff = *--p1 - *--p2;
4885 return len_diff<0 ? -1 : len_diff;
4888 /** Search for key within a page, using binary search.
4889 * Returns the smallest entry larger or equal to the key.
4890 * If exactp is non-null, stores whether the found entry was an exact match
4891 * in *exactp (1 or 0).
4892 * Updates the cursor index with the index of the found entry.
4893 * If no entry larger or equal to the key is found, returns NULL.
4896 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
4898 unsigned int i = 0, nkeys;
4901 MDB_page *mp = mc->mc_pg[mc->mc_top];
4902 MDB_node *node = NULL;
4907 nkeys = NUMKEYS(mp);
4909 DPRINTF(("searching %u keys in %s %spage %"Z"u",
4910 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
4913 low = IS_LEAF(mp) ? 0 : 1;
4915 cmp = mc->mc_dbx->md_cmp;
4917 /* Branch pages have no data, so if using integer keys,
4918 * alignment is guaranteed. Use faster mdb_cmp_int.
4920 if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
4921 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
4928 nodekey.mv_size = mc->mc_db->md_pad;
4929 node = NODEPTR(mp, 0); /* fake */
4930 while (low <= high) {
4931 i = (low + high) >> 1;
4932 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
4933 rc = cmp(key, &nodekey);
4934 DPRINTF(("found leaf index %u [%s], rc = %i",
4935 i, DKEY(&nodekey), rc));
4944 while (low <= high) {
4945 i = (low + high) >> 1;
4947 node = NODEPTR(mp, i);
4948 nodekey.mv_size = NODEKSZ(node);
4949 nodekey.mv_data = NODEKEY(node);
4951 rc = cmp(key, &nodekey);
4954 DPRINTF(("found leaf index %u [%s], rc = %i",
4955 i, DKEY(&nodekey), rc));
4957 DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
4958 i, DKEY(&nodekey), NODEPGNO(node), rc));
4969 if (rc > 0) { /* Found entry is less than the key. */
4970 i++; /* Skip to get the smallest entry larger than key. */
4972 node = NODEPTR(mp, i);
4975 *exactp = (rc == 0 && nkeys > 0);
4976 /* store the key index */
4977 mc->mc_ki[mc->mc_top] = i;
4979 /* There is no entry larger or equal to the key. */
4982 /* nodeptr is fake for LEAF2 */
4988 mdb_cursor_adjust(MDB_cursor *mc, func)
4992 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
4993 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
5000 /** Pop a page off the top of the cursor's stack. */
5002 mdb_cursor_pop(MDB_cursor *mc)
5006 MDB_page *top = mc->mc_pg[mc->mc_top];
5012 DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
5013 DDBI(mc), (void *) mc));
5017 /** Push a page onto the top of the cursor's stack. */
5019 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
5021 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
5022 DDBI(mc), (void *) mc));
5024 if (mc->mc_snum >= CURSOR_STACK) {
5025 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
5026 return MDB_CURSOR_FULL;
5029 mc->mc_top = mc->mc_snum++;
5030 mc->mc_pg[mc->mc_top] = mp;
5031 mc->mc_ki[mc->mc_top] = 0;
5036 /** Find the address of the page corresponding to a given page number.
5037 * @param[in] txn the transaction for this access.
5038 * @param[in] pgno the page number for the page to retrieve.
5039 * @param[out] ret address of a pointer where the page's address will be stored.
5040 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
5041 * @return 0 on success, non-zero on failure.
5044 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl)
5046 MDB_env *env = txn->mt_env;
5050 if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) {
5054 MDB_ID2L dl = tx2->mt_u.dirty_list;
5056 /* Spilled pages were dirtied in this txn and flushed
5057 * because the dirty list got full. Bring this page
5058 * back in from the map (but don't unspill it here,
5059 * leave that unless page_touch happens again).
5061 if (tx2->mt_spill_pgs) {
5062 MDB_ID pn = pgno << 1;
5063 x = mdb_midl_search(tx2->mt_spill_pgs, pn);
5064 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
5065 p = (MDB_page *)(env->me_map + env->me_psize * pgno);
5070 unsigned x = mdb_mid2l_search(dl, pgno);
5071 if (x <= dl[0].mid && dl[x].mid == pgno) {
5077 } while ((tx2 = tx2->mt_parent) != NULL);
5080 if (pgno < txn->mt_next_pgno) {
5082 p = (MDB_page *)(env->me_map + env->me_psize * pgno);
5084 DPRINTF(("page %"Z"u not found", pgno));
5085 txn->mt_flags |= MDB_TXN_ERROR;
5086 return MDB_PAGE_NOTFOUND;
5096 /** Finish #mdb_page_search() / #mdb_page_search_lowest().
5097 * The cursor is at the root page, set up the rest of it.
5100 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
5102 MDB_page *mp = mc->mc_pg[mc->mc_top];
5106 while (IS_BRANCH(mp)) {
5110 DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
5111 mdb_cassert(mc, NUMKEYS(mp) > 1);
5112 DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
5114 if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
5116 if (flags & MDB_PS_LAST)
5117 i = NUMKEYS(mp) - 1;
5120 node = mdb_node_search(mc, key, &exact);
5122 i = NUMKEYS(mp) - 1;
5124 i = mc->mc_ki[mc->mc_top];
5126 mdb_cassert(mc, i > 0);
5130 DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
5133 mdb_cassert(mc, i < NUMKEYS(mp));
5134 node = NODEPTR(mp, i);
5136 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
5139 mc->mc_ki[mc->mc_top] = i;
5140 if ((rc = mdb_cursor_push(mc, mp)))
5143 if (flags & MDB_PS_MODIFY) {
5144 if ((rc = mdb_page_touch(mc)) != 0)
5146 mp = mc->mc_pg[mc->mc_top];
5151 DPRINTF(("internal error, index points to a %02X page!?",
5153 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
5154 return MDB_CORRUPTED;
5157 DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
5158 key ? DKEY(key) : "null"));
5159 mc->mc_flags |= C_INITIALIZED;
5160 mc->mc_flags &= ~C_EOF;
5165 /** Search for the lowest key under the current branch page.
5166 * This just bypasses a NUMKEYS check in the current page
5167 * before calling mdb_page_search_root(), because the callers
5168 * are all in situations where the current page is known to
5172 mdb_page_search_lowest(MDB_cursor *mc)
5174 MDB_page *mp = mc->mc_pg[mc->mc_top];
5175 MDB_node *node = NODEPTR(mp, 0);
5178 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
5181 mc->mc_ki[mc->mc_top] = 0;
5182 if ((rc = mdb_cursor_push(mc, mp)))
5184 return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
5187 /** Search for the page a given key should be in.
5188 * Push it and its parent pages on the cursor stack.
5189 * @param[in,out] mc the cursor for this operation.
5190 * @param[in] key the key to search for, or NULL for first/last page.
5191 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
5192 * are touched (updated with new page numbers).
5193 * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
5194 * This is used by #mdb_cursor_first() and #mdb_cursor_last().
5195 * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
5196 * @return 0 on success, non-zero on failure.
5199 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
5204 /* Make sure the txn is still viable, then find the root from
5205 * the txn's db table and set it as the root of the cursor's stack.
5207 if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
5208 DPUTS("transaction has failed, must abort");
5211 /* Make sure we're using an up-to-date root */
5212 if (*mc->mc_dbflag & DB_STALE) {
5214 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5216 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
5217 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
5224 MDB_node *leaf = mdb_node_search(&mc2,
5225 &mc->mc_dbx->md_name, &exact);
5227 return MDB_NOTFOUND;
5228 rc = mdb_node_read(mc->mc_txn, leaf, &data);
5231 memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
5233 /* The txn may not know this DBI, or another process may
5234 * have dropped and recreated the DB with other flags.
5236 if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
5237 return MDB_INCOMPATIBLE;
5238 memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
5240 *mc->mc_dbflag &= ~DB_STALE;
5242 root = mc->mc_db->md_root;
5244 if (root == P_INVALID) { /* Tree is empty. */
5245 DPUTS("tree is empty");
5246 return MDB_NOTFOUND;
5250 mdb_cassert(mc, root > 1);
5251 if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
5252 if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0)
5258 DPRINTF(("db %d root page %"Z"u has flags 0x%X",
5259 DDBI(mc), root, mc->mc_pg[0]->mp_flags));
5261 if (flags & MDB_PS_MODIFY) {
5262 if ((rc = mdb_page_touch(mc)))
5266 if (flags & MDB_PS_ROOTONLY)
5269 return mdb_page_search_root(mc, key, flags);
5273 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
5275 MDB_txn *txn = mc->mc_txn;
5276 pgno_t pg = mp->mp_pgno;
5277 unsigned x = 0, ovpages = mp->mp_pages;
5278 MDB_env *env = txn->mt_env;
5279 MDB_IDL sl = txn->mt_spill_pgs;
5280 MDB_ID pn = pg << 1;
5283 DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
5284 /* If the page is dirty or on the spill list we just acquired it,
5285 * so we should give it back to our current free list, if any.
5286 * Otherwise put it onto the list of pages we freed in this txn.
5288 * Won't create me_pghead: me_pglast must be inited along with it.
5289 * Unsupported in nested txns: They would need to hide the page
5290 * range in ancestor txns' dirty and spilled lists.
5292 if (env->me_pghead &&
5294 ((mp->mp_flags & P_DIRTY) ||
5295 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
5299 MDB_ID2 *dl, ix, iy;
5300 rc = mdb_midl_need(&env->me_pghead, ovpages);
5303 if (!(mp->mp_flags & P_DIRTY)) {
5304 /* This page is no longer spilled */
5311 /* Remove from dirty list */
5312 dl = txn->mt_u.dirty_list;
5314 for (ix = dl[x]; ix.mptr != mp; ix = iy) {
5320 mdb_cassert(mc, x > 1);
5322 dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */
5323 txn->mt_flags |= MDB_TXN_ERROR;
5324 return MDB_CORRUPTED;
5327 if (!(env->me_flags & MDB_WRITEMAP))
5328 mdb_dpage_free(env, mp);
5330 /* Insert in me_pghead */
5331 mop = env->me_pghead;
5332 j = mop[0] + ovpages;
5333 for (i = mop[0]; i && mop[i] < pg; i--)
5339 rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
5343 mc->mc_db->md_overflow_pages -= ovpages;
5347 /** Return the data associated with a given node.
5348 * @param[in] txn The transaction for this operation.
5349 * @param[in] leaf The node being read.
5350 * @param[out] data Updated to point to the node's data.
5351 * @return 0 on success, non-zero on failure.
5354 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
5356 MDB_page *omp; /* overflow page */
5360 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5361 data->mv_size = NODEDSZ(leaf);
5362 data->mv_data = NODEDATA(leaf);
5366 /* Read overflow data.
5368 data->mv_size = NODEDSZ(leaf);
5369 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
5370 if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) {
5371 DPRINTF(("read overflow page %"Z"u failed", pgno));
5374 data->mv_data = METADATA(omp);
5380 mdb_get(MDB_txn *txn, MDB_dbi dbi,
5381 MDB_val *key, MDB_val *data)
5388 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
5390 if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
5393 if (txn->mt_flags & MDB_TXN_ERROR)
5396 mdb_cursor_init(&mc, txn, dbi, &mx);
5397 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
5400 /** Find a sibling for a page.
5401 * Replaces the page at the top of the cursor's stack with the
5402 * specified sibling, if one exists.
5403 * @param[in] mc The cursor for this operation.
5404 * @param[in] move_right Non-zero if the right sibling is requested,
5405 * otherwise the left sibling.
5406 * @return 0 on success, non-zero on failure.
5409 mdb_cursor_sibling(MDB_cursor *mc, int move_right)
5415 if (mc->mc_snum < 2) {
5416 return MDB_NOTFOUND; /* root has no siblings */
5420 DPRINTF(("parent page is page %"Z"u, index %u",
5421 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
5423 if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
5424 : (mc->mc_ki[mc->mc_top] == 0)) {
5425 DPRINTF(("no more keys left, moving to %s sibling",
5426 move_right ? "right" : "left"));
5427 if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
5428 /* undo cursor_pop before returning */
5435 mc->mc_ki[mc->mc_top]++;
5437 mc->mc_ki[mc->mc_top]--;
5438 DPRINTF(("just moving to %s index key %u",
5439 move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
5441 mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
5443 indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5444 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
5445 /* mc will be inconsistent if caller does mc_snum++ as above */
5446 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
5450 mdb_cursor_push(mc, mp);
5452 mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
5457 /** Move the cursor to the next data item. */
5459 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5465 if (mc->mc_flags & C_EOF) {
5466 return MDB_NOTFOUND;
5469 mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5471 mp = mc->mc_pg[mc->mc_top];
5473 if (mc->mc_db->md_flags & MDB_DUPSORT) {
5474 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5475 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5476 if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
5477 rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
5478 if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
5479 if (rc == MDB_SUCCESS)
5480 MDB_GET_KEY(leaf, key);
5485 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5486 if (op == MDB_NEXT_DUP)
5487 return MDB_NOTFOUND;
5491 DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
5492 mdb_dbg_pgno(mp), (void *) mc));
5493 if (mc->mc_flags & C_DEL)
5496 if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
5497 DPUTS("=====> move to next sibling page");
5498 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
5499 mc->mc_flags |= C_EOF;
5502 mp = mc->mc_pg[mc->mc_top];
5503 DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5505 mc->mc_ki[mc->mc_top]++;
5508 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5509 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
5512 key->mv_size = mc->mc_db->md_pad;
5513 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5517 mdb_cassert(mc, IS_LEAF(mp));
5518 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5520 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5521 mdb_xcursor_init1(mc, leaf);
5524 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5527 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5528 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5529 if (rc != MDB_SUCCESS)
5534 MDB_GET_KEY(leaf, key);
5538 /** Move the cursor to the previous data item. */
5540 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5546 mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5548 mp = mc->mc_pg[mc->mc_top];
5550 if (mc->mc_db->md_flags & MDB_DUPSORT) {
5551 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5552 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5553 if (op == MDB_PREV || op == MDB_PREV_DUP) {
5554 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5555 if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5556 if (rc == MDB_SUCCESS) {
5557 MDB_GET_KEY(leaf, key);
5558 mc->mc_flags &= ~C_EOF;
5564 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5565 if (op == MDB_PREV_DUP)
5566 return MDB_NOTFOUND;
5570 DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p",
5571 mdb_dbg_pgno(mp), (void *) mc));
5573 if (mc->mc_ki[mc->mc_top] == 0) {
5574 DPUTS("=====> move to prev sibling page");
5575 if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) {
5578 mp = mc->mc_pg[mc->mc_top];
5579 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
5580 DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5582 mc->mc_ki[mc->mc_top]--;
5584 mc->mc_flags &= ~C_EOF;
5586 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5587 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
5590 key->mv_size = mc->mc_db->md_pad;
5591 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5595 mdb_cassert(mc, IS_LEAF(mp));
5596 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5598 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5599 mdb_xcursor_init1(mc, leaf);
5602 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5605 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5606 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5607 if (rc != MDB_SUCCESS)
5612 MDB_GET_KEY(leaf, key);
5616 /** Set the cursor on a specific data item. */
5618 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5619 MDB_cursor_op op, int *exactp)
5623 MDB_node *leaf = NULL;
5626 if (key->mv_size == 0)
5627 return MDB_BAD_VALSIZE;
5630 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5632 /* See if we're already on the right page */
5633 if (mc->mc_flags & C_INITIALIZED) {
5636 mp = mc->mc_pg[mc->mc_top];
5638 mc->mc_ki[mc->mc_top] = 0;
5639 return MDB_NOTFOUND;
5641 if (mp->mp_flags & P_LEAF2) {
5642 nodekey.mv_size = mc->mc_db->md_pad;
5643 nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
5645 leaf = NODEPTR(mp, 0);
5646 MDB_GET_KEY2(leaf, nodekey);
5648 rc = mc->mc_dbx->md_cmp(key, &nodekey);
5650 /* Probably happens rarely, but first node on the page
5651 * was the one we wanted.
5653 mc->mc_ki[mc->mc_top] = 0;
5660 unsigned int nkeys = NUMKEYS(mp);
5662 if (mp->mp_flags & P_LEAF2) {
5663 nodekey.mv_data = LEAF2KEY(mp,
5664 nkeys-1, nodekey.mv_size);
5666 leaf = NODEPTR(mp, nkeys-1);
5667 MDB_GET_KEY2(leaf, nodekey);
5669 rc = mc->mc_dbx->md_cmp(key, &nodekey);
5671 /* last node was the one we wanted */
5672 mc->mc_ki[mc->mc_top] = nkeys-1;
5678 if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
5679 /* This is definitely the right page, skip search_page */
5680 if (mp->mp_flags & P_LEAF2) {
5681 nodekey.mv_data = LEAF2KEY(mp,
5682 mc->mc_ki[mc->mc_top], nodekey.mv_size);
5684 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5685 MDB_GET_KEY2(leaf, nodekey);
5687 rc = mc->mc_dbx->md_cmp(key, &nodekey);
5689 /* current node was the one we wanted */
5699 /* If any parents have right-sibs, search.
5700 * Otherwise, there's nothing further.
5702 for (i=0; i<mc->mc_top; i++)
5704 NUMKEYS(mc->mc_pg[i])-1)
5706 if (i == mc->mc_top) {
5707 /* There are no other pages */
5708 mc->mc_ki[mc->mc_top] = nkeys;
5709 return MDB_NOTFOUND;
5713 /* There are no other pages */
5714 mc->mc_ki[mc->mc_top] = 0;
5715 if (op == MDB_SET_RANGE && !exactp) {
5719 return MDB_NOTFOUND;
5723 rc = mdb_page_search(mc, key, 0);
5724 if (rc != MDB_SUCCESS)
5727 mp = mc->mc_pg[mc->mc_top];
5728 mdb_cassert(mc, IS_LEAF(mp));
5731 leaf = mdb_node_search(mc, key, exactp);
5732 if (exactp != NULL && !*exactp) {
5733 /* MDB_SET specified and not an exact match. */
5734 return MDB_NOTFOUND;
5738 DPUTS("===> inexact leaf not found, goto sibling");
5739 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
5740 return rc; /* no entries matched */
5741 mp = mc->mc_pg[mc->mc_top];
5742 mdb_cassert(mc, IS_LEAF(mp));
5743 leaf = NODEPTR(mp, 0);
5747 mc->mc_flags |= C_INITIALIZED;
5748 mc->mc_flags &= ~C_EOF;
5751 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) {
5752 key->mv_size = mc->mc_db->md_pad;
5753 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5758 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5759 mdb_xcursor_init1(mc, leaf);
5762 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5763 if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) {
5764 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5767 if (op == MDB_GET_BOTH) {
5773 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
5774 if (rc != MDB_SUCCESS)
5777 } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
5779 if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
5781 rc = mc->mc_dbx->md_dcmp(data, &d2);
5783 if (op == MDB_GET_BOTH || rc > 0)
5784 return MDB_NOTFOUND;
5791 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5792 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5797 /* The key already matches in all other cases */
5798 if (op == MDB_SET_RANGE || op == MDB_SET_KEY)
5799 MDB_GET_KEY(leaf, key);
5800 DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));
5805 /** Move the cursor to the first item in the database. */
5807 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5813 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5815 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5816 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
5817 if (rc != MDB_SUCCESS)
5820 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5822 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
5823 mc->mc_flags |= C_INITIALIZED;
5824 mc->mc_flags &= ~C_EOF;
5826 mc->mc_ki[mc->mc_top] = 0;
5828 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5829 key->mv_size = mc->mc_db->md_pad;
5830 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
5835 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5836 mdb_xcursor_init1(mc, leaf);
5837 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5841 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5845 MDB_GET_KEY(leaf, key);
5849 /** Move the cursor to the last item in the database. */
5851 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5857 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5859 if (!(mc->mc_flags & C_EOF)) {
5861 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5862 rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
5863 if (rc != MDB_SUCCESS)
5866 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5869 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
5870 mc->mc_flags |= C_INITIALIZED|C_EOF;
5871 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5873 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5874 key->mv_size = mc->mc_db->md_pad;
5875 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
5880 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5881 mdb_xcursor_init1(mc, leaf);
5882 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5886 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5891 MDB_GET_KEY(leaf, key);
5896 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5901 int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data);
5906 if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
5910 case MDB_GET_CURRENT:
5911 if (!(mc->mc_flags & C_INITIALIZED)) {
5914 MDB_page *mp = mc->mc_pg[mc->mc_top];
5915 int nkeys = NUMKEYS(mp);
5916 if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5917 mc->mc_ki[mc->mc_top] = nkeys;
5923 key->mv_size = mc->mc_db->md_pad;
5924 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5926 MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5927 MDB_GET_KEY(leaf, key);
5929 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5930 if (mc->mc_flags & C_DEL)
5931 mdb_xcursor_init1(mc, leaf);
5932 rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
5934 rc = mdb_node_read(mc->mc_txn, leaf, data);
5941 case MDB_GET_BOTH_RANGE:
5946 if (mc->mc_xcursor == NULL) {
5947 rc = MDB_INCOMPATIBLE;
5957 rc = mdb_cursor_set(mc, key, data, op,
5958 op == MDB_SET_RANGE ? NULL : &exact);
5961 case MDB_GET_MULTIPLE:
5962 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
5966 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5967 rc = MDB_INCOMPATIBLE;
5971 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
5972 (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
5975 case MDB_NEXT_MULTIPLE:
5980 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5981 rc = MDB_INCOMPATIBLE;
5984 if (!(mc->mc_flags & C_INITIALIZED))
5985 rc = mdb_cursor_first(mc, key, data);
5987 rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
5988 if (rc == MDB_SUCCESS) {
5989 if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
5992 mx = &mc->mc_xcursor->mx_cursor;
5993 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
5995 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
5996 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
6004 case MDB_NEXT_NODUP:
6005 if (!(mc->mc_flags & C_INITIALIZED))
6006 rc = mdb_cursor_first(mc, key, data);
6008 rc = mdb_cursor_next(mc, key, data, op);
6012 case MDB_PREV_NODUP:
6013 if (!(mc->mc_flags & C_INITIALIZED)) {
6014 rc = mdb_cursor_last(mc, key, data);
6017 mc->mc_flags |= C_INITIALIZED;
6018 mc->mc_ki[mc->mc_top]++;
6020 rc = mdb_cursor_prev(mc, key, data, op);
6023 rc = mdb_cursor_first(mc, key, data);
6026 mfunc = mdb_cursor_first;
6028 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
6032 if (mc->mc_xcursor == NULL) {
6033 rc = MDB_INCOMPATIBLE;
6037 MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6038 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6039 MDB_GET_KEY(leaf, key);
6040 rc = mdb_node_read(mc->mc_txn, leaf, data);
6044 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
6048 rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
6051 rc = mdb_cursor_last(mc, key, data);
6054 mfunc = mdb_cursor_last;
6057 DPRINTF(("unhandled/unimplemented cursor operation %u", op));
6062 if (mc->mc_flags & C_DEL)
6063 mc->mc_flags ^= C_DEL;
6068 /** Touch all the pages in the cursor stack. Set mc_top.
6069 * Makes sure all the pages are writable, before attempting a write operation.
6070 * @param[in] mc The cursor to operate on.
6073 mdb_cursor_touch(MDB_cursor *mc)
6075 int rc = MDB_SUCCESS;
6077 if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
6080 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
6082 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
6083 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
6086 *mc->mc_dbflag |= DB_DIRTY;
6091 rc = mdb_page_touch(mc);
6092 } while (!rc && ++(mc->mc_top) < mc->mc_snum);
6093 mc->mc_top = mc->mc_snum-1;
6098 /** Do not spill pages to disk if txn is getting full, may fail instead */
6099 #define MDB_NOSPILL 0x8000
6102 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
6105 enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
6107 MDB_node *leaf = NULL;
6110 MDB_val xdata, *rdata, dkey, olddata;
6112 int do_sub = 0, insert_key, insert_data;
6113 unsigned int mcount = 0, dcount = 0, nospill;
6116 unsigned int nflags;
6119 if (mc == NULL || key == NULL)
6122 env = mc->mc_txn->mt_env;
6124 /* Check this first so counter will always be zero on any
6127 if (flags & MDB_MULTIPLE) {
6128 dcount = data[1].mv_size;
6129 data[1].mv_size = 0;
6130 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))
6131 return MDB_INCOMPATIBLE;
6134 nospill = flags & MDB_NOSPILL;
6135 flags &= ~MDB_NOSPILL;
6137 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
6138 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6140 if (key->mv_size-1 >= ENV_MAXKEY(env))
6141 return MDB_BAD_VALSIZE;
6143 #if SIZE_MAX > MAXDATASIZE
6144 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
6145 return MDB_BAD_VALSIZE;
6147 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
6148 return MDB_BAD_VALSIZE;
6151 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
6152 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
6156 if (flags == MDB_CURRENT) {
6157 if (!(mc->mc_flags & C_INITIALIZED))
6160 } else if (mc->mc_db->md_root == P_INVALID) {
6161 /* new database, cursor has nothing to point to */
6164 mc->mc_flags &= ~C_INITIALIZED;
6169 if (flags & MDB_APPEND) {
6171 rc = mdb_cursor_last(mc, &k2, &d2);
6173 rc = mc->mc_dbx->md_cmp(key, &k2);
6176 mc->mc_ki[mc->mc_top]++;
6178 /* new key is <= last key */
6183 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
6185 if ((flags & MDB_NOOVERWRITE) && rc == 0) {
6186 DPRINTF(("duplicate key [%s]", DKEY(key)));
6188 return MDB_KEYEXIST;
6190 if (rc && rc != MDB_NOTFOUND)
6194 if (mc->mc_flags & C_DEL)
6195 mc->mc_flags ^= C_DEL;
6197 /* Cursor is positioned, check for room in the dirty list */
6199 if (flags & MDB_MULTIPLE) {
6201 xdata.mv_size = data->mv_size * dcount;
6205 if ((rc2 = mdb_page_spill(mc, key, rdata)))
6209 if (rc == MDB_NO_ROOT) {
6211 /* new database, write a root leaf page */
6212 DPUTS("allocating new root leaf page");
6213 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) {
6216 mdb_cursor_push(mc, np);
6217 mc->mc_db->md_root = np->mp_pgno;
6218 mc->mc_db->md_depth++;
6219 *mc->mc_dbflag |= DB_DIRTY;
6220 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
6222 np->mp_flags |= P_LEAF2;
6223 mc->mc_flags |= C_INITIALIZED;
6225 /* make sure all cursor pages are writable */
6226 rc2 = mdb_cursor_touch(mc);
6231 insert_key = insert_data = rc;
6233 /* The key does not exist */
6234 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
6235 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
6236 LEAFSIZE(key, data) > env->me_nodemax)
6238 /* Too big for a node, insert in sub-DB. Set up an empty
6239 * "old sub-page" for prep_subDB to expand to a full page.
6241 fp_flags = P_LEAF|P_DIRTY;
6243 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
6244 fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
6245 olddata.mv_size = PAGEHDRSZ;
6249 /* there's only a key anyway, so this is a no-op */
6250 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6252 unsigned int ksize = mc->mc_db->md_pad;
6253 if (key->mv_size != ksize)
6254 return MDB_BAD_VALSIZE;
6255 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
6256 memcpy(ptr, key->mv_data, ksize);
6258 /* if overwriting slot 0 of leaf, need to
6259 * update branch key if there is a parent page
6261 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6262 unsigned short top = mc->mc_top;
6264 /* slot 0 is always an empty key, find real slot */
6265 while (mc->mc_top && !mc->mc_ki[mc->mc_top])
6267 if (mc->mc_ki[mc->mc_top])
6268 rc2 = mdb_update_key(mc, key);
6279 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6280 olddata.mv_size = NODEDSZ(leaf);
6281 olddata.mv_data = NODEDATA(leaf);
6284 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
6285 /* Prepare (sub-)page/sub-DB to accept the new item,
6286 * if needed. fp: old sub-page or a header faking
6287 * it. mp: new (sub-)page. offset: growth in page
6288 * size. xdata: node data with new page or DB.
6290 unsigned i, offset = 0;
6291 mp = fp = xdata.mv_data = env->me_pbuf;
6292 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
6294 /* Was a single item before, must convert now */
6295 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6296 /* Just overwrite the current item */
6297 if (flags == MDB_CURRENT)
6300 #if UINT_MAX < SIZE_MAX
6301 if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
6302 mc->mc_dbx->md_dcmp = mdb_cmp_clong;
6304 /* does data match? */
6305 if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
6306 if (flags & MDB_NODUPDATA)
6307 return MDB_KEYEXIST;
6312 /* Back up original data item */
6313 dkey.mv_size = olddata.mv_size;
6314 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
6316 /* Make sub-page header for the dup items, with dummy body */
6317 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
6318 fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
6319 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
6320 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6321 fp->mp_flags |= P_LEAF2;
6322 fp->mp_pad = data->mv_size;
6323 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
6325 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
6326 (dkey.mv_size & 1) + (data->mv_size & 1);
6328 fp->mp_upper = xdata.mv_size - PAGEBASE;
6329 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
6330 } else if (leaf->mn_flags & F_SUBDATA) {
6331 /* Data is on sub-DB, just store it */
6332 flags |= F_DUPDATA|F_SUBDATA;
6335 /* Data is on sub-page */
6336 fp = olddata.mv_data;
6339 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
6340 offset = EVEN(NODESIZE + sizeof(indx_t) +
6344 offset = fp->mp_pad;
6345 if (SIZELEFT(fp) < offset) {
6346 offset *= 4; /* space for 4 more */
6349 /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
6351 fp->mp_flags |= P_DIRTY;
6352 COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
6353 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
6357 xdata.mv_size = olddata.mv_size + offset;
6360 fp_flags = fp->mp_flags;
6361 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
6362 /* Too big for a sub-page, convert to sub-DB */
6363 fp_flags &= ~P_SUBP;
6365 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6366 fp_flags |= P_LEAF2;
6367 dummy.md_pad = fp->mp_pad;
6368 dummy.md_flags = MDB_DUPFIXED;
6369 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
6370 dummy.md_flags |= MDB_INTEGERKEY;
6376 dummy.md_branch_pages = 0;
6377 dummy.md_leaf_pages = 1;
6378 dummy.md_overflow_pages = 0;
6379 dummy.md_entries = NUMKEYS(fp);
6380 xdata.mv_size = sizeof(MDB_db);
6381 xdata.mv_data = &dummy;
6382 if ((rc = mdb_page_alloc(mc, 1, &mp)))
6384 offset = env->me_psize - olddata.mv_size;
6385 flags |= F_DUPDATA|F_SUBDATA;
6386 dummy.md_root = mp->mp_pgno;
6389 mp->mp_flags = fp_flags | P_DIRTY;
6390 mp->mp_pad = fp->mp_pad;
6391 mp->mp_lower = fp->mp_lower;
6392 mp->mp_upper = fp->mp_upper + offset;
6393 if (fp_flags & P_LEAF2) {
6394 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6396 memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
6397 olddata.mv_size - fp->mp_upper - PAGEBASE);
6398 for (i=0; i<NUMKEYS(fp); i++)
6399 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
6407 mdb_node_del(mc, 0);
6411 /* overflow page overwrites need special handling */
6412 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6415 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
6417 memcpy(&pg, olddata.mv_data, sizeof(pg));
6418 if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
6420 ovpages = omp->mp_pages;
6422 /* Is the ov page large enough? */
6423 if (ovpages >= dpages) {
6424 if (!(omp->mp_flags & P_DIRTY) &&
6425 (level || (env->me_flags & MDB_WRITEMAP)))
6427 rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
6430 level = 0; /* dirty in this txn or clean */
6433 if (omp->mp_flags & P_DIRTY) {
6434 /* yes, overwrite it. Note in this case we don't
6435 * bother to try shrinking the page if the new data
6436 * is smaller than the overflow threshold.
6439 /* It is writable only in a parent txn */
6440 size_t sz = (size_t) env->me_psize * ovpages, off;
6441 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
6447 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
6448 mdb_cassert(mc, rc2 == 0);
6449 if (!(flags & MDB_RESERVE)) {
6450 /* Copy end of page, adjusting alignment so
6451 * compiler may copy words instead of bytes.
6453 off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
6454 memcpy((size_t *)((char *)np + off),
6455 (size_t *)((char *)omp + off), sz - off);
6458 memcpy(np, omp, sz); /* Copy beginning of page */
6461 SETDSZ(leaf, data->mv_size);
6462 if (F_ISSET(flags, MDB_RESERVE))
6463 data->mv_data = METADATA(omp);
6465 memcpy(METADATA(omp), data->mv_data, data->mv_size);
6469 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
6471 } else if (data->mv_size == olddata.mv_size) {
6472 /* same size, just replace it. Note that we could
6473 * also reuse this node if the new data is smaller,
6474 * but instead we opt to shrink the node in that case.
6476 if (F_ISSET(flags, MDB_RESERVE))
6477 data->mv_data = olddata.mv_data;
6478 else if (!(mc->mc_flags & C_SUB))
6479 memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6481 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6486 mdb_node_del(mc, 0);
6492 nflags = flags & NODE_ADD_FLAGS;
6493 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
6494 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
6495 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
6496 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
6498 nflags |= MDB_SPLIT_REPLACE;
6499 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
6501 /* There is room already in this leaf page. */
6502 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
6503 if (rc == 0 && insert_key) {
6504 /* Adjust other cursors pointing to mp */
6505 MDB_cursor *m2, *m3;
6506 MDB_dbi dbi = mc->mc_dbi;
6507 unsigned i = mc->mc_top;
6508 MDB_page *mp = mc->mc_pg[i];
6510 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6511 if (mc->mc_flags & C_SUB)
6512 m3 = &m2->mc_xcursor->mx_cursor;
6515 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
6516 if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
6523 if (rc == MDB_SUCCESS) {
6524 /* Now store the actual data in the child DB. Note that we're
6525 * storing the user data in the keys field, so there are strict
6526 * size limits on dupdata. The actual data fields of the child
6527 * DB are all zero size.
6535 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6536 if (flags & MDB_CURRENT) {
6537 xflags = MDB_CURRENT|MDB_NOSPILL;
6539 mdb_xcursor_init1(mc, leaf);
6540 xflags = (flags & MDB_NODUPDATA) ?
6541 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
6543 /* converted, write the original data first */
6545 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
6549 /* Adjust other cursors pointing to mp */
6551 unsigned i = mc->mc_top;
6552 MDB_page *mp = mc->mc_pg[i];
6554 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6555 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6556 if (!(m2->mc_flags & C_INITIALIZED)) continue;
6557 if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
6558 mdb_xcursor_init1(m2, leaf);
6562 /* we've done our job */
6565 ecount = mc->mc_xcursor->mx_db.md_entries;
6566 if (flags & MDB_APPENDDUP)
6567 xflags |= MDB_APPEND;
6568 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
6569 if (flags & F_SUBDATA) {
6570 void *db = NODEDATA(leaf);
6571 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6573 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
6575 /* Increment count unless we just replaced an existing item. */
6577 mc->mc_db->md_entries++;
6579 /* Invalidate txn if we created an empty sub-DB */
6582 /* If we succeeded and the key didn't exist before,
6583 * make sure the cursor is marked valid.
6585 mc->mc_flags |= C_INITIALIZED;
6587 if (flags & MDB_MULTIPLE) {
6590 /* let caller know how many succeeded, if any */
6591 data[1].mv_size = mcount;
6592 if (mcount < dcount) {
6593 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6594 insert_key = insert_data = 0;
6601 if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */
6604 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6609 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6615 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
6616 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6618 if (!(mc->mc_flags & C_INITIALIZED))
6621 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6622 return MDB_NOTFOUND;
6624 if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
6627 rc = mdb_cursor_touch(mc);
6631 mp = mc->mc_pg[mc->mc_top];
6634 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6636 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6637 if (flags & MDB_NODUPDATA) {
6638 /* mdb_cursor_del0() will subtract the final entry */
6639 mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
6641 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
6642 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6644 rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
6647 /* If sub-DB still has entries, we're done */
6648 if (mc->mc_xcursor->mx_db.md_entries) {
6649 if (leaf->mn_flags & F_SUBDATA) {
6650 /* update subDB info */
6651 void *db = NODEDATA(leaf);
6652 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6655 /* shrink fake page */
6656 mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6657 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6658 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6659 /* fix other sub-DB cursors pointed at this fake page */
6660 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6661 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6662 if (m2->mc_pg[mc->mc_top] == mp &&
6663 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
6664 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6667 mc->mc_db->md_entries--;
6668 mc->mc_flags |= C_DEL;
6671 /* otherwise fall thru and delete the sub-DB */
6674 if (leaf->mn_flags & F_SUBDATA) {
6675 /* add all the child DB's pages to the free list */
6676 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
6682 /* add overflow pages to free list */
6683 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6687 memcpy(&pg, NODEDATA(leaf), sizeof(pg));
6688 if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) ||
6689 (rc = mdb_ovpage_free(mc, omp)))
6694 return mdb_cursor_del0(mc);
6697 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6701 /** Allocate and initialize new pages for a database.
6702 * @param[in] mc a cursor on the database being added to.
6703 * @param[in] flags flags defining what type of page is being allocated.
6704 * @param[in] num the number of pages to allocate. This is usually 1,
6705 * unless allocating overflow pages for a large record.
6706 * @param[out] mp Address of a page, or NULL on failure.
6707 * @return 0 on success, non-zero on failure.
6710 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
6715 if ((rc = mdb_page_alloc(mc, num, &np)))
6717 DPRINTF(("allocated new mpage %"Z"u, page size %u",
6718 np->mp_pgno, mc->mc_txn->mt_env->me_psize));
6719 np->mp_flags = flags | P_DIRTY;
6720 np->mp_lower = (PAGEHDRSZ-PAGEBASE);
6721 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
6724 mc->mc_db->md_branch_pages++;
6725 else if (IS_LEAF(np))
6726 mc->mc_db->md_leaf_pages++;
6727 else if (IS_OVERFLOW(np)) {
6728 mc->mc_db->md_overflow_pages += num;
6736 /** Calculate the size of a leaf node.
6737 * The size depends on the environment's page size; if a data item
6738 * is too large it will be put onto an overflow page and the node
6739 * size will only include the key and not the data. Sizes are always
6740 * rounded up to an even number of bytes, to guarantee 2-byte alignment
6741 * of the #MDB_node headers.
6742 * @param[in] env The environment handle.
6743 * @param[in] key The key for the node.
6744 * @param[in] data The data for the node.
6745 * @return The number of bytes needed to store the node.
6748 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
6752 sz = LEAFSIZE(key, data);
6753 if (sz > env->me_nodemax) {
6754 /* put on overflow page */
6755 sz -= data->mv_size - sizeof(pgno_t);
6758 return EVEN(sz + sizeof(indx_t));
6761 /** Calculate the size of a branch node.
6762 * The size should depend on the environment's page size but since
6763 * we currently don't support spilling large keys onto overflow
6764 * pages, it's simply the size of the #MDB_node header plus the
6765 * size of the key. Sizes are always rounded up to an even number
6766 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
6767 * @param[in] env The environment handle.
6768 * @param[in] key The key for the node.
6769 * @return The number of bytes needed to store the node.
6772 mdb_branch_size(MDB_env *env, MDB_val *key)
6777 if (sz > env->me_nodemax) {
6778 /* put on overflow page */
6779 /* not implemented */
6780 /* sz -= key->size - sizeof(pgno_t); */
6783 return sz + sizeof(indx_t);
6786 /** Add a node to the page pointed to by the cursor.
6787 * @param[in] mc The cursor for this operation.
6788 * @param[in] indx The index on the page where the new node should be added.
6789 * @param[in] key The key for the new node.
6790 * @param[in] data The data for the new node, if any.
6791 * @param[in] pgno The page number, if adding a branch node.
6792 * @param[in] flags Flags for the node.
6793 * @return 0 on success, non-zero on failure. Possible errors are:
6795 * <li>ENOMEM - failed to allocate overflow pages for the node.
6796 * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error
6797 * should never happen since all callers already calculate the
6798 * page's free space before calling this function.
6802 mdb_node_add(MDB_cursor *mc, indx_t indx,
6803 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
6806 size_t node_size = NODESIZE;
6810 MDB_page *mp = mc->mc_pg[mc->mc_top];
6811 MDB_page *ofp = NULL; /* overflow page */
6814 mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);
6816 DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
6817 IS_LEAF(mp) ? "leaf" : "branch",
6818 IS_SUBP(mp) ? "sub-" : "",
6819 mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
6820 key ? key->mv_size : 0, key ? DKEY(key) : "null"));
6823 /* Move higher keys up one slot. */
6824 int ksize = mc->mc_db->md_pad, dif;
6825 char *ptr = LEAF2KEY(mp, indx, ksize);
6826 dif = NUMKEYS(mp) - indx;
6828 memmove(ptr+ksize, ptr, dif*ksize);
6829 /* insert new key */
6830 memcpy(ptr, key->mv_data, ksize);
6832 /* Just using these for counting */
6833 mp->mp_lower += sizeof(indx_t);
6834 mp->mp_upper -= ksize - sizeof(indx_t);
6838 room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
6840 node_size += key->mv_size;
6842 mdb_cassert(mc, data);
6843 if (F_ISSET(flags, F_BIGDATA)) {
6844 /* Data already on overflow page. */
6845 node_size += sizeof(pgno_t);
6846 } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
6847 int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
6849 /* Put data on overflow page. */
6850 DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
6851 data->mv_size, node_size+data->mv_size));
6852 node_size = EVEN(node_size + sizeof(pgno_t));
6853 if ((ssize_t)node_size > room)
6855 if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
6857 DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
6861 node_size += data->mv_size;
6864 node_size = EVEN(node_size);
6865 if ((ssize_t)node_size > room)
6869 /* Move higher pointers up one slot. */
6870 for (i = NUMKEYS(mp); i > indx; i--)
6871 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
6873 /* Adjust free space offsets. */
6874 ofs = mp->mp_upper - node_size;
6875 mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
6876 mp->mp_ptrs[indx] = ofs;
6878 mp->mp_lower += sizeof(indx_t);
6880 /* Write the node data. */
6881 node = NODEPTR(mp, indx);
6882 node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
6883 node->mn_flags = flags;
6885 SETDSZ(node,data->mv_size);
6890 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
6893 mdb_cassert(mc, key);
6895 if (F_ISSET(flags, F_BIGDATA))
6896 memcpy(node->mn_data + key->mv_size, data->mv_data,
6898 else if (F_ISSET(flags, MDB_RESERVE))
6899 data->mv_data = node->mn_data + key->mv_size;
6901 memcpy(node->mn_data + key->mv_size, data->mv_data,
6904 memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
6906 if (F_ISSET(flags, MDB_RESERVE))
6907 data->mv_data = METADATA(ofp);
6909 memcpy(METADATA(ofp), data->mv_data, data->mv_size);
6916 DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6917 mdb_dbg_pgno(mp), NUMKEYS(mp)));
6918 DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
6919 DPRINTF(("node size = %"Z"u", node_size));
6920 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6921 return MDB_PAGE_FULL;
6924 /** Delete the specified node from a page.
6925 * @param[in] mc Cursor pointing to the node to delete.
6926 * @param[in] ksize The size of a node. Only used if the page is
6927 * part of a #MDB_DUPFIXED database.
6930 mdb_node_del(MDB_cursor *mc, int ksize)
6932 MDB_page *mp = mc->mc_pg[mc->mc_top];
6933 indx_t indx = mc->mc_ki[mc->mc_top];
6935 indx_t i, j, numkeys, ptr;
6939 DPRINTF(("delete node %u on %s page %"Z"u", indx,
6940 IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
6941 numkeys = NUMKEYS(mp);
6942 mdb_cassert(mc, indx < numkeys);
6945 int x = numkeys - 1 - indx;
6946 base = LEAF2KEY(mp, indx, ksize);
6948 memmove(base, base + ksize, x * ksize);
6949 mp->mp_lower -= sizeof(indx_t);
6950 mp->mp_upper += ksize - sizeof(indx_t);
6954 node = NODEPTR(mp, indx);
6955 sz = NODESIZE + node->mn_ksize;
6957 if (F_ISSET(node->mn_flags, F_BIGDATA))
6958 sz += sizeof(pgno_t);
6960 sz += NODEDSZ(node);
6964 ptr = mp->mp_ptrs[indx];
6965 for (i = j = 0; i < numkeys; i++) {
6967 mp->mp_ptrs[j] = mp->mp_ptrs[i];
6968 if (mp->mp_ptrs[i] < ptr)
6969 mp->mp_ptrs[j] += sz;
6974 base = (char *)mp + mp->mp_upper + PAGEBASE;
6975 memmove(base + sz, base, ptr - mp->mp_upper);
6977 mp->mp_lower -= sizeof(indx_t);
6981 /** Compact the main page after deleting a node on a subpage.
6982 * @param[in] mp The main page to operate on.
6983 * @param[in] indx The index of the subpage on the main page.
6986 mdb_node_shrink(MDB_page *mp, indx_t indx)
6992 indx_t i, numkeys, ptr;
6994 node = NODEPTR(mp, indx);
6995 sp = (MDB_page *)NODEDATA(node);
6996 delta = SIZELEFT(sp);
6997 xp = (MDB_page *)((char *)sp + delta);
6999 /* shift subpage upward */
7001 nsize = NUMKEYS(sp) * sp->mp_pad;
7003 return; /* do not make the node uneven-sized */
7004 memmove(METADATA(xp), METADATA(sp), nsize);
7007 numkeys = NUMKEYS(sp);
7008 for (i=numkeys-1; i>=0; i--)
7009 xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
7011 xp->mp_upper = sp->mp_lower;
7012 xp->mp_lower = sp->mp_lower;
7013 xp->mp_flags = sp->mp_flags;
7014 xp->mp_pad = sp->mp_pad;
7015 COPY_PGNO(xp->mp_pgno, mp->mp_pgno);
7017 nsize = NODEDSZ(node) - delta;
7018 SETDSZ(node, nsize);
7020 /* shift lower nodes upward */
7021 ptr = mp->mp_ptrs[indx];
7022 numkeys = NUMKEYS(mp);
7023 for (i = 0; i < numkeys; i++) {
7024 if (mp->mp_ptrs[i] <= ptr)
7025 mp->mp_ptrs[i] += delta;
7028 base = (char *)mp + mp->mp_upper + PAGEBASE;
7029 memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
7030 mp->mp_upper += delta;
7033 /** Initial setup of a sorted-dups cursor.
7034 * Sorted duplicates are implemented as a sub-database for the given key.
7035 * The duplicate data items are actually keys of the sub-database.
7036 * Operations on the duplicate data items are performed using a sub-cursor
7037 * initialized when the sub-database is first accessed. This function does
7038 * the preliminary setup of the sub-cursor, filling in the fields that
7039 * depend only on the parent DB.
7040 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
7043 mdb_xcursor_init0(MDB_cursor *mc)
7045 MDB_xcursor *mx = mc->mc_xcursor;
7047 mx->mx_cursor.mc_xcursor = NULL;
7048 mx->mx_cursor.mc_txn = mc->mc_txn;
7049 mx->mx_cursor.mc_db = &mx->mx_db;
7050 mx->mx_cursor.mc_dbx = &mx->mx_dbx;
7051 mx->mx_cursor.mc_dbi = mc->mc_dbi;
7052 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
7053 mx->mx_cursor.mc_snum = 0;
7054 mx->mx_cursor.mc_top = 0;
7055 mx->mx_cursor.mc_flags = C_SUB;
7056 mx->mx_dbx.md_name.mv_size = 0;
7057 mx->mx_dbx.md_name.mv_data = NULL;
7058 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
7059 mx->mx_dbx.md_dcmp = NULL;
7060 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
7063 /** Final setup of a sorted-dups cursor.
7064 * Sets up the fields that depend on the data from the main cursor.
7065 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
7066 * @param[in] node The data containing the #MDB_db record for the
7067 * sorted-dup database.
7070 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
7072 MDB_xcursor *mx = mc->mc_xcursor;
7074 if (node->mn_flags & F_SUBDATA) {
7075 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
7076 mx->mx_cursor.mc_pg[0] = 0;
7077 mx->mx_cursor.mc_snum = 0;
7078 mx->mx_cursor.mc_top = 0;
7079 mx->mx_cursor.mc_flags = C_SUB;
7081 MDB_page *fp = NODEDATA(node);
7082 mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
7083 mx->mx_db.md_flags = 0;
7084 mx->mx_db.md_depth = 1;
7085 mx->mx_db.md_branch_pages = 0;
7086 mx->mx_db.md_leaf_pages = 1;
7087 mx->mx_db.md_overflow_pages = 0;
7088 mx->mx_db.md_entries = NUMKEYS(fp);
7089 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
7090 mx->mx_cursor.mc_snum = 1;
7091 mx->mx_cursor.mc_top = 0;
7092 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
7093 mx->mx_cursor.mc_pg[0] = fp;
7094 mx->mx_cursor.mc_ki[0] = 0;
7095 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
7096 mx->mx_db.md_flags = MDB_DUPFIXED;
7097 mx->mx_db.md_pad = fp->mp_pad;
7098 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
7099 mx->mx_db.md_flags |= MDB_INTEGERKEY;
7102 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
7103 mx->mx_db.md_root));
7104 mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
7105 #if UINT_MAX < SIZE_MAX
7106 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
7107 mx->mx_dbx.md_cmp = mdb_cmp_clong;
7111 /** Initialize a cursor for a given transaction and database. */
7113 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
7116 mc->mc_backup = NULL;
7119 mc->mc_db = &txn->mt_dbs[dbi];
7120 mc->mc_dbx = &txn->mt_dbxs[dbi];
7121 mc->mc_dbflag = &txn->mt_dbflags[dbi];
7126 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
7127 mdb_tassert(txn, mx != NULL);
7128 mc->mc_xcursor = mx;
7129 mdb_xcursor_init0(mc);
7131 mc->mc_xcursor = NULL;
7133 if (*mc->mc_dbflag & DB_STALE) {
7134 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
7139 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
7142 size_t size = sizeof(MDB_cursor);
7144 if (!ret || !TXN_DBI_EXIST(txn, dbi))
7147 if (txn->mt_flags & MDB_TXN_ERROR)
7150 /* Allow read access to the freelist */
7151 if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
7154 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
7155 size += sizeof(MDB_xcursor);
7157 if ((mc = malloc(size)) != NULL) {
7158 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1));
7159 if (txn->mt_cursors) {
7160 mc->mc_next = txn->mt_cursors[dbi];
7161 txn->mt_cursors[dbi] = mc;
7162 mc->mc_flags |= C_UNTRACK;
7174 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc)
7176 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi))
7179 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
7182 if (txn->mt_flags & MDB_TXN_ERROR)
7185 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
7189 /* Return the count of duplicate data items for the current key */
7191 mdb_cursor_count(MDB_cursor *mc, size_t *countp)
7195 if (mc == NULL || countp == NULL)
7198 if (mc->mc_xcursor == NULL)
7199 return MDB_INCOMPATIBLE;
7201 if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
7204 if (!(mc->mc_flags & C_INITIALIZED))
7207 if (!mc->mc_snum || (mc->mc_flags & C_EOF))
7208 return MDB_NOTFOUND;
7210 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7211 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7214 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
7217 *countp = mc->mc_xcursor->mx_db.md_entries;
7223 mdb_cursor_close(MDB_cursor *mc)
7225 if (mc && !mc->mc_backup) {
7226 /* remove from txn, if tracked */
7227 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
7228 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
7229 while (*prev && *prev != mc) prev = &(*prev)->mc_next;
7231 *prev = mc->mc_next;
7238 mdb_cursor_txn(MDB_cursor *mc)
7240 if (!mc) return NULL;
7245 mdb_cursor_dbi(MDB_cursor *mc)
7250 /** Replace the key for a branch node with a new key.
7251 * @param[in] mc Cursor pointing to the node to operate on.
7252 * @param[in] key The new key to use.
7253 * @return 0 on success, non-zero on failure.
7256 mdb_update_key(MDB_cursor *mc, MDB_val *key)
7262 int delta, ksize, oksize;
7263 indx_t ptr, i, numkeys, indx;
7266 indx = mc->mc_ki[mc->mc_top];
7267 mp = mc->mc_pg[mc->mc_top];
7268 node = NODEPTR(mp, indx);
7269 ptr = mp->mp_ptrs[indx];
7273 char kbuf2[DKBUF_MAXKEYSIZE*2+1];
7274 k2.mv_data = NODEKEY(node);
7275 k2.mv_size = node->mn_ksize;
7276 DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
7278 mdb_dkey(&k2, kbuf2),
7284 /* Sizes must be 2-byte aligned. */
7285 ksize = EVEN(key->mv_size);
7286 oksize = EVEN(node->mn_ksize);
7287 delta = ksize - oksize;
7289 /* Shift node contents if EVEN(key length) changed. */
7291 if (delta > 0 && SIZELEFT(mp) < delta) {
7293 /* not enough space left, do a delete and split */
7294 DPRINTF(("Not enough room, delta = %d, splitting...", delta));
7295 pgno = NODEPGNO(node);
7296 mdb_node_del(mc, 0);
7297 return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
7300 numkeys = NUMKEYS(mp);
7301 for (i = 0; i < numkeys; i++) {
7302 if (mp->mp_ptrs[i] <= ptr)
7303 mp->mp_ptrs[i] -= delta;
7306 base = (char *)mp + mp->mp_upper + PAGEBASE;
7307 len = ptr - mp->mp_upper + NODESIZE;
7308 memmove(base - delta, base, len);
7309 mp->mp_upper -= delta;
7311 node = NODEPTR(mp, indx);
7314 /* But even if no shift was needed, update ksize */
7315 if (node->mn_ksize != key->mv_size)
7316 node->mn_ksize = key->mv_size;
7319 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
7325 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
7327 /** Move a node from csrc to cdst.
7330 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7337 unsigned short flags;
7341 /* Mark src and dst as dirty. */
7342 if ((rc = mdb_page_touch(csrc)) ||
7343 (rc = mdb_page_touch(cdst)))
7346 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7347 key.mv_size = csrc->mc_db->md_pad;
7348 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
7350 data.mv_data = NULL;
7354 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
7355 mdb_cassert(csrc, !((size_t)srcnode & 1));
7356 srcpg = NODEPGNO(srcnode);
7357 flags = srcnode->mn_flags;
7358 if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7359 unsigned int snum = csrc->mc_snum;
7361 /* must find the lowest key below src */
7362 rc = mdb_page_search_lowest(csrc);
7365 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7366 key.mv_size = csrc->mc_db->md_pad;
7367 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7369 s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7370 key.mv_size = NODEKSZ(s2);
7371 key.mv_data = NODEKEY(s2);
7373 csrc->mc_snum = snum--;
7374 csrc->mc_top = snum;
7376 key.mv_size = NODEKSZ(srcnode);
7377 key.mv_data = NODEKEY(srcnode);
7379 data.mv_size = NODEDSZ(srcnode);
7380 data.mv_data = NODEDATA(srcnode);
7382 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
7383 unsigned int snum = cdst->mc_snum;
7386 /* must find the lowest key below dst */
7387 mdb_cursor_copy(cdst, &mn);
7388 rc = mdb_page_search_lowest(&mn);
7391 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7392 bkey.mv_size = mn.mc_db->md_pad;
7393 bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
7395 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7396 bkey.mv_size = NODEKSZ(s2);
7397 bkey.mv_data = NODEKEY(s2);
7399 mn.mc_snum = snum--;
7402 rc = mdb_update_key(&mn, &bkey);
7407 DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
7408 IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
7409 csrc->mc_ki[csrc->mc_top],
7411 csrc->mc_pg[csrc->mc_top]->mp_pgno,
7412 cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));
7414 /* Add the node to the destination page.
7416 rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
7417 if (rc != MDB_SUCCESS)
7420 /* Delete the node from the source page.
7422 mdb_node_del(csrc, key.mv_size);
7425 /* Adjust other cursors pointing to mp */
7426 MDB_cursor *m2, *m3;
7427 MDB_dbi dbi = csrc->mc_dbi;
7428 MDB_page *mp = csrc->mc_pg[csrc->mc_top];
7430 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7431 if (csrc->mc_flags & C_SUB)
7432 m3 = &m2->mc_xcursor->mx_cursor;
7435 if (m3 == csrc) continue;
7436 if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
7437 csrc->mc_ki[csrc->mc_top]) {
7438 m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
7439 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
7444 /* Update the parent separators.
7446 if (csrc->mc_ki[csrc->mc_top] == 0) {
7447 if (csrc->mc_ki[csrc->mc_top-1] != 0) {
7448 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7449 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7451 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7452 key.mv_size = NODEKSZ(srcnode);
7453 key.mv_data = NODEKEY(srcnode);
7455 DPRINTF(("update separator for source page %"Z"u to [%s]",
7456 csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
7457 mdb_cursor_copy(csrc, &mn);
7460 if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7463 if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7465 indx_t ix = csrc->mc_ki[csrc->mc_top];
7466 nullkey.mv_size = 0;
7467 csrc->mc_ki[csrc->mc_top] = 0;
7468 rc = mdb_update_key(csrc, &nullkey);
7469 csrc->mc_ki[csrc->mc_top] = ix;
7470 mdb_cassert(csrc, rc == MDB_SUCCESS);
7474 if (cdst->mc_ki[cdst->mc_top] == 0) {
7475 if (cdst->mc_ki[cdst->mc_top-1] != 0) {
7476 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7477 key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
7479 srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
7480 key.mv_size = NODEKSZ(srcnode);
7481 key.mv_data = NODEKEY(srcnode);
7483 DPRINTF(("update separator for destination page %"Z"u to [%s]",
7484 cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
7485 mdb_cursor_copy(cdst, &mn);
7488 if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7491 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
7493 indx_t ix = cdst->mc_ki[cdst->mc_top];
7494 nullkey.mv_size = 0;
7495 cdst->mc_ki[cdst->mc_top] = 0;
7496 rc = mdb_update_key(cdst, &nullkey);
7497 cdst->mc_ki[cdst->mc_top] = ix;
7498 mdb_cassert(csrc, rc == MDB_SUCCESS);
7505 /** Merge one page into another.
7506 * The nodes from the page pointed to by \b csrc will
7507 * be copied to the page pointed to by \b cdst and then
7508 * the \b csrc page will be freed.
7509 * @param[in] csrc Cursor pointing to the source page.
7510 * @param[in] cdst Cursor pointing to the destination page.
7511 * @return 0 on success, non-zero on failure.
7514 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7516 MDB_page *psrc, *pdst;
7523 psrc = csrc->mc_pg[csrc->mc_top];
7524 pdst = cdst->mc_pg[cdst->mc_top];
7526 DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));
7528 mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
7529 mdb_cassert(csrc, cdst->mc_snum > 1);
7531 /* Mark dst as dirty. */
7532 if ((rc = mdb_page_touch(cdst)))
7535 /* Move all nodes from src to dst.
7537 j = nkeys = NUMKEYS(pdst);
7538 if (IS_LEAF2(psrc)) {
7539 key.mv_size = csrc->mc_db->md_pad;
7540 key.mv_data = METADATA(psrc);
7541 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7542 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
7543 if (rc != MDB_SUCCESS)
7545 key.mv_data = (char *)key.mv_data + key.mv_size;
7548 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7549 srcnode = NODEPTR(psrc, i);
7550 if (i == 0 && IS_BRANCH(psrc)) {
7553 mdb_cursor_copy(csrc, &mn);
7554 /* must find the lowest key below src */
7555 rc = mdb_page_search_lowest(&mn);
7558 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7559 key.mv_size = mn.mc_db->md_pad;
7560 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
7562 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7563 key.mv_size = NODEKSZ(s2);
7564 key.mv_data = NODEKEY(s2);
7567 key.mv_size = srcnode->mn_ksize;
7568 key.mv_data = NODEKEY(srcnode);
7571 data.mv_size = NODEDSZ(srcnode);
7572 data.mv_data = NODEDATA(srcnode);
7573 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
7574 if (rc != MDB_SUCCESS)
7579 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
7580 pdst->mp_pgno, NUMKEYS(pdst),
7581 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
7583 /* Unlink the src page from parent and add to free list.
7586 mdb_node_del(csrc, 0);
7587 if (csrc->mc_ki[csrc->mc_top] == 0) {
7589 rc = mdb_update_key(csrc, &key);
7597 psrc = csrc->mc_pg[csrc->mc_top];
7598 /* If not operating on FreeDB, allow this page to be reused
7599 * in this txn. Otherwise just add to free list.
7601 rc = mdb_page_loose(csrc, psrc);
7605 csrc->mc_db->md_leaf_pages--;
7607 csrc->mc_db->md_branch_pages--;
7609 /* Adjust other cursors pointing to mp */
7610 MDB_cursor *m2, *m3;
7611 MDB_dbi dbi = csrc->mc_dbi;
7613 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7614 if (csrc->mc_flags & C_SUB)
7615 m3 = &m2->mc_xcursor->mx_cursor;
7618 if (m3 == csrc) continue;
7619 if (m3->mc_snum < csrc->mc_snum) continue;
7620 if (m3->mc_pg[csrc->mc_top] == psrc) {
7621 m3->mc_pg[csrc->mc_top] = pdst;
7622 m3->mc_ki[csrc->mc_top] += nkeys;
7627 unsigned int snum = cdst->mc_snum;
7628 uint16_t depth = cdst->mc_db->md_depth;
7629 mdb_cursor_pop(cdst);
7630 rc = mdb_rebalance(cdst);
7631 /* Did the tree shrink? */
7632 if (depth > cdst->mc_db->md_depth)
7634 cdst->mc_snum = snum;
7635 cdst->mc_top = snum-1;
7640 /** Copy the contents of a cursor.
7641 * @param[in] csrc The cursor to copy from.
7642 * @param[out] cdst The cursor to copy to.
7645 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
7649 cdst->mc_txn = csrc->mc_txn;
7650 cdst->mc_dbi = csrc->mc_dbi;
7651 cdst->mc_db = csrc->mc_db;
7652 cdst->mc_dbx = csrc->mc_dbx;
7653 cdst->mc_snum = csrc->mc_snum;
7654 cdst->mc_top = csrc->mc_top;
7655 cdst->mc_flags = csrc->mc_flags;
7657 for (i=0; i<csrc->mc_snum; i++) {
7658 cdst->mc_pg[i] = csrc->mc_pg[i];
7659 cdst->mc_ki[i] = csrc->mc_ki[i];
7663 /** Rebalance the tree after a delete operation.
7664 * @param[in] mc Cursor pointing to the page where rebalancing
7666 * @return 0 on success, non-zero on failure.
7669 mdb_rebalance(MDB_cursor *mc)
7673 unsigned int ptop, minkeys;
7677 minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
7678 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
7679 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
7680 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
7681 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));
7683 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
7684 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
7685 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold",
7686 mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
7690 if (mc->mc_snum < 2) {
7691 MDB_page *mp = mc->mc_pg[0];
7693 DPUTS("Can't rebalance a subpage, ignoring");
7696 if (NUMKEYS(mp) == 0) {
7697 DPUTS("tree is completely empty");
7698 mc->mc_db->md_root = P_INVALID;
7699 mc->mc_db->md_depth = 0;
7700 mc->mc_db->md_leaf_pages = 0;
7701 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
7704 /* Adjust cursors pointing to mp */
7707 mc->mc_flags &= ~C_INITIALIZED;
7709 MDB_cursor *m2, *m3;
7710 MDB_dbi dbi = mc->mc_dbi;
7712 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7713 if (mc->mc_flags & C_SUB)
7714 m3 = &m2->mc_xcursor->mx_cursor;
7717 if (m3->mc_snum < mc->mc_snum) continue;
7718 if (m3->mc_pg[0] == mp) {
7721 m3->mc_flags &= ~C_INITIALIZED;
7725 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
7727 DPUTS("collapsing root page!");
7728 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
7731 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
7732 rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL);
7735 mc->mc_db->md_depth--;
7736 mc->mc_db->md_branch_pages--;
7737 mc->mc_ki[0] = mc->mc_ki[1];
7738 for (i = 1; i<mc->mc_db->md_depth; i++) {
7739 mc->mc_pg[i] = mc->mc_pg[i+1];
7740 mc->mc_ki[i] = mc->mc_ki[i+1];
7743 /* Adjust other cursors pointing to mp */
7744 MDB_cursor *m2, *m3;
7745 MDB_dbi dbi = mc->mc_dbi;
7747 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7748 if (mc->mc_flags & C_SUB)
7749 m3 = &m2->mc_xcursor->mx_cursor;
7752 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
7753 if (m3->mc_pg[0] == mp) {
7756 for (i=0; i<m3->mc_snum; i++) {
7757 m3->mc_pg[i] = m3->mc_pg[i+1];
7758 m3->mc_ki[i] = m3->mc_ki[i+1];
7764 DPUTS("root page doesn't need rebalancing");
7768 /* The parent (branch page) must have at least 2 pointers,
7769 * otherwise the tree is invalid.
7771 ptop = mc->mc_top-1;
7772 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);
7774 /* Leaf page fill factor is below the threshold.
7775 * Try to move keys from left or right neighbor, or
7776 * merge with a neighbor page.
7781 mdb_cursor_copy(mc, &mn);
7782 mn.mc_xcursor = NULL;
7784 oldki = mc->mc_ki[mc->mc_top];
7785 if (mc->mc_ki[ptop] == 0) {
7786 /* We're the leftmost leaf in our parent.
7788 DPUTS("reading right neighbor");
7790 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
7791 rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
7794 mn.mc_ki[mn.mc_top] = 0;
7795 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
7797 /* There is at least one neighbor to the left.
7799 DPUTS("reading left neighbor");
7801 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
7802 rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
7805 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
7806 mc->mc_ki[mc->mc_top] = 0;
7809 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
7810 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
7811 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));
7813 /* If the neighbor page is above threshold and has enough keys,
7814 * move one key from it. Otherwise we should try to merge them.
7815 * (A branch page must never have less than 2 keys.)
7817 minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
7818 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
7819 rc = mdb_node_move(&mn, mc);
7820 if (mc->mc_ki[ptop]) {
7824 if (mc->mc_ki[ptop] == 0) {
7825 rc = mdb_page_merge(&mn, mc);
7827 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
7828 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
7829 rc = mdb_page_merge(mc, &mn);
7830 mdb_cursor_copy(&mn, mc);
7832 mc->mc_flags &= ~C_EOF;
7834 mc->mc_ki[mc->mc_top] = oldki;
7838 /** Complete a delete operation started by #mdb_cursor_del(). */
7840 mdb_cursor_del0(MDB_cursor *mc)
7847 ki = mc->mc_ki[mc->mc_top];
7848 mdb_node_del(mc, mc->mc_db->md_pad);
7849 mc->mc_db->md_entries--;
7850 rc = mdb_rebalance(mc);
7852 if (rc == MDB_SUCCESS) {
7853 MDB_cursor *m2, *m3;
7854 MDB_dbi dbi = mc->mc_dbi;
7856 mp = mc->mc_pg[mc->mc_top];
7857 nkeys = NUMKEYS(mp);
7859 /* if mc points past last node in page, find next sibling */
7860 if (mc->mc_ki[mc->mc_top] >= nkeys) {
7861 rc = mdb_cursor_sibling(mc, 1);
7862 if (rc == MDB_NOTFOUND) {
7863 mc->mc_flags |= C_EOF;
7868 /* Adjust other cursors pointing to mp */
7869 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
7870 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
7871 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
7873 if (m3 == mc || m3->mc_snum < mc->mc_snum)
7875 if (m3->mc_pg[mc->mc_top] == mp) {
7876 if (m3->mc_ki[mc->mc_top] >= ki) {
7877 m3->mc_flags |= C_DEL;
7878 if (m3->mc_ki[mc->mc_top] > ki)
7879 m3->mc_ki[mc->mc_top]--;
7880 else if (mc->mc_db->md_flags & MDB_DUPSORT)
7881 m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
7883 if (m3->mc_ki[mc->mc_top] >= nkeys) {
7884 rc = mdb_cursor_sibling(m3, 1);
7885 if (rc == MDB_NOTFOUND) {
7886 m3->mc_flags |= C_EOF;
7892 mc->mc_flags |= C_DEL;
7896 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
7901 mdb_del(MDB_txn *txn, MDB_dbi dbi,
7902 MDB_val *key, MDB_val *data)
7904 if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
7907 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7908 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7910 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) {
7911 /* must ignore any data */
7915 return mdb_del0(txn, dbi, key, data, 0);
7919 mdb_del0(MDB_txn *txn, MDB_dbi dbi,
7920 MDB_val *key, MDB_val *data, unsigned flags)
7925 MDB_val rdata, *xdata;
7929 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
7931 mdb_cursor_init(&mc, txn, dbi, &mx);
7940 flags |= MDB_NODUPDATA;
7942 rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
7944 /* let mdb_page_split know about this cursor if needed:
7945 * delete will trigger a rebalance; if it needs to move
7946 * a node from one page to another, it will have to
7947 * update the parent's separator key(s). If the new sepkey
7948 * is larger than the current one, the parent page may
7949 * run out of space, triggering a split. We need this
7950 * cursor to be consistent until the end of the rebalance.
7952 mc.mc_flags |= C_UNTRACK;
7953 mc.mc_next = txn->mt_cursors[dbi];
7954 txn->mt_cursors[dbi] = &mc;
7955 rc = mdb_cursor_del(&mc, flags);
7956 txn->mt_cursors[dbi] = mc.mc_next;
7961 /** Split a page and insert a new node.
7962 * @param[in,out] mc Cursor pointing to the page and desired insertion index.
7963 * The cursor will be updated to point to the actual page and index where
7964 * the node got inserted after the split.
7965 * @param[in] newkey The key for the newly inserted node.
7966 * @param[in] newdata The data for the newly inserted node.
7967 * @param[in] newpgno The page number, if the new node is a branch node.
7968 * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
7969 * @return 0 on success, non-zero on failure.
7972 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
7973 unsigned int nflags)
7976 int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
7979 int i, j, split_indx, nkeys, pmax;
7980 MDB_env *env = mc->mc_txn->mt_env;
7982 MDB_val sepkey, rkey, xdata, *rdata = &xdata;
7983 MDB_page *copy = NULL;
7984 MDB_page *mp, *rp, *pp;
7989 mp = mc->mc_pg[mc->mc_top];
7990 newindx = mc->mc_ki[mc->mc_top];
7991 nkeys = NUMKEYS(mp);
7993 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
7994 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
7995 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
7997 /* Create a right sibling. */
7998 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
8000 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno));
8002 if (mc->mc_snum < 2) {
8003 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
8005 /* shift current top to make room for new parent */
8006 mc->mc_pg[1] = mc->mc_pg[0];
8007 mc->mc_ki[1] = mc->mc_ki[0];
8010 mc->mc_db->md_root = pp->mp_pgno;
8011 DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno));
8012 mc->mc_db->md_depth++;
8015 /* Add left (implicit) pointer. */
8016 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
8017 /* undo the pre-push */
8018 mc->mc_pg[0] = mc->mc_pg[1];
8019 mc->mc_ki[0] = mc->mc_ki[1];
8020 mc->mc_db->md_root = mp->mp_pgno;
8021 mc->mc_db->md_depth--;
8028 ptop = mc->mc_top-1;
8029 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno));
8032 mc->mc_flags |= C_SPLITTING;
8033 mdb_cursor_copy(mc, &mn);
8034 mn.mc_pg[mn.mc_top] = rp;
8035 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
8037 if (nflags & MDB_APPEND) {
8038 mn.mc_ki[mn.mc_top] = 0;
8040 split_indx = newindx;
8044 split_indx = (nkeys+1) / 2;
8049 unsigned int lsize, rsize, ksize;
8050 /* Move half of the keys to the right sibling */
8051 x = mc->mc_ki[mc->mc_top] - split_indx;
8052 ksize = mc->mc_db->md_pad;
8053 split = LEAF2KEY(mp, split_indx, ksize);
8054 rsize = (nkeys - split_indx) * ksize;
8055 lsize = (nkeys - split_indx) * sizeof(indx_t);
8056 mp->mp_lower -= lsize;
8057 rp->mp_lower += lsize;
8058 mp->mp_upper += rsize - lsize;
8059 rp->mp_upper -= rsize - lsize;
8060 sepkey.mv_size = ksize;
8061 if (newindx == split_indx) {
8062 sepkey.mv_data = newkey->mv_data;
8064 sepkey.mv_data = split;
8067 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
8068 memcpy(rp->mp_ptrs, split, rsize);
8069 sepkey.mv_data = rp->mp_ptrs;
8070 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
8071 memcpy(ins, newkey->mv_data, ksize);
8072 mp->mp_lower += sizeof(indx_t);
8073 mp->mp_upper -= ksize - sizeof(indx_t);
8076 memcpy(rp->mp_ptrs, split, x * ksize);
8077 ins = LEAF2KEY(rp, x, ksize);
8078 memcpy(ins, newkey->mv_data, ksize);
8079 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
8080 rp->mp_lower += sizeof(indx_t);
8081 rp->mp_upper -= ksize - sizeof(indx_t);
8082 mc->mc_ki[mc->mc_top] = x;
8083 mc->mc_pg[mc->mc_top] = rp;
8086 int psize, nsize, k;
8087 /* Maximum free space in an empty page */
8088 pmax = env->me_psize - PAGEHDRSZ;
8090 nsize = mdb_leaf_size(env, newkey, newdata);
8092 nsize = mdb_branch_size(env, newkey);
8093 nsize = EVEN(nsize);
8095 /* grab a page to hold a temporary copy */
8096 copy = mdb_page_malloc(mc->mc_txn, 1);
8101 copy->mp_pgno = mp->mp_pgno;
8102 copy->mp_flags = mp->mp_flags;
8103 copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
8104 copy->mp_upper = env->me_psize - PAGEBASE;
8106 /* prepare to insert */
8107 for (i=0, j=0; i<nkeys; i++) {
8109 copy->mp_ptrs[j++] = 0;
8111 copy->mp_ptrs[j++] = mp->mp_ptrs[i];
8114 /* When items are relatively large the split point needs
8115 * to be checked, because being off-by-one will make the
8116 * difference between success or failure in mdb_node_add.
8118 * It's also relevant if a page happens to be laid out
8119 * such that one half of its nodes are all "small" and
8120 * the other half of its nodes are "large." If the new
8121 * item is also "large" and falls on the half with
8122 * "large" nodes, it also may not fit.
8124 * As a final tweak, if the new item goes on the last
8125 * spot on the page (and thus, onto the new page), bias
8126 * the split so the new page is emptier than the old page.
8127 * This yields better packing during sequential inserts.
8129 if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
8130 /* Find split point */
8132 if (newindx <= split_indx || newindx >= nkeys) {
8134 k = newindx >= nkeys ? nkeys : split_indx+2;
8139 for (; i!=k; i+=j) {
8144 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8145 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
8147 if (F_ISSET(node->mn_flags, F_BIGDATA))
8148 psize += sizeof(pgno_t);
8150 psize += NODEDSZ(node);
8152 psize = EVEN(psize);
8154 if (psize > pmax || i == k-j) {
8155 split_indx = i + (j<0);
8160 if (split_indx == newindx) {
8161 sepkey.mv_size = newkey->mv_size;
8162 sepkey.mv_data = newkey->mv_data;
8164 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
8165 sepkey.mv_size = node->mn_ksize;
8166 sepkey.mv_data = NODEKEY(node);
8171 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
8173 /* Copy separator key to the parent.
8175 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
8179 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
8184 if (mn.mc_snum == mc->mc_snum) {
8185 mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
8186 mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
8187 mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
8188 mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
8193 /* Right page might now have changed parent.
8194 * Check if left page also changed parent.
8196 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8197 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8198 for (i=0; i<ptop; i++) {
8199 mc->mc_pg[i] = mn.mc_pg[i];
8200 mc->mc_ki[i] = mn.mc_ki[i];
8202 mc->mc_pg[ptop] = mn.mc_pg[ptop];
8203 if (mn.mc_ki[ptop]) {
8204 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
8206 /* find right page's left sibling */
8207 mc->mc_ki[ptop] = mn.mc_ki[ptop];
8208 mdb_cursor_sibling(mc, 0);
8213 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
8216 mc->mc_flags ^= C_SPLITTING;
8217 if (rc != MDB_SUCCESS) {
8220 if (nflags & MDB_APPEND) {
8221 mc->mc_pg[mc->mc_top] = rp;
8222 mc->mc_ki[mc->mc_top] = 0;
8223 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
8226 for (i=0; i<mc->mc_top; i++)
8227 mc->mc_ki[i] = mn.mc_ki[i];
8228 } else if (!IS_LEAF2(mp)) {
8230 mc->mc_pg[mc->mc_top] = rp;
8235 rkey.mv_data = newkey->mv_data;
8236 rkey.mv_size = newkey->mv_size;
8242 /* Update index for the new key. */
8243 mc->mc_ki[mc->mc_top] = j;
8245 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8246 rkey.mv_data = NODEKEY(node);
8247 rkey.mv_size = node->mn_ksize;
8249 xdata.mv_data = NODEDATA(node);
8250 xdata.mv_size = NODEDSZ(node);
8253 pgno = NODEPGNO(node);
8254 flags = node->mn_flags;
8257 if (!IS_LEAF(mp) && j == 0) {
8258 /* First branch index doesn't need key data. */
8262 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
8268 mc->mc_pg[mc->mc_top] = copy;
8273 } while (i != split_indx);
8275 nkeys = NUMKEYS(copy);
8276 for (i=0; i<nkeys; i++)
8277 mp->mp_ptrs[i] = copy->mp_ptrs[i];
8278 mp->mp_lower = copy->mp_lower;
8279 mp->mp_upper = copy->mp_upper;
8280 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
8281 env->me_psize - copy->mp_upper - PAGEBASE);
8283 /* reset back to original page */
8284 if (newindx < split_indx) {
8285 mc->mc_pg[mc->mc_top] = mp;
8286 if (nflags & MDB_RESERVE) {
8287 node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
8288 if (!(node->mn_flags & F_BIGDATA))
8289 newdata->mv_data = NODEDATA(node);
8292 mc->mc_pg[mc->mc_top] = rp;
8294 /* Make sure mc_ki is still valid.
8296 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8297 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8298 for (i=0; i<=ptop; i++) {
8299 mc->mc_pg[i] = mn.mc_pg[i];
8300 mc->mc_ki[i] = mn.mc_ki[i];
8307 /* Adjust other cursors pointing to mp */
8308 MDB_cursor *m2, *m3;
8309 MDB_dbi dbi = mc->mc_dbi;
8310 int fixup = NUMKEYS(mp);
8312 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8313 if (mc->mc_flags & C_SUB)
8314 m3 = &m2->mc_xcursor->mx_cursor;
8319 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
8321 if (m3->mc_flags & C_SPLITTING)
8326 for (k=m3->mc_top; k>=0; k--) {
8327 m3->mc_ki[k+1] = m3->mc_ki[k];
8328 m3->mc_pg[k+1] = m3->mc_pg[k];
8330 if (m3->mc_ki[0] >= split_indx) {
8335 m3->mc_pg[0] = mc->mc_pg[0];
8339 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
8340 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
8341 m3->mc_ki[mc->mc_top]++;
8342 if (m3->mc_ki[mc->mc_top] >= fixup) {
8343 m3->mc_pg[mc->mc_top] = rp;
8344 m3->mc_ki[mc->mc_top] -= fixup;
8345 m3->mc_ki[ptop] = mn.mc_ki[ptop];
8347 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
8348 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
8353 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
8356 if (copy) /* tmp page */
8357 mdb_page_free(env, copy);
8359 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
8364 mdb_put(MDB_txn *txn, MDB_dbi dbi,
8365 MDB_val *key, MDB_val *data, unsigned int flags)
8370 if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
8373 if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
8376 mdb_cursor_init(&mc, txn, dbi, &mx);
8377 return mdb_cursor_put(&mc, key, data, flags);
8381 #define MDB_WBUF (1024*1024)
8384 /** State needed for a compacting copy. */
8385 typedef struct mdb_copy {
8386 pthread_mutex_t mc_mutex;
8387 pthread_cond_t mc_cond;
8394 pgno_t mc_next_pgno;
8397 volatile int mc_new;
8402 /** Dedicated writer thread for compacting copy. */
8403 static THREAD_RET ESECT
8404 mdb_env_copythr(void *arg)
8408 int toggle = 0, wsize, rc;
8411 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8414 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8417 pthread_mutex_lock(&my->mc_mutex);
8419 pthread_cond_signal(&my->mc_cond);
8422 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8423 if (my->mc_new < 0) {
8428 wsize = my->mc_wlen[toggle];
8429 ptr = my->mc_wbuf[toggle];
8432 DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
8436 } else if (len > 0) {
8450 /* If there's an overflow page tail, write it too */
8451 if (my->mc_olen[toggle]) {
8452 wsize = my->mc_olen[toggle];
8453 ptr = my->mc_over[toggle];
8454 my->mc_olen[toggle] = 0;
8457 my->mc_wlen[toggle] = 0;
8459 pthread_cond_signal(&my->mc_cond);
8461 pthread_cond_signal(&my->mc_cond);
8462 pthread_mutex_unlock(&my->mc_mutex);
8463 return (THREAD_RET)0;
8467 /** Tell the writer thread there's a buffer ready to write */
8469 mdb_env_cthr_toggle(mdb_copy *my, int st)
8471 int toggle = my->mc_toggle ^ 1;
8472 pthread_mutex_lock(&my->mc_mutex);
8473 if (my->mc_status) {
8474 pthread_mutex_unlock(&my->mc_mutex);
8475 return my->mc_status;
8477 while (my->mc_new == 1)
8478 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8480 my->mc_toggle = toggle;
8481 pthread_cond_signal(&my->mc_cond);
8482 pthread_mutex_unlock(&my->mc_mutex);
8486 /** Depth-first tree traversal for compacting copy. */
8488 mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
8491 MDB_txn *txn = my->mc_txn;
8493 MDB_page *mo, *mp, *leaf;
8498 /* Empty DB, nothing to do */
8499 if (*pg == P_INVALID)
8506 rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL);
8509 rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
8513 /* Make cursor pages writable */
8514 buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
8518 for (i=0; i<mc.mc_top; i++) {
8519 mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
8520 mc.mc_pg[i] = (MDB_page *)ptr;
8521 ptr += my->mc_env->me_psize;
8524 /* This is writable space for a leaf page. Usually not needed. */
8525 leaf = (MDB_page *)ptr;
8527 toggle = my->mc_toggle;
8528 while (mc.mc_snum > 0) {
8530 mp = mc.mc_pg[mc.mc_top];
8534 if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
8535 for (i=0; i<n; i++) {
8536 ni = NODEPTR(mp, i);
8537 if (ni->mn_flags & F_BIGDATA) {
8541 /* Need writable leaf */
8543 mc.mc_pg[mc.mc_top] = leaf;
8544 mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8546 ni = NODEPTR(mp, i);
8549 memcpy(&pg, NODEDATA(ni), sizeof(pg));
8550 rc = mdb_page_get(txn, pg, &omp, NULL);
8553 if (my->mc_wlen[toggle] >= MDB_WBUF) {
8554 rc = mdb_env_cthr_toggle(my, 1);
8557 toggle = my->mc_toggle;
8559 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8560 memcpy(mo, omp, my->mc_env->me_psize);
8561 mo->mp_pgno = my->mc_next_pgno;
8562 my->mc_next_pgno += omp->mp_pages;
8563 my->mc_wlen[toggle] += my->mc_env->me_psize;
8564 if (omp->mp_pages > 1) {
8565 my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
8566 my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
8567 rc = mdb_env_cthr_toggle(my, 1);
8570 toggle = my->mc_toggle;
8572 memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
8573 } else if (ni->mn_flags & F_SUBDATA) {
8576 /* Need writable leaf */
8578 mc.mc_pg[mc.mc_top] = leaf;
8579 mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8581 ni = NODEPTR(mp, i);
8584 memcpy(&db, NODEDATA(ni), sizeof(db));
8585 my->mc_toggle = toggle;
8586 rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
8589 toggle = my->mc_toggle;
8590 memcpy(NODEDATA(ni), &db, sizeof(db));
8595 mc.mc_ki[mc.mc_top]++;
8596 if (mc.mc_ki[mc.mc_top] < n) {
8599 ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
8601 rc = mdb_page_get(txn, pg, &mp, NULL);
8606 mc.mc_ki[mc.mc_top] = 0;
8607 if (IS_BRANCH(mp)) {
8608 /* Whenever we advance to a sibling branch page,
8609 * we must proceed all the way down to its first leaf.
8611 mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
8614 mc.mc_pg[mc.mc_top] = mp;
8618 if (my->mc_wlen[toggle] >= MDB_WBUF) {
8619 rc = mdb_env_cthr_toggle(my, 1);
8622 toggle = my->mc_toggle;
8624 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8625 mdb_page_copy(mo, mp, my->mc_env->me_psize);
8626 mo->mp_pgno = my->mc_next_pgno++;
8627 my->mc_wlen[toggle] += my->mc_env->me_psize;
8629 /* Update parent if there is one */
8630 ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
8631 SETPGNO(ni, mo->mp_pgno);
8632 mdb_cursor_pop(&mc);
8634 /* Otherwise we're done */
8644 /** Copy environment with compaction. */
8646 mdb_env_copyfd1(MDB_env *env, HANDLE fd)
8651 MDB_txn *txn = NULL;
8656 my.mc_mutex = CreateMutex(NULL, FALSE, NULL);
8657 my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL);
8658 my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
8659 if (my.mc_wbuf[0] == NULL)
8662 pthread_mutex_init(&my.mc_mutex, NULL);
8663 pthread_cond_init(&my.mc_cond, NULL);
8664 #ifdef HAVE_MEMALIGN
8665 my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
8666 if (my.mc_wbuf[0] == NULL)
8669 rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2);
8674 memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
8675 my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
8680 my.mc_next_pgno = 2;
8686 THREAD_CREATE(thr, mdb_env_copythr, &my);
8688 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8692 mp = (MDB_page *)my.mc_wbuf[0];
8693 memset(mp, 0, 2*env->me_psize);
8695 mp->mp_flags = P_META;
8696 mm = (MDB_meta *)METADATA(mp);
8697 mdb_env_init_meta0(env, mm);
8698 mm->mm_address = env->me_metas[0]->mm_address;
8700 mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
8702 mp->mp_flags = P_META;
8703 *(MDB_meta *)METADATA(mp) = *mm;
8704 mm = (MDB_meta *)METADATA(mp);
8706 /* Count the number of free pages, subtract from lastpg to find
8707 * number of active pages
8710 MDB_ID freecount = 0;
8713 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
8714 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
8715 freecount += *(MDB_ID *)data.mv_data;
8716 freecount += txn->mt_dbs[0].md_branch_pages +
8717 txn->mt_dbs[0].md_leaf_pages +
8718 txn->mt_dbs[0].md_overflow_pages;
8720 /* Set metapage 1 */
8721 mm->mm_last_pg = txn->mt_next_pgno - freecount - 1;
8722 mm->mm_dbs[1] = txn->mt_dbs[1];
8723 if (mm->mm_last_pg > 1) {
8724 mm->mm_dbs[1].md_root = mm->mm_last_pg;
8727 mm->mm_dbs[1].md_root = P_INVALID;
8730 my.mc_wlen[0] = env->me_psize * 2;
8732 pthread_mutex_lock(&my.mc_mutex);
8734 pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8735 pthread_mutex_unlock(&my.mc_mutex);
8736 rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0);
8737 if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
8738 rc = mdb_env_cthr_toggle(&my, 1);
8739 mdb_env_cthr_toggle(&my, -1);
8740 pthread_mutex_lock(&my.mc_mutex);
8742 pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8743 pthread_mutex_unlock(&my.mc_mutex);
8748 CloseHandle(my.mc_cond);
8749 CloseHandle(my.mc_mutex);
8750 _aligned_free(my.mc_wbuf[0]);
8752 pthread_cond_destroy(&my.mc_cond);
8753 pthread_mutex_destroy(&my.mc_mutex);
8754 free(my.mc_wbuf[0]);
8759 /** Copy environment as-is. */
8761 mdb_env_copyfd0(MDB_env *env, HANDLE fd)
8763 MDB_txn *txn = NULL;
8764 mdb_mutex_t *wmutex = NULL;
8770 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8774 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8777 /* Do the lock/unlock of the reader mutex before starting the
8778 * write txn. Otherwise other read txns could block writers.
8780 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8785 /* We must start the actual read txn after blocking writers */
8786 mdb_txn_reset0(txn, "reset-stage1");
8788 /* Temporarily block writers until we snapshot the meta pages */
8789 wmutex = MDB_MUTEX(env, w);
8790 if (LOCK_MUTEX(rc, env, wmutex))
8793 rc = mdb_txn_renew0(txn);
8795 UNLOCK_MUTEX(wmutex);
8800 wsize = env->me_psize * 2;
8804 DO_WRITE(rc, fd, ptr, w2, len);
8808 } else if (len > 0) {
8814 /* Non-blocking or async handles are not supported */
8820 UNLOCK_MUTEX(wmutex);
8825 w2 = txn->mt_next_pgno * env->me_psize;
8828 if ((rc = mdb_fsize(env->me_fd, &fsize)))
8835 if (wsize > MAX_WRITE)
8839 DO_WRITE(rc, fd, ptr, w2, len);
8843 } else if (len > 0) {
8860 mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
8862 if (flags & MDB_CP_COMPACT)
8863 return mdb_env_copyfd1(env, fd);
8865 return mdb_env_copyfd0(env, fd);
8869 mdb_env_copyfd(MDB_env *env, HANDLE fd)
8871 return mdb_env_copyfd2(env, fd, 0);
8875 mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
8879 HANDLE newfd = INVALID_HANDLE_VALUE;
8881 if (env->me_flags & MDB_NOSUBDIR) {
8882 lpath = (char *)path;
8885 len += sizeof(DATANAME);
8886 lpath = malloc(len);
8889 sprintf(lpath, "%s" DATANAME, path);
8892 /* The destination path must exist, but the destination file must not.
8893 * We don't want the OS to cache the writes, since the source data is
8894 * already in the OS cache.
8897 newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
8898 FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
8900 newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
8902 if (newfd == INVALID_HANDLE_VALUE) {
8907 if (env->me_psize >= env->me_os_psize) {
8909 /* Set O_DIRECT if the file system supports it */
8910 if ((rc = fcntl(newfd, F_GETFL)) != -1)
8911 (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
8913 #ifdef F_NOCACHE /* __APPLE__ */
8914 rc = fcntl(newfd, F_NOCACHE, 1);
8922 rc = mdb_env_copyfd2(env, newfd, flags);
8925 if (!(env->me_flags & MDB_NOSUBDIR))
8927 if (newfd != INVALID_HANDLE_VALUE)
8928 if (close(newfd) < 0 && rc == MDB_SUCCESS)
8935 mdb_env_copy(MDB_env *env, const char *path)
8937 return mdb_env_copy2(env, path, 0);
8941 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8943 if (flag & (env->me_map ? ~CHANGEABLE : ~(CHANGEABLE|CHANGELESS)))
8946 env->me_flags |= flag;
8948 env->me_flags &= ~flag;
8953 mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8958 *arg = env->me_flags;
8963 mdb_env_set_userctx(MDB_env *env, void *ctx)
8967 env->me_userctx = ctx;
8972 mdb_env_get_userctx(MDB_env *env)
8974 return env ? env->me_userctx : NULL;
8978 mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8983 env->me_assert_func = func;
8989 mdb_env_get_path(MDB_env *env, const char **arg)
8994 *arg = env->me_path;
8999 mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
9008 /** Common code for #mdb_stat() and #mdb_env_stat().
9009 * @param[in] env the environment to operate in.
9010 * @param[in] db the #MDB_db record containing the stats to return.
9011 * @param[out] arg the address of an #MDB_stat structure to receive the stats.
9012 * @return 0, this function always succeeds.
9015 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
9017 arg->ms_psize = env->me_psize;
9018 arg->ms_depth = db->md_depth;
9019 arg->ms_branch_pages = db->md_branch_pages;
9020 arg->ms_leaf_pages = db->md_leaf_pages;
9021 arg->ms_overflow_pages = db->md_overflow_pages;
9022 arg->ms_entries = db->md_entries;
9028 mdb_env_stat(MDB_env *env, MDB_stat *arg)
9032 if (env == NULL || arg == NULL)
9035 toggle = mdb_env_pick_meta(env);
9037 return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
9041 mdb_env_info(MDB_env *env, MDB_envinfo *arg)
9045 if (env == NULL || arg == NULL)
9048 toggle = mdb_env_pick_meta(env);
9049 arg->me_mapaddr = env->me_metas[toggle]->mm_address;
9050 arg->me_mapsize = env->me_mapsize;
9051 arg->me_maxreaders = env->me_maxreaders;
9053 /* me_numreaders may be zero if this process never used any readers. Use
9054 * the shared numreader count if it exists.
9056 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders;
9058 arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
9059 arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
9063 /** Set the default comparison functions for a database.
9064 * Called immediately after a database is opened to set the defaults.
9065 * The user can then override them with #mdb_set_compare() or
9066 * #mdb_set_dupsort().
9067 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
9068 * @param[in] dbi A database handle returned by #mdb_dbi_open()
9071 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
9073 uint16_t f = txn->mt_dbs[dbi].md_flags;
9075 txn->mt_dbxs[dbi].md_cmp =
9076 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
9077 (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn;
9079 txn->mt_dbxs[dbi].md_dcmp =
9080 !(f & MDB_DUPSORT) ? 0 :
9081 ((f & MDB_INTEGERDUP)
9082 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint)
9083 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
9086 int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
9092 int rc, dbflag, exact;
9093 unsigned int unused = 0, seq;
9096 if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
9097 mdb_default_cmp(txn, FREE_DBI);
9100 if ((flags & VALID_FLAGS) != flags)
9102 if (txn->mt_flags & MDB_TXN_ERROR)
9108 if (flags & PERSISTENT_FLAGS) {
9109 uint16_t f2 = flags & PERSISTENT_FLAGS;
9110 /* make sure flag changes get committed */
9111 if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
9112 txn->mt_dbs[MAIN_DBI].md_flags |= f2;
9113 txn->mt_flags |= MDB_TXN_DIRTY;
9116 mdb_default_cmp(txn, MAIN_DBI);
9120 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
9121 mdb_default_cmp(txn, MAIN_DBI);
9124 /* Is the DB already open? */
9126 for (i=2; i<txn->mt_numdbs; i++) {
9127 if (!txn->mt_dbxs[i].md_name.mv_size) {
9128 /* Remember this free slot */
9129 if (!unused) unused = i;
9132 if (len == txn->mt_dbxs[i].md_name.mv_size &&
9133 !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
9139 /* If no free slot and max hit, fail */
9140 if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
9141 return MDB_DBS_FULL;
9143 /* Cannot mix named databases with some mainDB flags */
9144 if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
9145 return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;
9147 /* Find the DB info */
9148 dbflag = DB_NEW|DB_VALID;
9151 key.mv_data = (void *)name;
9152 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
9153 rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
9154 if (rc == MDB_SUCCESS) {
9155 /* make sure this is actually a DB */
9156 MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
9157 if (!(node->mn_flags & F_SUBDATA))
9158 return MDB_INCOMPATIBLE;
9159 } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
9160 /* Create if requested */
9161 data.mv_size = sizeof(MDB_db);
9162 data.mv_data = &dummy;
9163 memset(&dummy, 0, sizeof(dummy));
9164 dummy.md_root = P_INVALID;
9165 dummy.md_flags = flags & PERSISTENT_FLAGS;
9166 rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
9170 /* OK, got info, add to table */
9171 if (rc == MDB_SUCCESS) {
9172 unsigned int slot = unused ? unused : txn->mt_numdbs;
9173 txn->mt_dbxs[slot].md_name.mv_data = strdup(name);
9174 txn->mt_dbxs[slot].md_name.mv_size = len;
9175 txn->mt_dbxs[slot].md_rel = NULL;
9176 txn->mt_dbflags[slot] = dbflag;
9177 /* txn-> and env-> are the same in read txns, use
9178 * tmp variable to avoid undefined assignment
9180 seq = ++txn->mt_env->me_dbiseqs[slot];
9181 txn->mt_dbiseqs[slot] = seq;
9183 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
9185 mdb_default_cmp(txn, slot);
9194 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
9196 if (!arg || !TXN_DBI_EXIST(txn, dbi))
9199 if (txn->mt_flags & MDB_TXN_ERROR)
9202 if (txn->mt_dbflags[dbi] & DB_STALE) {
9205 /* Stale, must read the DB's root. cursor_init does it for us. */
9206 mdb_cursor_init(&mc, txn, dbi, &mx);
9208 return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
9211 void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
9214 if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
9216 ptr = env->me_dbxs[dbi].md_name.mv_data;
9217 /* If there was no name, this was already closed */
9219 env->me_dbxs[dbi].md_name.mv_data = NULL;
9220 env->me_dbxs[dbi].md_name.mv_size = 0;
9221 env->me_dbflags[dbi] = 0;
9222 env->me_dbiseqs[dbi]++;
9227 int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
9229 /* We could return the flags for the FREE_DBI too but what's the point? */
9230 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9232 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
9236 /** Add all the DB's pages to the free list.
9237 * @param[in] mc Cursor on the DB to free.
9238 * @param[in] subs non-Zero to check for sub-DBs in this DB.
9239 * @return 0 on success, non-zero on failure.
9242 mdb_drop0(MDB_cursor *mc, int subs)
9246 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
9247 if (rc == MDB_SUCCESS) {
9248 MDB_txn *txn = mc->mc_txn;
9253 /* LEAF2 pages have no nodes, cannot have sub-DBs */
9254 if (IS_LEAF2(mc->mc_pg[mc->mc_top]))
9257 mdb_cursor_copy(mc, &mx);
9258 while (mc->mc_snum > 0) {
9259 MDB_page *mp = mc->mc_pg[mc->mc_top];
9260 unsigned n = NUMKEYS(mp);
9262 for (i=0; i<n; i++) {
9263 ni = NODEPTR(mp, i);
9264 if (ni->mn_flags & F_BIGDATA) {
9267 memcpy(&pg, NODEDATA(ni), sizeof(pg));
9268 rc = mdb_page_get(txn, pg, &omp, NULL);
9271 mdb_cassert(mc, IS_OVERFLOW(omp));
9272 rc = mdb_midl_append_range(&txn->mt_free_pgs,
9276 } else if (subs && (ni->mn_flags & F_SUBDATA)) {
9277 mdb_xcursor_init1(mc, ni);
9278 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
9284 if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
9286 for (i=0; i<n; i++) {
9288 ni = NODEPTR(mp, i);
9291 mdb_midl_xappend(txn->mt_free_pgs, pg);
9296 mc->mc_ki[mc->mc_top] = i;
9297 rc = mdb_cursor_sibling(mc, 1);
9299 if (rc != MDB_NOTFOUND)
9301 /* no more siblings, go back to beginning
9302 * of previous level.
9306 for (i=1; i<mc->mc_snum; i++) {
9308 mc->mc_pg[i] = mx.mc_pg[i];
9313 rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
9316 txn->mt_flags |= MDB_TXN_ERROR;
9317 } else if (rc == MDB_NOTFOUND) {
9323 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
9325 MDB_cursor *mc, *m2;
9328 if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9331 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
9334 if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi))
9337 rc = mdb_cursor_open(txn, dbi, &mc);
9341 rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
9342 /* Invalidate the dropped DB's cursors */
9343 for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
9344 m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
9348 /* Can't delete the main DB */
9349 if (del && dbi > MAIN_DBI) {
9350 rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0);
9352 txn->mt_dbflags[dbi] = DB_STALE;
9353 mdb_dbi_close(txn->mt_env, dbi);
9355 txn->mt_flags |= MDB_TXN_ERROR;
9358 /* reset the DB record, mark it dirty */
9359 txn->mt_dbflags[dbi] |= DB_DIRTY;
9360 txn->mt_dbs[dbi].md_depth = 0;
9361 txn->mt_dbs[dbi].md_branch_pages = 0;
9362 txn->mt_dbs[dbi].md_leaf_pages = 0;
9363 txn->mt_dbs[dbi].md_overflow_pages = 0;
9364 txn->mt_dbs[dbi].md_entries = 0;
9365 txn->mt_dbs[dbi].md_root = P_INVALID;
9367 txn->mt_flags |= MDB_TXN_DIRTY;
9370 mdb_cursor_close(mc);
9374 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
9376 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9379 txn->mt_dbxs[dbi].md_cmp = cmp;
9383 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
9385 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9388 txn->mt_dbxs[dbi].md_dcmp = cmp;
9392 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
9394 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9397 txn->mt_dbxs[dbi].md_rel = rel;
9401 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
9403 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9406 txn->mt_dbxs[dbi].md_relctx = ctx;
9411 mdb_env_get_maxkeysize(MDB_env *env)
9413 return ENV_MAXKEY(env);
9417 mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
9419 unsigned int i, rdrs;
9422 int rc = 0, first = 1;
9426 if (!env->me_txns) {
9427 return func("(no reader locks)\n", ctx);
9429 rdrs = env->me_txns->mti_numreaders;
9430 mr = env->me_txns->mti_readers;
9431 for (i=0; i<rdrs; i++) {
9433 txnid_t txnid = mr[i].mr_txnid;
9434 sprintf(buf, txnid == (txnid_t)-1 ?
9435 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n",
9436 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
9439 rc = func(" pid thread txnid\n", ctx);
9443 rc = func(buf, ctx);
9449 rc = func("(no active readers)\n", ctx);
9454 /** Insert pid into list if not already present.
9455 * return -1 if already present.
9458 mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
9460 /* binary search of pid in list */
9462 unsigned cursor = 1;
9464 unsigned n = ids[0];
9467 unsigned pivot = n >> 1;
9468 cursor = base + pivot + 1;
9469 val = pid - ids[cursor];
9474 } else if ( val > 0 ) {
9479 /* found, so it's a duplicate */
9488 for (n = ids[0]; n > cursor; n--)
9495 mdb_reader_check(MDB_env *env, int *dead)
9501 return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS;
9504 /** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */
9505 static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
9507 mdb_mutex_t *rmutex = rlocked ? NULL : MDB_MUTEX(env, r);
9508 unsigned int i, j, rdrs;
9510 MDB_PID_T *pids, pid;
9511 int rc = MDB_SUCCESS, count = 0;
9513 rdrs = env->me_txns->mti_numreaders;
9514 pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
9518 mr = env->me_txns->mti_readers;
9519 for (i=0; i<rdrs; i++) {
9521 if (pid && pid != env->me_pid) {
9522 if (mdb_pid_insert(pids, pid) == 0) {
9523 if (!mdb_reader_pid(env, Pidcheck, pid)) {
9524 /* Stale reader found */
9527 if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
9528 if ((rc = mdb_mutex_failed(env, rmutex, rc)))
9530 rdrs = 0; /* the above checked all readers */
9532 /* Recheck, a new process may have reused pid */
9533 if (mdb_reader_pid(env, Pidcheck, pid))
9538 if (mr[j].mr_pid == pid) {
9539 DPRINTF(("clear stale reader pid %u txn %"Z"d",
9540 (unsigned) pid, mr[j].mr_txnid));
9545 UNLOCK_MUTEX(rmutex);
9556 #ifdef MDB_ROBUST_SUPPORTED
9557 /** Handle #LOCK_MUTEX0() failure.
9558 * With #MDB_ROBUST, try to repair the lock file if the mutex owner died.
9559 * @param[in] env the environment handle
9560 * @param[in] mutex LOCK_MUTEX0() mutex
9561 * @param[in] rc LOCK_MUTEX0() error (nonzero)
9562 * @return 0 on success with the mutex locked, or an error code on failure.
9564 static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc)
9566 int toggle, rlocked, rc2;
9568 enum { WAIT_ABANDONED = EOWNERDEAD };
9571 if (rc == (int) WAIT_ABANDONED) {
9572 /* We own the mutex. Clean up after dead previous owner. */
9574 rlocked = (mutex == MDB_MUTEX(env, r));
9576 /* Keep mti_txnid updated, otherwise next writer can
9577 * overwrite data which latest meta page refers to.
9579 toggle = mdb_env_pick_meta(env);
9580 env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
9581 /* env is hosed if the dead thread was ours */
9583 env->me_flags |= MDB_FATAL_ERROR;
9588 DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
9589 (rc ? "this process' env is hosed" : "recovering")));
9590 rc2 = mdb_reader_check0(env, rlocked, NULL);
9592 rc2 = pthread_mutex_consistent(mutex);
9593 if (rc || (rc = rc2)) {
9594 DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
9595 UNLOCK_MUTEX(mutex);
9601 DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
9606 #endif /* MDB_ROBUST_SUPPORTED */