2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
8 * This utility choses a reasonable pagesize for a BTREE database.
10 * Here we assume that:
11 * 1) This set of records are already in a BTREE database, which may been
12 * configured with a unreasonable page size.
13 * 2) Treat the database as if it were compacted.
14 * 3) The goal is to optimize the database for the current content,
15 * rather than for ongoing insertions.
17 * The page size of a BTREE can be 512, 1024, 2048, 4096, 8192, 16384,
18 * 32768, 65536, totally 8 different cases. So we have 8 possible BTREE,
19 * each with different pagesize to contain them. Without actually creating
20 * those 8 databases, this utility tries to simulate situations, that is,
21 * for each pagesize, how many leaf pages, over flow pages and duplicate
22 * pages are needed, and what's the distribution of each kind of pages
23 * based on their fill factor.
25 * db_tuner contains 2 parts:
27 * I) Simulation of 8 different pagesized databases.
28 * This includes, the number of leaf pages, overflow pages caused by
29 * big key/data in leaf layers, and duplicate pages, the distribution
30 * of each kind of pages in different fill factor ranges.
32 * This is achieved by retrieving those records from existing btree and
33 * inserting them into different kind of pages. Since the records from
34 * the btree are sorted, they are inserted into the end of each page.
35 * If this page become full, that is no enough space, only this new
36 * record will be put into next new page.
38 * II) Recommend the best page size.
39 * From our simulation results, this utility choose a page size based on
40 * the number of overflow pages and storage (on-disk space).
41 * If there is no overflow pages, then choose the one resulting in
42 * the smallest storage as the recommended page size. Otherwise,
43 * choose the one that results in a reasonable small number of overflow pages.
46 #include "db_config.h"
52 #include "dbinc/db_page.h"
53 #include "dbinc/btree.h"
56 static const char copyright[] =
57 "Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.\n";
61 * Fill factor distribution division,
62 * e.g., [0.000 - 0.099], ..., [0.900 - 0.999], [1.000- 1.000]
64 #define DIST_DIVISION 11
66 /* Error return code in db_tuner, different with those in src\dbinc\db.in. */
67 /* Dist >= DIST_DIVISION. */
68 #define EXIT_DIST_OUTRANGE (-31000)
69 /* "Insert" zero needed. */
70 #define EXIT_INSERT_ZERO_NEED (-31001)
71 /* On-page duplicate set = 0. */
72 #define EXIT_INSERT_ZERO_ONPGDUP (-31002)
74 /* Follows are some special "insert" types based on the data. */
75 /* Insert as normal case. */
76 #define INSERT_NORMAL 0x0001
77 /* Nothing is inserted. */
78 #define INSERT_NOTHING 0x0002
79 /* No key but a slot is inserted. */
80 #define INSERT_SLOT 0x0003
81 /* A B_DUPLICATE point is inserted. */
82 #define INSERT_BDUPLICATE 0x0004
85 * Page size of BTREE can be from DB_MIN_PGSIZE (512) to DB_MAX_PGSIZE(64K),
86 * and all the pagesize is the power of 2, so we have 8 possible cases.
88 * 8 is from ((int)(log(DB_MAX_PGSIZE) - log(DB_MIN_PGSIZE) + 1)).
92 /* Structure used to store statistics of the assessment. */
93 typedef struct __tuner_ff_stat {
94 uintmax_t pgsize_leaf_dist[NUM_PGSIZES][DIST_DIVISION];
95 uintmax_t pgsize_ovfl_dist[NUM_PGSIZES][DIST_DIVISION];
96 uintmax_t pgsize_dup_dist[NUM_PGSIZES][DIST_DIVISION];
98 /* Info used to track stats across page in a traverse. */
99 u_int32_t pg_leaf_offset[NUM_PGSIZES];
100 u_int32_t pg_dup_offset[NUM_PGSIZES];
103 static int __tuner_analyze_btree __P((DB_ENV *, DB *, u_int32_t));
104 static int __tuner_ff_stat_callback __P((DBC *, PAGE *, void *, int *));
105 static int __tuner_generate_fillfactor_stats __P((DB_ENV *, DB *,
107 static int __tuner_insert_dupdata __P((DB *, u_int32_t, int, TUNER_FF_STAT *));
108 static int __tuner_insert_kvpair __P((DB *, u_int32_t, u_int32_t, int, int,
109 int, TUNER_FF_STAT *));
110 static int __tuner_leaf_page __P((DBC *, PAGE *, TUNER_FF_STAT *));
111 static int __tuner_leaf_dupdata __P((DBC*, PAGE *, int, int, u_int32_t,
113 static int __tuner_leaf_dupdata_entries __P((DBC *, PAGE *, int, int, int, int,
115 static int __tuner_opd_data_entries __P((DBC *, PAGE *, int, int,
117 static int __tuner_opd_data __P((DBC *, PAGE *, int, int, TUNER_FF_STAT *));
118 static int __tuner_opd_page __P((DBC *, PAGE *, TUNER_FF_STAT *));
119 static int __tuner_print_btree_fillfactor __P((u_int32_t, TUNER_FF_STAT *));
120 static int __tuner_record_dup_pg __P((int, TUNER_FF_STAT *));
121 static int __tuner_record_last_opd __P((int, TUNER_FF_STAT *));
122 static int __tuner_record_leaf_pg __P((int, TUNER_FF_STAT *));
123 static int __tuner_record_ovfl_pg __P((u_int32_t, int, TUNER_FF_STAT *));
125 static int get_opd_size __P((DBC*, PAGE*, u_int32_t*));
126 static int item_size __P((DB *, PAGE *, db_indx_t));
127 static int item_space __P((DB *, PAGE *, db_indx_t));
128 int main __P((int, char *[]));
129 static int open_db __P((DB **, DB_ENV *, char *, char *));
130 static int sum_opd_page_data_entries __P((DB *, PAGE *));
131 static int usage __P((void));
132 static int version_check __P((void));
134 const char *progname = "db_tuner";
145 char *dbname, *home, *subdb;
146 int ch, is_set_dbfile, ret;
147 u_int32_t cachesize, verbose;
149 if ((ret = version_check()) != 0)
155 dbname = home = subdb = NULL;
156 is_set_dbfile = verbose = 0;
159 while ((ch = getopt(argc, argv, "c:d:h:vs:")) != EOF)
162 cachesize = atoi(optarg);
181 /* Handle possible interruptions. */
187 if ((ret = db_env_create(&dbenv, 0)) != 0) {
188 fprintf(stderr, "%s: db_env_create: %s\n",
189 progname, db_strerror(ret));
193 dbenv->set_errfile(dbenv, stderr);
194 dbenv->set_errpfx(dbenv, progname);
196 if ((cachesize != 0) && (ret =
197 dbenv->set_cachesize(dbenv, (u_int32_t)0, cachesize, 1)) != 0) {
198 dbenv->err(dbenv, ret, "DB_ENV->set_cachesize:");
203 * If attaching to a pre-existing environment fails, create a
204 * private one and try again.
206 if ((ret = dbenv->open(dbenv, home, DB_USE_ENVIRON, 0)) != 0 &&
207 (ret == DB_VERSION_MISMATCH || (ret = dbenv->open(dbenv, home,
208 DB_CREATE | DB_INIT_MPOOL | DB_USE_ENVIRON | DB_PRIVATE,
210 dbenv->err(dbenv, ret, "DB_ENV->open:");
214 if ((ret = open_db(&dbp, dbenv, dbname, subdb)) != 0) {
215 dbenv->err(dbenv, ret, "open_db:");
219 if ((ret = dbp->get_type(dbp, &dbtype)) != 0) {
220 dbenv->err(dbenv, ret, "DB->get_type:");
226 if ((ret = __tuner_analyze_btree(dbenv, dbp, verbose)) != 0)
227 dbenv->err(dbenv, ret, "__tuner_analyze_btree fails.");
230 dbenv->errx(dbenv, DB_STR("5001",
231 "%s: Unsupported database type"), progname);
235 if (dbp != NULL && (ret = dbp->close(dbp, 0)) != 0)
236 dbenv->err(dbenv, ret, "DB->close: %s", dbname);
238 if (dbenv != NULL && (ret = dbenv->close(dbenv, 0)) != 0)
239 fprintf(stderr, "%s: dbenv->close: %s", progname,
242 /* Resend any caught signal. */
243 __db_util_sigresend();
245 return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
249 * Generate the simulated statistics for each different btree pagesize,
250 * then print out this information if verbose enabled, finally make our
251 * recommendation of our best pagesize based on our simulated results.
254 __tuner_analyze_btree(dbenv, dbp, verbose)
262 memset(&stats, 0, sizeof(TUNER_FF_STAT));
264 if ((ret = __tuner_generate_fillfactor_stats(dbenv, dbp,
266 dbenv->err(dbenv, ret,
267 "__tuner_generate_fillfactor_stats fails.");
271 (void)__tuner_print_btree_fillfactor(verbose, &stats);
273 return (EXIT_SUCCESS);
276 /* Traverse the database to gather simulated statistics for each pagesize.*/
278 __tuner_generate_fillfactor_stats(dbenv, dbp, stats)
281 TUNER_FF_STAT *stats;
288 if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) {
289 dbenv->err(dbenv, ret, "DB_ENV->cursor:");
294 * Call the internal Berkeley DB function, that triggers a callback
295 * for each page in a btree database.
297 if ((ret = __bam_traverse(dbc, DB_LOCK_READ, PGNO_INVALID,
298 __tuner_ff_stat_callback, (void *)stats)) != 0) {
299 dbenv->err(dbenv, ret, "__bam_traverse:");
304 * Record the last simulated page for leaf and dup page,
305 * which ensure at least one page is used.
307 for (i = 0; i < NUM_PGSIZES; ++i) {
308 if (stats->pg_leaf_offset[i] > 0 &&
309 (ret = __tuner_record_leaf_pg(i, stats)) != 0) {
310 dbenv->err(dbenv, ret, "__tuner_record_leaf");
314 if (stats->pg_dup_offset[i] > 0 &&
315 (ret = __tuner_record_dup_pg(i, stats)) != 0) {
316 dbenv->err(dbenv, ret, "__tuner_record_dup_pg");
322 if(dbc != NULL && (t_ret = dbc->close(dbc)) != 0)
323 dbenv->err(dbenv, t_ret, "DBC->close:");
325 if (ret == 0 && t_ret != 0)
332 * This callback is used in __bam_traverse. When traversing each page in
333 * the BTREE, it retrieves each record for simulation.
336 __tuner_ff_stat_callback(dbc, h, cookie, putp)
350 if ((ret = __tuner_leaf_page(dbc, h, cookie)) != 0) {
351 dbenv->err(dbenv, ret, "__tuner_leaf_page");
357 /* Coming a new off-page duplicate set.*/
358 if (h->prev_pgno == PGNO_INVALID &&
359 (ret = __tuner_opd_page(dbc, h, cookie)) != 0) {
360 dbenv->err(dbenv, ret, "__tuner_opd_page:");
369 return (EXIT_FAILURE);
372 return (EXIT_SUCCESS);
376 * Deal with the leaf page of existing database. This includes:
377 * 1: determine the on-page duplicate set, and calculate its total size
378 * 2: decise where should this set go (on-page or off-page) in the later
379 * simulation stage and do some "movement".
380 * 3: "move" the unique key data pairs to the simulated leaf pages.
383 __tuner_leaf_page(dbc, h, cookie)
386 TUNER_FF_STAT *cookie;
390 db_indx_t findx, lindx, indx, *inp, top;
391 u_int32_t data_sz, key_sz, onpd_sz;
392 int i, ret, in_data_type;
397 * Use some macros from db_page.h to retrieve information from the
398 * page. P_INP retrieves the offset to the start of page index array.
399 * NUM_ENT retrieves the number of items on the page.
405 for (indx = 0; indx < top;) {
407 * If on-page duplicate, first calculate the total size,
408 * including one key and all data.
411 if ((indx + P_INDX) < top && inp[indx] == inp[indx + P_INDX]) {
413 /* Ignore deleted items. */
414 if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
417 /* Count the key once. */
418 onpd_sz += item_space(dbp, h, indx);
421 indx < top && inp[findx] == inp[indx];
423 /* Ignore deleted items. */
424 if (B_DISSET(GET_BKEYDATA(dbp, h,
425 indx + O_INDX)->type))
427 /* Count all the data items. */
428 onpd_sz += item_space(dbp, h, indx + O_INDX);
431 /*Indx range of on-page duplicate set: [findx, lindx)*/
435 return (EXIT_INSERT_ZERO_ONPGDUP);
437 /* "Move" on-page duplicate set to simualted pages.*/
438 if ((ret = __tuner_leaf_dupdata(dbc, h, findx, lindx,
439 onpd_sz, cookie)) != 0) {
440 dbenv->err(dbenv, ret, "__tuner_leaf_dupdata");
444 in_data_type = INSERT_NORMAL;
445 /* Ignore deleted items. */
446 if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
449 /* First consider key. */
450 key_sz = item_size(dbp, h, indx);
452 /* Ignore deleted items. */
453 if (B_DISSET(GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
456 /* next consider data.*/
457 if (B_TYPE(GET_BKEYDATA(dbp, h,
458 indx + O_INDX)->type) == B_DUPLICATE) {
460 * Off-page duplicate set is not handled here
461 * but on the duplicate pages.
462 * Here the key is inserted into "simulated"
465 in_data_type = INSERT_NOTHING;
468 data_sz = item_size(dbp, h, indx + O_INDX);
470 for (i = 0; i < NUM_PGSIZES; ++i) {
471 if ((ret = __tuner_insert_kvpair(dbp, key_sz,
472 data_sz, i, INSERT_NORMAL, in_data_type,
474 dbenv->err(dbenv, ret,
475 "__tuner_insert_kvpair");
488 * "Move" the on-page duplicate data set from the specific page to our
489 * simulated databases (Indx range of this on-page duplicate set:
490 * [findx, lindx)), it includes following steps:
492 * First check where should this set go (on-page or off-page duplicate tree).
494 * This is determined as "If total size of duplicate data set is more than 25%
495 * of a specific page size, then this set go to off-page duplicate tree.
496 * Otherwise, it goes to on-page duplicate. "
498 * Then "move" this duplicate set to our simulated pages in each simulated
502 __tuner_leaf_dupdata(dbc, h, findx, lindx, dup_sz, cookie)
507 TUNER_FF_STAT *cookie;
515 for (i = 0; i < NUM_PGSIZES; ++i) {
516 pgsize = (1 << i) * DB_MIN_PGSIZE;
518 /* Check whether this duplicate set go to opd? */
519 is_opd = (dup_sz < (pgsize / 4)) ? 0 : 1;
521 /* "Move" this on-page duplicate to our simulated pages. */
522 if ((ret = __tuner_leaf_dupdata_entries(dbc, h, findx,
523 lindx, i, is_opd, cookie)) != 0) {
524 dbenv->err(dbenv, ret, "__tuner_leaf_dupdata_entries");
529 * Record the last simulated duplicate pages for a finished
530 * off-page duplicate set then reset the offset to zero
534 (ret = __tuner_record_last_opd(i, cookie)) != 0) {
535 dbenv->err(dbenv, ret, "__tuner_record_last_opd");
540 return (EXIT_SUCCESS);
544 * "Move" the on-page duplicate set [findx, lindx) on the specific page to
545 * simulated database with pagesize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
548 __tuner_leaf_dupdata_entries(dbc, h, findx, lindx, indx_pgsz, is_opd, cookie)
551 int findx, lindx, indx_pgsz, is_opd;
552 TUNER_FF_STAT *cookie;
557 u_int32_t data_sz, key_sz;
558 int ret, in_key_type;
563 for (indx = findx; indx < lindx; indx += P_INDX) {
566 in_key_type = INSERT_SLOT;
568 * For on-page duplicate data, the key is inserted once,
569 * then its corresponding data.
572 /* Ignore deleted items. */
573 if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
576 key_sz = item_size(dbp, h, indx);
577 in_key_type = INSERT_NORMAL;
580 * If is_opd, then insert a key + B_DUPLICATE pair for
581 * this on-page duplicate to simulated leaf page.
582 * INSERT_BDUPLICATE: B_DUPLICATE point.
585 __tuner_insert_kvpair(dbp, key_sz, 0, indx_pgsz,
586 INSERT_NORMAL, INSERT_BDUPLICATE, cookie)) != 0) {
587 dbenv->err(dbenv, ret, "__tuner_insert_kvpair");
592 /* Ignore deleted items. */
593 if (B_DISSET(GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
596 data_sz = item_size(dbp, h, indx + O_INDX);
599 ret = __tuner_insert_dupdata(dbp, data_sz,
602 dbenv->err(dbenv, ret,
603 "__tuner_insert_dupdata");
607 ret = __tuner_insert_kvpair(dbp, key_sz, data_sz,
608 indx_pgsz, in_key_type, INSERT_NORMAL,
612 dbenv->err(dbenv, ret, "__tuner_insert_kvpair");
617 return (EXIT_SUCCESS);
620 /* Tuner the off-page duplicate pages from existing database. */
622 __tuner_opd_page(dbc, h, cookie)
625 TUNER_FF_STAT *cookie;
628 u_int32_t opd_sz, pgsize;
634 /* 1st calculate the total size of the duplicate set. */
635 if ((ret = get_opd_size(dbc, h, &opd_sz)) != 0) {
636 dbenv->err(dbenv, ret, "get_opd_size:");
640 /* 2nd insert this set into "simulated" pages for each page size.*/
641 for (i = 0; i < NUM_PGSIZES; ++i) {
642 pgsize = (1 << i) * DB_MIN_PGSIZE;
644 /* Check whether this duplicate set go to opd? */
645 is_opd = (opd_sz < (pgsize / 4)) ? 0 : 1;
647 if ((ret = __tuner_opd_data(dbc, h, i, is_opd, cookie)) != 0) {
648 dbenv->err(dbenv, ret, "__tuner_opd_data:");
656 /* "Move" all the off-page duplicate data into simulated on-page or off-page.*/
658 __tuner_opd_data(dbc, h, indx_pgsz, is_opd, cookie)
661 int indx_pgsz, is_opd;
662 TUNER_FF_STAT *cookie;
677 pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
680 * __tuner_leaf_page has inserted one key for each opd already,
681 * so here only a B_DUPLICATE point is inserted into simulate
682 * leaf page if this duplicate set goes to on-page.
685 ret = __tuner_insert_kvpair(dbp, 0, 0, indx_pgsz,
686 INSERT_NOTHING, INSERT_BDUPLICATE, cookie);
689 dbenv->err(dbenv, ret, "__tuner_insert_kvpair");
694 /* Next insert all the data of this duplicate set. */
696 ret = __tuner_opd_data_entries(dbc, h, indx_pgsz, is_opd,
699 dbenv->err(dbenv, ret, "__tuner_opd_data_entries");
703 next_pgno = p->next_pgno;
705 if (p != h && (ret = mpf->put(mpf, p, dbc->priority, 0)) != 0) {
706 dbenv->err(dbenv, ret, "DB_MPOOLFILE->put:");
710 if (next_pgno == PGNO_INVALID)
713 if ((ret = mpf->get(mpf, &next_pgno, dbc->txn, 0, &p)) != 0) {
714 dbenv->err(dbenv, ret, "DB_MPOOLFILE->get:");
719 /* Record the last simulate duplicate page if goto off-page duplicate*/
720 if (is_opd && (ret = __tuner_record_last_opd(indx_pgsz, cookie)) != 0) {
721 dbenv->err(dbenv, ret, "__tuner_record_last_opd");
725 return (EXIT_SUCCESS);
729 * "Move" the off-page duplicate data set to our simulated on-page or
733 __tuner_opd_data_entries(dbc, h, indx_pgsz, is_opd, cookie)
736 int indx_pgsz, is_opd;
737 TUNER_FF_STAT *cookie;
748 for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
749 /* Ignore deleted items. */
750 if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
753 data_sz = item_size(dbp, h, indx);
756 ret = __tuner_insert_dupdata(dbp, data_sz,
760 dbenv->err(dbenv, ret,
761 "__tuner_insert_dupdata");
766 * __tuner_leaf_page has inserted one key for each
767 * opd already (this will insert moment later),
768 * so only data items and key slots are inserted.
770 ret = __tuner_insert_kvpair(dbp, 0, data_sz,
771 indx_pgsz, INSERT_SLOT, INSERT_NORMAL,
775 dbenv->err(dbenv, ret,
776 "__tuner_insert_kvpair");
781 return (EXIT_SUCCESS);
785 * Try to insert a key and data pair into simulated leaf pages.
786 * Key and data pairs are always stored (or referenced) on the same leaf page.
789 __tuner_insert_kvpair(dbp, key_sz, data_sz, indx_pgsz, in_key, in_data, stats)
791 u_int32_t key_sz, data_sz;
792 int indx_pgsz, in_key, in_data;
793 TUNER_FF_STAT *stats;
796 int is_big_data, is_big_key, ret;
797 u_int32_t needed, pgsize;
800 is_big_data = is_big_key = 0;
802 pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
804 if (key_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, pgsize))
807 if (data_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, pgsize))
811 needed += BOVERFLOW_PSIZE;
813 /* Add big key into ovfl pages. */
815 __tuner_record_ovfl_pg(key_sz, indx_pgsz, stats)) != 0) {
816 dbenv->err(dbenv, ret, "__tuner_record_ovfl_pg:key_sz");
821 * key_sz = INSERT_SLOT indicates no key is inserted
822 * but a slot in the inp array, e.g., on-page duplicate.
823 * key_sz = INSERT_NOTHING indicates no key no slot is
826 if (in_key == INSERT_NOTHING)
828 else if (in_key == INSERT_SLOT)
829 needed += sizeof(db_indx_t);
830 else if (in_key == INSERT_NORMAL)
831 needed += BKEYDATA_PSIZE(key_sz);
835 needed += BOVERFLOW_PSIZE;
837 /* Add big data into ovfl pages. */
839 __tuner_record_ovfl_pg(data_sz, indx_pgsz, stats)) != 0) {
840 dbenv->err(dbenv, ret, "__tuner_record_ovfl_pg");
845 * in_data = INSERT_BDUPLICATE indicates a B_DUPLICATE is
846 * inserted, e.g., off-page duplicate case.
847 * in_data = INSERT_NOTHING indicates nothing is inserted,
848 * happens when there is a key + B_DUPLICATE pair in
849 * __tuner_leaf_page, in which case, only the key is inserted
850 * but no data because the data will considered in
851 * __tuner_opd_page when an off-page
852 * duplicate set is coming.
854 if (in_data == INSERT_NOTHING)
856 else if (in_data == INSERT_BDUPLICATE)
857 needed += BOVERFLOW_PSIZE;
858 else if (in_data == INSERT_NORMAL)
859 needed += BKEYDATA_PSIZE(data_sz);
863 return (EXIT_INSERT_ZERO_NEED);
865 /* 1st leaf page, add overhead size. */
866 if (stats->pg_leaf_offset[indx_pgsz] == 0)
867 stats->pg_leaf_offset[indx_pgsz] = SIZEOF_PAGE;
869 if ((stats->pg_leaf_offset[indx_pgsz] + needed) > pgsize) {
870 /* No enough space, then record current page info. */
871 if ((ret = __tuner_record_leaf_pg(indx_pgsz, stats)) != 0) {
872 dbenv->err(dbenv, ret, "__tuner_record_leaf_pg");
876 /* Insert pair into new page. */
877 stats->pg_leaf_offset[indx_pgsz] = needed + SIZEOF_PAGE;
879 stats->pg_leaf_offset[indx_pgsz] += needed;
881 return (EXIT_SUCCESS);
884 /* Try to insert a duplicate data into simulated off duplicate pages. */
886 __tuner_insert_dupdata(dbp, data_sz, indx_pgsz, stats)
890 TUNER_FF_STAT *stats;
893 int is_big_data, ret;
894 u_int32_t needed, pgsize;
899 pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
901 if (data_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, pgsize))
905 needed = BOVERFLOW_PSIZE;
907 /* Add big data into ovfl pages. */
909 __tuner_record_ovfl_pg(data_sz, indx_pgsz, stats)) != 0) {
910 dbenv->err(dbenv, ret, "__tuner_record_ovfl_pg");
914 needed += BKEYDATA_PSIZE(data_sz);
917 return (EXIT_INSERT_ZERO_NEED);
919 /* 1st opd page, add overhead size. */
920 if (stats->pg_dup_offset[indx_pgsz] == 0)
921 stats->pg_dup_offset[indx_pgsz] = SIZEOF_PAGE;
923 if ((stats->pg_dup_offset[indx_pgsz] + needed) > pgsize) {
924 /* no enough space then record current page info. */
925 if ((ret = __tuner_record_dup_pg(indx_pgsz, stats)) != 0) {
926 dbenv->err(dbenv, ret, "__tuner_record_dup_pg");
930 /* insert new item into new page. */
931 stats->pg_dup_offset[indx_pgsz] = needed + SIZEOF_PAGE;
933 stats->pg_dup_offset[indx_pgsz] += needed;
935 return (EXIT_SUCCESS);
938 /* Insert big item into simulated over flow pages. */
940 __tuner_record_ovfl_pg(size, indx_pgsz, stats)
943 TUNER_FF_STAT *stats;
945 u_int32_t pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
948 /* Update OVFLPAGE list: 1. Add to "full" ovfl pages.*/
949 stats->pgsize_ovfl_dist[indx_pgsz][DIST_DIVISION - 1] +=
950 (size / (pgsize - SIZEOF_PAGE));
951 /* Update OVFLPAGE list: 2. Add the remainder.*/
952 size = size % (pgsize - SIZEOF_PAGE);
953 dist = (int)(((double)(size + SIZEOF_PAGE) *
954 (DIST_DIVISION - 1)) / pgsize);
956 /* assert(dist < DIST_DIVISION); */
957 if (dist >= DIST_DIVISION)
958 return (EXIT_DIST_OUTRANGE);
960 ++stats->pgsize_ovfl_dist[indx_pgsz][dist];
962 return (EXIT_SUCCESS);
965 /* Record simulated leaf page if it has no space to contain new record. */
967 __tuner_record_leaf_pg(indx_pgsz, stats)
969 TUNER_FF_STAT *stats;
974 pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
976 /* First calculate its fill factor. */
977 dist = (int)(((double)stats->pg_leaf_offset[indx_pgsz] *
978 (DIST_DIVISION - 1)) / pgsize);
980 /* assert(dist < DIST_DIVISION); */
981 if (dist >= DIST_DIVISION)
982 return (EXIT_DIST_OUTRANGE);
984 /* Then add one page to its corresponding distribution. */
985 ++stats->pgsize_leaf_dist[indx_pgsz][dist];
987 return (EXIT_SUCCESS);
990 /* Record simulated duplicate page if it has no enough space for new record. */
992 __tuner_record_dup_pg(indx_pgsz, stats)
994 TUNER_FF_STAT *stats;
999 pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
1001 /* First calculate its fill factor. */
1002 dist = (int)(((double)stats->pg_dup_offset[indx_pgsz] *
1003 (DIST_DIVISION - 1)) / pgsize);
1005 /* assert(dist < DIST_DIVISION); */
1006 if (dist >= DIST_DIVISION)
1007 return (EXIT_DIST_OUTRANGE);
1009 /* Then add one page to its corresponding distribution. */
1010 ++stats->pgsize_dup_dist[indx_pgsz][dist];
1012 return (EXIT_SUCCESS);
1016 * Record the last simulated duplicate page when an off-page duplicate set
1017 * is finished, also reset its offset to be zero for next set.
1020 __tuner_record_last_opd(indx_pgsz, stats)
1022 TUNER_FF_STAT *stats;
1025 if (stats->pg_dup_offset[indx_pgsz] != 0 &&
1026 (ret = __tuner_record_dup_pg(indx_pgsz, stats)) != 0)
1029 /* Reset offset to zero for new opd set. */
1030 stats->pg_dup_offset[indx_pgsz] = 0;
1032 return (EXIT_SUCCESS);
1036 * When a new off-page duplicate set is coming, we first calculate its total
1037 * size, which will be used to determine whether this set should go to on-page
1038 * or off-page duplicate tree in our simulation part.
1040 * As a off-page duplicate set is in a linked pages, we simply traverse this
1041 * link and sum up all the size of each data in each page.
1044 get_opd_size(dbc, h, opd_sz)
1053 db_pgno_t next_pgno;
1065 dup_sz += sum_opd_page_data_entries(dbp, p);
1067 next_pgno = p->next_pgno;
1068 if (p != h && (ret =
1069 mpf->put(mpf, p, dbc->priority, 0)) != 0) {
1070 dbenv->err(dbenv, ret, "DB_MPOOLFILE->put:");
1074 if (next_pgno == PGNO_INVALID)
1078 mpf->get(mpf, &next_pgno, dbc->txn, 0, &p)) != 0) {
1079 dbenv->err(dbenv, ret, "DB_MPOOLFILE->get:");
1086 return (EXIT_SUCCESS);
1089 /* Sum up the space used to contain all the data in a specific page.*/
1091 sum_opd_page_data_entries(dbp, h)
1099 for (i = 0; i < NUM_ENT(h); i += O_INDX) {
1100 /* Ignore deleted items. */
1101 if (B_DISSET(GET_BKEYDATA(dbp, h, i)->type))
1103 sz += item_space(dbp, h, i);
1109 /* The space used by one item in a page. */
1111 item_space(dbp, h, indx)
1116 return (B_TYPE(GET_BKEYDATA(dbp, h, indx)->type) == B_KEYDATA ?
1117 BKEYDATA_PSIZE(GET_BKEYDATA(dbp, h, indx)->len) :
1118 BKEYDATA_PSIZE(GET_BOVERFLOW(dbp, h, indx)->tlen));
1121 /* The actual length of a item. */
1123 item_size(dbp, h, indx)
1128 return (B_TYPE(GET_BKEYDATA(dbp, h, indx)->type) == B_KEYDATA ?
1129 GET_BKEYDATA(dbp, h, indx)->len : GET_BOVERFLOW(dbp, h,
1133 /* Print out the information according to user's options. */
1135 __tuner_print_btree_fillfactor(verbose, stats)
1137 TUNER_FF_STAT *stats;
1139 const char * DIVIDE_LINE1 = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=";
1140 const char * DIVIDE_LINE2 = "-----------|";
1141 const char * DIVIDE_LINE3 = "---------------------------------------";
1143 int best_indx, i, j;
1145 u_int64_t minispace, ttpgcnt[NUM_PGSIZES], ttspace[NUM_PGSIZES];
1146 uintmax_t dup_cnt[NUM_PGSIZES], leaf_cnt[NUM_PGSIZES],
1147 ovfl_cnt[NUM_PGSIZES];
1149 shift_point = 0.099;
1151 minispace = UINT64_MAX;
1153 for (i = 0; i < NUM_PGSIZES; ++i) {
1154 pgsize = (1 << i) * DB_MIN_PGSIZE;
1155 ovfl_cnt[i] = leaf_cnt[i] = dup_cnt[i] = ttpgcnt[i] = 0;
1157 for (j = 0; j < DIST_DIVISION; ++j) {
1158 ovfl_cnt[i] += stats->pgsize_ovfl_dist[i][j];
1159 leaf_cnt[i] += stats->pgsize_leaf_dist[i][j];
1160 dup_cnt[i] += stats->pgsize_dup_dist[i][j];
1163 ttpgcnt[i] = ovfl_cnt[i] + leaf_cnt[i] + dup_cnt[i];
1164 ttspace[i] = pgsize * ttpgcnt[i];
1168 printf("\n %50s \n",
1169 "===========Simulation Results===========");
1170 printf("\n %s\n %s\n %s\n",
1171 "leaf_pg:\t percentage of leaf page in that range",
1172 "dup_pg:\t percentage of duplicate page in that range",
1173 "ovfl_pg:\t percentage of over flow page in that range");
1175 for (i = 0; i < NUM_PGSIZES; ++i) {
1176 printf("\n\n%s%s\n", DIVIDE_LINE1, DIVIDE_LINE1);
1177 printf("page size = %d\n", (1 << i) * DB_MIN_PGSIZE);
1178 printf("%s%s\n", DIVIDE_LINE1, DIVIDE_LINE1);
1179 printf("%s\n", DIVIDE_LINE3);
1180 printf("%s %s %s %s\n", "fill factor",
1181 "leaf_pg", "dup_pg", "ovfl_pg");
1183 for (j = 0; j < DIST_DIVISION; ++j) {
1184 if (j == (DIST_DIVISION - 1))
1185 shift_point = 0.000;
1187 shift_point = 0.099;
1189 printf("\n[%2.1f-%4.3f]\t",
1190 (double)j/(DIST_DIVISION - 1),
1191 ((double)j/(DIST_DIVISION - 1) +
1194 if (leaf_cnt[i] == 0 ||
1195 stats->pgsize_leaf_dist[i][j] == 0)
1196 printf("%3.2f\t", (double)0);
1198 printf("%3.2f%%\t", (double)
1199 (stats->pgsize_leaf_dist[i][j] *
1200 100) / leaf_cnt[i]);
1202 if (dup_cnt[i] == 0 ||
1203 stats->pgsize_dup_dist[i][j] == 0)
1204 printf("%3.2f\t", (double)0);
1206 printf("%3.2f%%\t", (double)
1207 (stats->pgsize_dup_dist[i][j] *
1210 if (ovfl_cnt[i] == 0 ||
1211 stats->pgsize_ovfl_dist[i][j] == 0)
1212 printf("%3.2f\t", (double)0);
1214 printf("%3.2f%%\t", (double)
1215 (stats->pgsize_ovfl_dist[i][j] *
1216 100) / ovfl_cnt[i]);
1220 printf("\n\n\n\n %55s\n\n",
1221 "=====Summary of simulated statistic=====");
1222 printf(" %s\n %s\n %s\n %s\n %s\n %s\n\n",
1223 "pgsize: \tpage size", "storage: \ton-disk space",
1224 "pgcnt: \ttotal number of all pages "
1225 "(e.g, sum of ovflcnt, leafcnt, dupcnt)",
1226 "ovflcnt: \tnumber of over flow pages",
1227 "leafcnt: \tnumber of leaf pages",
1228 "dupcnt: \tnumber of duplicate pages");
1229 printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
1230 DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);
1231 printf(" %10s| %10s| %10s| %10s| %10s| %10s|\n", "pgsize",
1232 "storage", "pgcnt", "ovflcnt", "leafcnt", "dupcnt");
1233 printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
1234 DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);
1235 for (i = 0; i < NUM_PGSIZES; ++i) {
1236 printf(" %10d|", (1 << i) * DB_MIN_PGSIZE);
1237 printf(" %10u|", (u_int32_t)ttspace[i]);
1238 if (ttspace[i] != (u_int32_t)ttspace[i])
1239 printf("(truncated value reported)");
1240 printf(" %10u|", (u_int32_t)ttpgcnt[i]);
1241 if (ttpgcnt[i] != (u_int32_t)ttpgcnt[i])
1242 printf("(truncated value reported)");
1243 printf(" %10u|", (u_int32_t)ovfl_cnt[i]);
1244 if (ovfl_cnt[i] != (u_int32_t)ovfl_cnt[i])
1245 printf("(truncated value reported)");
1246 printf(" %10u|", (u_int32_t)leaf_cnt[i]);
1247 if (leaf_cnt[i] != (u_int32_t)leaf_cnt[i])
1248 printf("(truncated value reported)");
1249 printf(" %10u|", (u_int32_t)dup_cnt[i]);
1250 if (dup_cnt[i] != (u_int32_t)dup_cnt[i])
1251 printf("(truncated value reported)");
1254 printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
1255 DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);
1259 * Choose a page size based on the overflow calculation. If there
1260 * is no overflow consideration, then use the smallest on-disk
1261 * space as a recommended page size.
1263 if (ovfl_cnt[0] == 0) {
1264 minispace = ttspace[0];
1265 for (i = 1; i < NUM_PGSIZES; ++i)
1266 if ((ttspace[i] != 0) && (minispace > ttspace[i])) {
1267 minispace = ttspace[i];
1271 for (i = 1; i < NUM_PGSIZES; ++i)
1272 if ((ovfl_cnt[i - 1] - ovfl_cnt[i]) > 0.02 * ttpgcnt[i])
1275 printf("\n\nFor your input database, we recommend page size = %d \n \t"
1276 "out of 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536 for you.\n",
1277 (1 << best_indx) * DB_MIN_PGSIZE);
1279 return (EXIT_SUCCESS);
1282 /* Open the specific existing database. */
1284 open_db(dbpp, dbenv, dbname, subdb)
1293 if ((ret = db_create(&dbp, dbenv, 0)) != 0) {
1294 dbenv->err(dbenv, ret, "db_create fails.\n");
1300 /* Open a database for read-only.*/
1302 dbp->open(dbp, NULL, dbname, subdb, DB_UNKNOWN, DB_RDONLY, 0)) != 0)
1303 dbenv->err(dbenv, ret, "DB->open");
1308 /* Usage flag information to indicate what can user query for given database.*/
1312 fprintf(stderr, "usage: %s %s\n", progname,
1313 "[-c cachesize] -d file [-h home] [-s database] [-v verbose]");
1317 /*Check the verion of Berkeley DB libaray, make sure it is the right version.*/
1321 int v_major, v_minor, v_patch;
1323 /* Make sure we're loaded with the right version of the DB library. */
1324 (void)db_version(&v_major, &v_minor, &v_patch);
1325 if (v_major != DB_VERSION_MAJOR || v_minor != DB_VERSION_MINOR) {
1326 fprintf(stderr, DB_STR_A("5002",
1327 "%s: version %d.%d doesn't match library version %d.%d\n",
1328 "%s %d %d %d %d"), progname, DB_VERSION_MAJOR,
1329 DB_VERSION_MINOR, v_major, v_minor);
1331 return (EXIT_FAILURE);
1334 return (EXIT_SUCCESS);