src/third_party/sqlite/src/src/recover.c

   1 /*
   2 ** 2012 Jan 11
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 */
  11 /* TODO(shess): THIS MODULE IS STILL EXPERIMENTAL.  DO NOT USE IT. */
  12 /* Implements a virtual table "recover" which can be used to recover
  13  * data from a corrupt table.  The table is walked manually, with
  14  * corrupt items skipped.  Additionally, any errors while reading will
  15  * be skipped.
  16  *
  17  * Given a table with this definition:
  18  *
  19  * CREATE TABLE Stuff (
  20  *   name TEXT PRIMARY KEY,
  21  *   value TEXT NOT NULL
  22  * );
  23  *
  24  * to recover the data from teh table, you could do something like:
  25  *
  26  * -- Attach another database, the original is not trustworthy.
  27  * ATTACH DATABASE '/tmp/db.db' AS rdb;
  28  * -- Create a new version of the table.
  29  * CREATE TABLE rdb.Stuff (
  30  *   name TEXT PRIMARY KEY,
  31  *   value TEXT NOT NULL
  32  * );
  33  * -- This will read the original table's data.
  34  * CREATE VIRTUAL TABLE temp.recover_Stuff using recover(
  35  *   main.Stuff,
  36  *   name TEXT STRICT NOT NULL,  -- only real TEXT data allowed
  37  *   value TEXT STRICT NOT NULL
  38  * );
  39  * -- Corruption means the UNIQUE constraint may no longer hold for
  40  * -- Stuff, so either OR REPLACE or OR IGNORE must be used.
  41  * INSERT OR REPLACE INTO rdb.Stuff (rowid, name, value )
  42  *   SELECT rowid, name, value FROM temp.recover_Stuff;
  43  * DROP TABLE temp.recover_Stuff;
  44  * DETACH DATABASE rdb;
  45  * -- Move db.db to replace original db in filesystem.
  46  *
  47  *
  48  * Usage
  49  *
  50  * Given the goal of dealing with corruption, it would not be safe to
  51  * create a recovery table in the database being recovered.  So
  52  * recovery tables must be created in the temp database.  They are not
  53  * appropriate to persist, in any case.  [As a bonus, sqlite_master
  54  * tables can be recovered.  Perhaps more cute than useful, though.]
  55  *
  56  * The parameters are a specifier for the table to read, and a column
  57  * definition for each bit of data stored in that table.  The named
  58  * table must be convertable to a root page number by reading the
  59  * sqlite_master table.  Bare table names are assumed to be in
  60  * database 0 ("main"), other databases can be specified in db.table
  61  * fashion.
  62  *
  63  * Column definitions are similar to BUT NOT THE SAME AS those
  64  * provided to CREATE statements:
  65  *  column-def: column-name [type-name [STRICT] [NOT NULL]]
  66  *  type-name: (ANY|ROWID|INTEGER|FLOAT|NUMERIC|TEXT|BLOB)
  67  *
  68  * Only those exact type names are accepted, there is no type
  69  * intuition.  The only constraints accepted are STRICT (see below)
  70  * and NOT NULL.  Anything unexpected will cause the create to fail.
  71  *
  72  * ANY is a convenience to indicate that manifest typing is desired.
  73  * It is equivalent to not specifying a type at all.  The results for
  74  * such columns will have the type of the data's storage.  The exposed
  75  * schema will contain no type for that column.
  76  *
  77  * ROWID is used for columns representing aliases to the rowid
  78  * (INTEGER PRIMARY KEY, with or without AUTOINCREMENT), to make the
  79  * concept explicit.  Such columns are actually stored as NULL, so
  80  * they cannot be simply ignored.  The exposed schema will be INTEGER
  81  * for that column.
  82  *
  83  * NOT NULL causes rows with a NULL in that column to be skipped.  It
  84  * also adds NOT NULL to the column in the exposed schema.  If the
  85  * table has ever had columns added using ALTER TABLE, then those
  86  * columns implicitly contain NULL for rows which have not been
  87  * updated.  [Workaround using COALESCE() in your SELECT statement.]
  88  *
  89  * The created table is read-only, with no indices.  Any SELECT will
  90  * be a full-table scan, returning each valid row read from the
  91  * storage of the backing table.  The rowid will be the rowid of the
  92  * row from the backing table.  "Valid" means:
  93  * - The cell metadata for the row is well-formed.  Mainly this means that
  94  *   the cell header info describes a payload of the size indicated by
  95  *   the cell's payload size.
  96  * - The cell does not run off the page.
  97  * - The cell does not overlap any other cell on the page.
  98  * - The cell contains doesn't contain too many columns.
  99  * - The types of the serialized data match the indicated types (see below).
 100  *
 101  *
 102  * Type affinity versus type storage.
 103  *
 104  * http://www.sqlite.org/datatype3.html describes SQLite's type
 105  * affinity system.  The system provides for automated coercion of
 106  * types in certain cases, transparently enough that many developers
 107  * do not realize that it is happening.  Importantly, it implies that
 108  * the raw data stored in the database may not have the obvious type.
 109  *
 110  * Differences between the stored data types and the expected data
 111  * types may be a signal of corruption.  This module makes some
 112  * allowances for automatic coercion.  It is important to be concious
 113  * of the difference between the schema exposed by the module, and the
 114  * data types read from storage.  The following table describes how
 115  * the module interprets things:
 116  *
 117  * type     schema   data                     STRICT
 118  * ----     ------   ----                     ------
 119  * ANY      <none>   any                      any
 120  * ROWID    INTEGER  n/a                      n/a
 121  * INTEGER  INTEGER  integer                  integer
 122  * FLOAT    FLOAT    integer or float         float
 123  * NUMERIC  NUMERIC  integer, float, or text  integer or float
 124  * TEXT     TEXT     text or blob             text
 125  * BLOB     BLOB     blob                     blob
 126  *
 127  * type is the type provided to the recover module, schema is the
 128  * schema exposed by the module, data is the acceptable types of data
 129  * decoded from storage, and STRICT is a modification of that.
 130  *
 131  * A very loose recovery system might use ANY for all columns, then
 132  * use the appropriate sqlite3_column_*() calls to coerce to expected
 133  * types.  This doesn't provide much protection if a page from a
 134  * different table with the same column count is linked into an
 135  * inappropriate btree.
 136  *
 137  * A very tight recovery system might use STRICT to enforce typing on
 138  * all columns, preferring to skip rows which are valid at the storage
 139  * level but don't contain the right types.  Note that FLOAT STRICT is
 140  * almost certainly not appropriate, since integral values are
 141  * transparently stored as integers, when that is more efficient.
 142  *
 143  * Another option is to use ANY for all columns and inspect each
 144  * result manually (using sqlite3_column_*).  This should only be
 145  * necessary in cases where developers have used manifest typing (test
 146  * to make sure before you decide that you aren't using manifest
 147  * typing!).
 148  *
 149  *
 150  * Caveats
 151  *
 152  * Leaf pages not referenced by interior nodes will not be found.
 153  *
 154  * Leaf pages referenced from interior nodes of other tables will not
 155  * be resolved.
 156  *
 157  * Rows referencing invalid overflow pages will be skipped.
 158  *
 159  * SQlite rows have a header which describes how to interpret the rest
 160  * of the payload.  The header can be valid in cases where the rest of
 161  * the record is actually corrupt (in the sense that the data is not
 162  * the intended data).  This can especially happen WRT overflow pages,
 163  * as lack of atomic updates between pages is the primary form of
 164  * corruption I have seen in the wild.
 165  */
 166 /* The implementation is via a series of cursors.  The cursor
 167  * implementations follow the pattern:
 168  *
 169  * // Creates the cursor using various initialization info.
 170  * int cursorCreate(...);
 171  *
 172  * // Returns 1 if there is no more data, 0 otherwise.
 173  * int cursorEOF(Cursor *pCursor);
 174  *
 175  * // Various accessors can be used if not at EOF.
 176  *
 177  * // Move to the next item.
 178  * int cursorNext(Cursor *pCursor);
 179  *
 180  * // Destroy the memory associated with the cursor.
 181  * void cursorDestroy(Cursor *pCursor);
 182  *
 183  * References in the following are to sections at
 184  * http://www.sqlite.org/fileformat2.html .
 185  *
 186  * RecoverLeafCursor iterates the records in a leaf table node
 187  * described in section 1.5 "B-tree Pages".  When the node is
 188  * exhausted, an interior cursor is used to get the next leaf node,
 189  * and iteration continues there.
 190  *
 191  * RecoverInteriorCursor iterates the child pages in an interior table
 192  * node described in section 1.5 "B-tree Pages".  When the node is
 193  * exhausted, a parent interior cursor is used to get the next
 194  * interior node at the same level, and iteration continues there.
 195  *
 196  * Together these record the path from the leaf level to the root of
 197  * the tree.  Iteration happens from the leaves rather than the root
 198  * both for efficiency and putting the special case at the front of
 199  * the list is easier to implement.
 200  *
 201  * RecoverCursor uses a RecoverLeafCursor to iterate the rows of a
 202  * table, returning results via the SQLite virtual table interface.
 203  */
 204 /* TODO(shess): It might be useful to allow DEFAULT in types to
 205  * specify what to do for NULL when an ALTER TABLE case comes up.
 206  * Unfortunately, simply adding it to the exposed schema and using
 207  * sqlite3_result_null() does not cause the default to be generate.
 208  * Handling it ourselves seems hard, unfortunately.
 209  */
 210
 211 #include <assert.h>
 212 #include <ctype.h>
 213 #include <stdio.h>
 214 #include <string.h>
 215
 216 /* Internal SQLite things that are used:
 217  * u32, u64, i64 types.
 218  * Btree, Pager, and DbPage structs.
 219  * DbPage.pData, .pPager, and .pgno
 220  * sqlite3 struct.
 221  * sqlite3BtreePager() and sqlite3BtreeGetPageSize()
 222  * sqlite3PagerAcquire() and sqlite3PagerUnref()
 223  * getVarint().
 224  */
 225 #include "sqliteInt.h"
 226
 227 /* For debugging. */
 228 #if 0
 229 #define FNENTRY() fprintf(stderr, "In %s\n", __FUNCTION__)
 230 #else
 231 #define FNENTRY()
 232 #endif
 233
 234 /* Generic constants and helper functions. */
 235
 236 static const unsigned char kTableLeafPage = 0x0D;
 237 static const unsigned char kTableInteriorPage = 0x05;
 238
 239 /* From section 1.5. */
 240 static const unsigned kiPageTypeOffset = 0;
 241 static const unsigned kiPageFreeBlockOffset = 1;
 242 static const unsigned kiPageCellCountOffset = 3;
 243 static const unsigned kiPageCellContentOffset = 5;
 244 static const unsigned kiPageFragmentedBytesOffset = 7;
 245 static const unsigned knPageLeafHeaderBytes = 8;
 246 /* Interior pages contain an additional field. */
 247 static const unsigned kiPageRightChildOffset = 8;
 248 static const unsigned kiPageInteriorHeaderBytes = 12;
 249
 250 /* Accepted types are specified by a mask. */
 251 #define MASK_ROWID (1<<0)
 252 #define MASK_INTEGER (1<<1)
 253 #define MASK_FLOAT (1<<2)
 254 #define MASK_TEXT (1<<3)
 255 #define MASK_BLOB (1<<4)
 256 #define MASK_NULL (1<<5)
 257
 258 /* Helpers to decode fixed-size fields. */
 259 static u32 decodeUnsigned16(const unsigned char *pData){
 260   return (pData[0]<<8) + pData[1];
 261 }
 262 static u32 decodeUnsigned32(const unsigned char *pData){
 263   return (decodeUnsigned16(pData)<<16) + decodeUnsigned16(pData+2);
 264 }
 265 static i64 decodeSigned(const unsigned char *pData, unsigned nBytes){
 266   i64 r = (char)(*pData);
 267   while( --nBytes ){
 268     r <<= 8;
 269     r += *(++pData);
 270   }
 271   return r;
 272 }
 273 /* Derived from vdbeaux.c, sqlite3VdbeSerialGet(), case 7. */
 274 /* TODO(shess): Determine if swapMixedEndianFloat() applies. */
 275 static double decodeFloat64(const unsigned char *pData){
 276 #if !defined(NDEBUG)
 277   static const u64 t1 = ((u64)0x3ff00000)<<32;
 278   static const double r1 = 1.0;
 279   u64 t2 = t1;
 280   assert( sizeof(r1)==sizeof(t2) && memcmp(&r1, &t2, sizeof(r1))==0 );
 281 #endif
 282   i64 x = decodeSigned(pData, 8);
 283   double d;
 284   memcpy(&d, &x, sizeof(x));
 285   return d;
 286 }
 287
 288 /* Return true if a varint can safely be read from pData/nData. */
 289 /* TODO(shess): DbPage points into the middle of a buffer which
 290  * contains the page data before DbPage.  So code should always be
 291  * able to read a small number of varints safely.  Consider whether to
 292  * trust that or not.
 293  */
 294 static int checkVarint(const unsigned char *pData, unsigned nData){
 295   unsigned i;
 296
 297   /* In the worst case the decoder takes all 8 bits of the 9th byte. */
 298   if( nData>=9 ){
 299     return 1;
 300   }
 301
 302   /* Look for a high-bit-clear byte in what's left. */
 303   for( i=0; i<nData; ++i ){
 304     if( !(pData[i]&0x80) ){
 305       return 1;
 306     }
 307   }
 308
 309   /* Cannot decode in the space given. */
 310   return 0;
 311 }
 312
 313 /* Return 1 if n varints can be read from pData/nData. */
 314 static int checkVarints(const unsigned char *pData, unsigned nData,
 315                         unsigned n){
 316   unsigned nCur = 0;   /* Byte offset within current varint. */
 317   unsigned nFound = 0; /* Number of varints found. */
 318   unsigned i;
 319
 320   /* In the worst case the decoder takes all 8 bits of the 9th byte. */
 321   if( nData>=9*n ){
 322     return 1;
 323   }
 324
 325   for( i=0; nFound<n && i<nData; ++i ){
 326     nCur++;
 327     if( nCur==9 || !(pData[i]&0x80) ){
 328       nFound++;
 329       nCur = 0;
 330     }
 331   }
 332
 333   return nFound==n;
 334 }
 335
 336 /* ctype and str[n]casecmp() can be affected by locale (eg, tr_TR).
 337  * These versions consider only the ASCII space.
 338  */
 339 /* TODO(shess): It may be reasonable to just remove the need for these
 340  * entirely.  The module could require "TEXT STRICT NOT NULL", not
 341  * "Text Strict Not Null" or whatever the developer felt like typing
 342  * that day.  Handling corrupt data is a PERFECT place to be pedantic.
 343  */
 344 static int ascii_isspace(char c){
 345   /* From fts3_expr.c */
 346   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
 347 }
 348 static int ascii_isalnum(int x){
 349   /* From fts3_tokenizer1.c */
 350   return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
 351 }
 352 static int ascii_tolower(int x){
 353   /* From fts3_tokenizer1.c */
 354   return (x>='A' && x<='Z') ? x-'A'+'a' : x;
 355 }
 356 /* TODO(shess): Consider sqlite3_strnicmp() */
 357 static int ascii_strncasecmp(const char *s1, const char *s2, size_t n){
 358   const unsigned char *us1 = (const unsigned char *)s1;
 359   const unsigned char *us2 = (const unsigned char *)s2;
 360   while( *us1 && *us2 && n && ascii_tolower(*us1)==ascii_tolower(*us2) ){
 361     us1++, us2++, n--;
 362   }
 363   return n ? ascii_tolower(*us1)-ascii_tolower(*us2) : 0;
 364 }
 365 static int ascii_strcasecmp(const char *s1, const char *s2){
 366   /* If s2 is equal through strlen(s1), will exit while() due to s1's
 367    * trailing NUL, and return NUL-s2[strlen(s1)].
 368    */
 369   return ascii_strncasecmp(s1, s2, strlen(s1)+1);
 370 }
 371
 372 /* For some reason I kept making mistakes with offset calculations. */
 373 static const unsigned char *PageData(DbPage *pPage, unsigned iOffset){
 374   assert( iOffset<=pPage->nPageSize );
 375   return (unsigned char *)pPage->pData + iOffset;
 376 }
 377
 378 /* The first page in the file contains a file header in the first 100
 379  * bytes.  The page's header information comes after that.  Note that
 380  * the offsets in the page's header information are relative to the
 381  * beginning of the page, NOT the end of the page header.
 382  */
 383 static const unsigned char *PageHeader(DbPage *pPage){
 384   if( pPage->pgno==1 ){
 385     const unsigned nDatabaseHeader = 100;
 386     return PageData(pPage, nDatabaseHeader);
 387   }else{
 388     return PageData(pPage, 0);
 389   }
 390 }
 391
 392 /* Helper to fetch the pager and page size for the named database. */
 393 static int GetPager(sqlite3 *db, const char *zName,
 394                     Pager **pPager, unsigned *pnPageSize){
 395   Btree *pBt = NULL;
 396   int i;
 397   for( i=0; i<db->nDb; ++i ){
 398     if( ascii_strcasecmp(db->aDb[i].zName, zName)==0 ){
 399       pBt = db->aDb[i].pBt;
 400       break;
 401     }
 402   }
 403   if( !pBt ){
 404     return SQLITE_ERROR;
 405   }
 406
 407   *pPager = sqlite3BtreePager(pBt);
 408   *pnPageSize = sqlite3BtreeGetPageSize(pBt) - sqlite3BtreeGetReserve(pBt);
 409   return SQLITE_OK;
 410 }
 411
 412 /* iSerialType is a type read from a record header.  See "2.1 Record Format".
 413  */
 414
 415 /* Storage size of iSerialType in bytes.  My interpretation of SQLite
 416  * documentation is that text and blob fields can have 32-bit length.
 417  * Values past 2^31-12 will need more than 32 bits to encode, which is
 418  * why iSerialType is u64.
 419  */
 420 static u32 SerialTypeLength(u64 iSerialType){
 421   switch( iSerialType ){
 422     case 0 : return 0;  /* NULL */
 423     case 1 : return 1;  /* Various integers. */
 424     case 2 : return 2;
 425     case 3 : return 3;
 426     case 4 : return 4;
 427     case 5 : return 6;
 428     case 6 : return 8;
 429     case 7 : return 8;  /* 64-bit float. */
 430     case 8 : return 0;  /* Constant 0. */
 431     case 9 : return 0;  /* Constant 1. */
 432     case 10 : case 11 : assert( !"RESERVED TYPE"); return 0;
 433   }
 434   return (u32)((iSerialType>>1) - 6);
 435 }
 436
 437 /* True if iSerialType refers to a blob. */
 438 static int SerialTypeIsBlob(u64 iSerialType){
 439   assert( iSerialType>=12 );
 440   return (iSerialType%2)==0;
 441 }
 442
 443 /* Returns true if the serialized type represented by iSerialType is
 444  * compatible with the given type mask.
 445  */
 446 static int SerialTypeIsCompatible(u64 iSerialType, unsigned char mask){
 447   switch( iSerialType ){
 448     case 0  : return (mask&MASK_NULL)!=0;
 449     case 1  : return (mask&MASK_INTEGER)!=0;
 450     case 2  : return (mask&MASK_INTEGER)!=0;
 451     case 3  : return (mask&MASK_INTEGER)!=0;
 452     case 4  : return (mask&MASK_INTEGER)!=0;
 453     case 5  : return (mask&MASK_INTEGER)!=0;
 454     case 6  : return (mask&MASK_INTEGER)!=0;
 455     case 7  : return (mask&MASK_FLOAT)!=0;
 456     case 8  : return (mask&MASK_INTEGER)!=0;
 457     case 9  : return (mask&MASK_INTEGER)!=0;
 458     case 10 : assert( !"RESERVED TYPE"); return 0;
 459     case 11 : assert( !"RESERVED TYPE"); return 0;
 460   }
 461   return (mask&(SerialTypeIsBlob(iSerialType) ? MASK_BLOB : MASK_TEXT));
 462 }
 463
 464 /* Versions of strdup() with return values appropriate for
 465  * sqlite3_free().  malloc.c has sqlite3DbStrDup()/NDup(), but those
 466  * need sqlite3DbFree(), which seems intrusive.
 467  */
 468 static char *sqlite3_strndup(const char *z, unsigned n){
 469   char *zNew;
 470
 471   if( z==NULL ){
 472     return NULL;
 473   }
 474
 475   zNew = sqlite3_malloc(n+1);
 476   if( zNew!=NULL ){
 477     memcpy(zNew, z, n);
 478     zNew[n] = '\0';
 479   }
 480   return zNew;
 481 }
 482 static char *sqlite3_strdup(const char *z){
 483   if( z==NULL ){
 484     return NULL;
 485   }
 486   return sqlite3_strndup(z, strlen(z));
 487 }
 488
 489 /* Fetch the page number of zTable in zDb from sqlite_master in zDb,
 490  * and put it in *piRootPage.
 491  */
 492 static int getRootPage(sqlite3 *db, const char *zDb, const char *zTable,
 493                        u32 *piRootPage){
 494   char *zSql;  /* SQL selecting root page of named element. */
 495   sqlite3_stmt *pStmt;
 496   int rc;
 497
 498   if( strcmp(zTable, "sqlite_master")==0 ){
 499     *piRootPage = 1;
 500     return SQLITE_OK;
 501   }
 502
 503   zSql = sqlite3_mprintf("SELECT rootpage FROM %s.sqlite_master "
 504                          "WHERE type = 'table' AND tbl_name = %Q",
 505                          zDb, zTable);
 506   if( !zSql ){
 507     return SQLITE_NOMEM;
 508   }
 509
 510   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
 511   sqlite3_free(zSql);
 512   if( rc!=SQLITE_OK ){
 513     return rc;
 514   }
 515
 516   /* Require a result. */
 517   rc = sqlite3_step(pStmt);
 518   if( rc==SQLITE_DONE ){
 519     rc = SQLITE_CORRUPT;
 520   }else if( rc==SQLITE_ROW ){
 521     *piRootPage = sqlite3_column_int(pStmt, 0);
 522
 523     /* Require only one result. */
 524     rc = sqlite3_step(pStmt);
 525     if( rc==SQLITE_DONE ){
 526       rc = SQLITE_OK;
 527     }else if( rc==SQLITE_ROW ){
 528       rc = SQLITE_CORRUPT;
 529     }
 530   }
 531   sqlite3_finalize(pStmt);
 532   return rc;
 533 }
 534
 535 static int getEncoding(sqlite3 *db, const char *zDb, int* piEncoding){
 536   sqlite3_stmt *pStmt;
 537   int rc;
 538   char *zSql = sqlite3_mprintf("PRAGMA %s.encoding", zDb);
 539   if( !zSql ){
 540     return SQLITE_NOMEM;
 541   }
 542
 543   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
 544   sqlite3_free(zSql);
 545   if( rc!=SQLITE_OK ){
 546     return rc;
 547   }
 548
 549   /* Require a result. */
 550   rc = sqlite3_step(pStmt);
 551   if( rc==SQLITE_DONE ){
 552     /* This case should not be possible. */
 553     rc = SQLITE_CORRUPT;
 554   }else if( rc==SQLITE_ROW ){
 555     if( sqlite3_column_type(pStmt, 0)==SQLITE_TEXT ){
 556       const char* z = (const char *)sqlite3_column_text(pStmt, 0);
 557       /* These strings match the literals in pragma.c. */
 558       if( !strcmp(z, "UTF-16le") ){
 559         *piEncoding = SQLITE_UTF16LE;
 560       }else if( !strcmp(z, "UTF-16be") ){
 561         *piEncoding = SQLITE_UTF16BE;
 562       }else if( !strcmp(z, "UTF-8") ){
 563         *piEncoding = SQLITE_UTF8;
 564       }else{
 565         /* This case should not be possible. */
 566         *piEncoding = SQLITE_UTF8;
 567       }
 568     }else{
 569       /* This case should not be possible. */
 570       *piEncoding = SQLITE_UTF8;
 571     }
 572
 573     /* Require only one result. */
 574     rc = sqlite3_step(pStmt);
 575     if( rc==SQLITE_DONE ){
 576       rc = SQLITE_OK;
 577     }else if( rc==SQLITE_ROW ){
 578       /* This case should not be possible. */
 579       rc = SQLITE_CORRUPT;
 580     }
 581   }
 582   sqlite3_finalize(pStmt);
 583   return rc;
 584 }
 585
 586 /* Cursor for iterating interior nodes.  Interior page cells contain a
 587  * child page number and a rowid.  The child page contains items left
 588  * of the rowid (less than).  The rightmost page of the subtree is
 589  * stored in the page header.
 590  *
 591  * interiorCursorDestroy - release all resources associated with the
 592  *                         cursor and any parent cursors.
 593  * interiorCursorCreate - create a cursor with the given parent and page.
 594  * interiorCursorEOF - returns true if neither the cursor nor the
 595  *                     parent cursors can return any more data.
 596  * interiorCursorNextPage - fetch the next child page from the cursor.
 597  *
 598  * Logically, interiorCursorNextPage() returns the next child page
 599  * number from the page the cursor is currently reading, calling the
 600  * parent cursor as necessary to get new pages to read, until done.
 601  * SQLITE_ROW if a page is returned, SQLITE_DONE if out of pages,
 602  * error otherwise.  Unfortunately, if the table is corrupted
 603  * unexpected pages can be returned.  If any unexpected page is found,
 604  * leaf or otherwise, it is returned to the caller for processing,
 605  * with the interior cursor left empty.  The next call to
 606  * interiorCursorNextPage() will recurse to the parent cursor until an
 607  * interior page to iterate is returned.
 608  *
 609  * Note that while interiorCursorNextPage() will refuse to follow
 610  * loops, it does not keep track of pages returned for purposes of
 611  * preventing duplication.
 612  *
 613  * Note that interiorCursorEOF() could return false (not at EOF), and
 614  * interiorCursorNextPage() could still return SQLITE_DONE.  This
 615  * could happen if there are more cells to iterate in an interior
 616  * page, but those cells refer to invalid pages.
 617  */
 618 typedef struct RecoverInteriorCursor RecoverInteriorCursor;
 619 struct RecoverInteriorCursor {
 620   RecoverInteriorCursor *pParent; /* Parent node to this node. */
 621   DbPage *pPage;                  /* Reference to leaf page. */
 622   unsigned nPageSize;             /* Size of page. */
 623   unsigned nChildren;             /* Number of children on the page. */
 624   unsigned iChild;                /* Index of next child to return. */
 625 };
 626
 627 static void interiorCursorDestroy(RecoverInteriorCursor *pCursor){
 628   /* Destroy all the cursors to the root. */
 629   while( pCursor ){
 630     RecoverInteriorCursor *p = pCursor;
 631     pCursor = pCursor->pParent;
 632
 633     if( p->pPage ){
 634       sqlite3PagerUnref(p->pPage);
 635       p->pPage = NULL;
 636     }
 637
 638     memset(p, 0xA5, sizeof(*p));
 639     sqlite3_free(p);
 640   }
 641 }
 642
 643 /* Internal helper.  Reset storage in preparation for iterating pPage. */
 644 static void interiorCursorSetPage(RecoverInteriorCursor *pCursor,
 645                                   DbPage *pPage){
 646   assert( PageHeader(pPage)[kiPageTypeOffset]==kTableInteriorPage );
 647
 648   if( pCursor->pPage ){
 649     sqlite3PagerUnref(pCursor->pPage);
 650     pCursor->pPage = NULL;
 651   }
 652   pCursor->pPage = pPage;
 653   pCursor->iChild = 0;
 654
 655   /* A child for each cell, plus one in the header. */
 656   pCursor->nChildren = decodeUnsigned16(PageHeader(pPage) +
 657                                         kiPageCellCountOffset) + 1;
 658
 659   /* Each child requires a 16-bit offset from an array after the header,
 660    * and each child contains a 32-bit page number and at least a varint
 661    * (min size of one byte).  The final child page is in the header.  So
 662    * the maximum value for nChildren is:
 663    *   (nPageSize - kiPageInteriorHeaderBytes) /
 664    *      (sizeof(uint16) + sizeof(uint32) + 1) + 1
 665    */
 666   /* TODO(shess): This count is very unlikely to be corrupted in
 667    * isolation, so seeing this could signal to skip the page.  OTOH, I
 668    * can't offhand think of how to get here unless this or the page-type
 669    * byte is corrupted.  Could be an overflow page, but it would require
 670    * a very large database.
 671    */
 672   const unsigned knMinCellLength = 2 + 4 + 1;
 673   unsigned nMaxChildren =
 674       (pCursor->nPageSize - kiPageInteriorHeaderBytes) / knMinCellLength + 1;
 675   if (pCursor->nChildren > nMaxChildren) {
 676     pCursor->nChildren = nMaxChildren;
 677   }
 678 }
 679
 680 static int interiorCursorCreate(RecoverInteriorCursor *pParent,
 681                                 DbPage *pPage, int nPageSize,
 682                                 RecoverInteriorCursor **ppCursor){
 683   RecoverInteriorCursor *pCursor =
 684     sqlite3_malloc(sizeof(RecoverInteriorCursor));
 685   if( !pCursor ){
 686     return SQLITE_NOMEM;
 687   }
 688
 689   memset(pCursor, 0, sizeof(*pCursor));
 690   pCursor->pParent = pParent;
 691   pCursor->nPageSize = nPageSize;
 692   interiorCursorSetPage(pCursor, pPage);
 693   *ppCursor = pCursor;
 694   return SQLITE_OK;
 695 }
 696
 697 /* Internal helper.  Return the child page number at iChild. */
 698 static unsigned interiorCursorChildPage(RecoverInteriorCursor *pCursor){
 699   const unsigned char *pPageHeader;  /* Header of the current page. */
 700   const unsigned char *pCellOffsets; /* Offset to page's cell offsets. */
 701   unsigned iCellOffset;              /* Offset of target cell. */
 702
 703   assert( pCursor->iChild<pCursor->nChildren );
 704
 705   /* Rightmost child is in the header. */
 706   pPageHeader = PageHeader(pCursor->pPage);
 707   if( pCursor->iChild==pCursor->nChildren-1 ){
 708     return decodeUnsigned32(pPageHeader + kiPageRightChildOffset);
 709   }
 710
 711   /* Each cell is a 4-byte integer page number and a varint rowid
 712    * which is greater than the rowid of items in that sub-tree (this
 713    * module ignores ordering). The offset is from the beginning of the
 714    * page, not from the page header.
 715    */
 716   pCellOffsets = pPageHeader + kiPageInteriorHeaderBytes;
 717   iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iChild*2);
 718   if( iCellOffset<=pCursor->nPageSize-4 ){
 719     return decodeUnsigned32(PageData(pCursor->pPage, iCellOffset));
 720   }
 721
 722   /* TODO(shess): Check for cell overlaps?  Cells require 4 bytes plus
 723    * a varint.  Check could be identical to leaf check (or even a
 724    * shared helper testing for "Cells starting in this range"?).
 725    */
 726
 727   /* If the offset is broken, return an invalid page number. */
 728   return 0;
 729 }
 730
 731 static int interiorCursorEOF(RecoverInteriorCursor *pCursor){
 732   /* Find a parent with remaining children.  EOF if none found. */
 733   while( pCursor && pCursor->iChild>=pCursor->nChildren ){
 734     pCursor = pCursor->pParent;
 735   }
 736   return pCursor==NULL;
 737 }
 738
 739 /* Internal helper.  Used to detect if iPage would cause a loop. */
 740 static int interiorCursorPageInUse(RecoverInteriorCursor *pCursor,
 741                                    unsigned iPage){
 742   /* Find any parent using the indicated page. */
 743   while( pCursor && pCursor->pPage->pgno!=iPage ){
 744     pCursor = pCursor->pParent;
 745   }
 746   return pCursor!=NULL;
 747 }
 748
 749 /* Get the next page from the interior cursor at *ppCursor.  Returns
 750  * SQLITE_ROW with the page in *ppPage, or SQLITE_DONE if out of
 751  * pages, or the error SQLite returned.
 752  *
 753  * If the tree is uneven, then when the cursor attempts to get a new
 754  * interior page from the parent cursor, it may get a non-interior
 755  * page.  In that case, the new page is returned, and *ppCursor is
 756  * updated to point to the parent cursor (this cursor is freed).
 757  */
 758 /* TODO(shess): I've tried to avoid recursion in most of this code,
 759  * but this case is more challenging because the recursive call is in
 760  * the middle of operation.  One option for converting it without
 761  * adding memory management would be to retain the head pointer and
 762  * use a helper to "back up" as needed.  Another option would be to
 763  * reverse the list during traversal.
 764  */
 765 static int interiorCursorNextPage(RecoverInteriorCursor **ppCursor,
 766                                   DbPage **ppPage){
 767   RecoverInteriorCursor *pCursor = *ppCursor;
 768   while( 1 ){
 769     int rc;
 770     const unsigned char *pPageHeader;  /* Header of found page. */
 771
 772     /* Find a valid child page which isn't on the stack. */
 773     while( pCursor->iChild<pCursor->nChildren ){
 774       const unsigned iPage = interiorCursorChildPage(pCursor);
 775       pCursor->iChild++;
 776       if( interiorCursorPageInUse(pCursor, iPage) ){
 777         fprintf(stderr, "Loop detected at %d\n", iPage);
 778       }else{
 779         int rc = sqlite3PagerAcquire(pCursor->pPage->pPager, iPage, ppPage, 0);
 780         if( rc==SQLITE_OK ){
 781           return SQLITE_ROW;
 782         }
 783       }
 784     }
 785
 786     /* This page has no more children.  Get next page from parent. */
 787     if( !pCursor->pParent ){
 788       return SQLITE_DONE;
 789     }
 790     rc = interiorCursorNextPage(&pCursor->pParent, ppPage);
 791     if( rc!=SQLITE_ROW ){
 792       return rc;
 793     }
 794
 795     /* If a non-interior page is received, that either means that the
 796      * tree is uneven, or that a child was re-used (say as an overflow
 797      * page).  Remove this cursor and let the caller handle the page.
 798      */
 799     pPageHeader = PageHeader(*ppPage);
 800     if( pPageHeader[kiPageTypeOffset]!=kTableInteriorPage ){
 801       *ppCursor = pCursor->pParent;
 802       pCursor->pParent = NULL;
 803       interiorCursorDestroy(pCursor);
 804       return SQLITE_ROW;
 805     }
 806
 807     /* Iterate the new page. */
 808     interiorCursorSetPage(pCursor, *ppPage);
 809     *ppPage = NULL;
 810   }
 811
 812   assert(NULL);  /* NOTREACHED() */
 813   return SQLITE_CORRUPT;
 814 }
 815
 816 /* Large rows are spilled to overflow pages.  The row's main page
 817  * stores the overflow page number after the local payload, with a
 818  * linked list forward from there as necessary.  overflowMaybeCreate()
 819  * and overflowGetSegment() provide an abstraction for accessing such
 820  * data while centralizing the code.
 821  *
 822  * overflowDestroy - releases all resources associated with the structure.
 823  * overflowMaybeCreate - create the overflow structure if it is needed
 824  *                       to represent the given record.  See function comment.
 825  * overflowGetSegment - fetch a segment from the record, accounting
 826  *                      for overflow pages.  Segments which are not
 827  *                      entirely contained with a page are constructed
 828  *                      into a buffer which is returned.  See function comment.
 829  */
 830 typedef struct RecoverOverflow RecoverOverflow;
 831 struct RecoverOverflow {
 832   RecoverOverflow *pNextOverflow;
 833   DbPage *pPage;
 834   unsigned nPageSize;
 835 };
 836
 837 static void overflowDestroy(RecoverOverflow *pOverflow){
 838   while( pOverflow ){
 839     RecoverOverflow *p = pOverflow;
 840     pOverflow = p->pNextOverflow;
 841
 842     if( p->pPage ){
 843       sqlite3PagerUnref(p->pPage);
 844       p->pPage = NULL;
 845     }
 846
 847     memset(p, 0xA5, sizeof(*p));
 848     sqlite3_free(p);
 849   }
 850 }
 851
 852 /* Internal helper.  Used to detect if iPage would cause a loop. */
 853 static int overflowPageInUse(RecoverOverflow *pOverflow, unsigned iPage){
 854   while( pOverflow && pOverflow->pPage->pgno!=iPage ){
 855     pOverflow = pOverflow->pNextOverflow;
 856   }
 857   return pOverflow!=NULL;
 858 }
 859
 860 /* Setup to access an nRecordBytes record beginning at iRecordOffset
 861  * in pPage.  If nRecordBytes can be satisfied entirely from pPage,
 862  * then no overflow pages are needed an *pnLocalRecordBytes is set to
 863  * nRecordBytes.  Otherwise, *ppOverflow is set to the head of a list
 864  * of overflow pages, and *pnLocalRecordBytes is set to the number of
 865  * bytes local to pPage.
 866  *
 867  * overflowGetSegment() will do the right thing regardless of whether
 868  * those values are set to be in-page or not.
 869  */
 870 static int overflowMaybeCreate(DbPage *pPage, unsigned nPageSize,
 871                                unsigned iRecordOffset, unsigned nRecordBytes,
 872                                unsigned *pnLocalRecordBytes,
 873                                RecoverOverflow **ppOverflow){
 874   unsigned nLocalRecordBytes;  /* Record bytes in the leaf page. */
 875   unsigned iNextPage;          /* Next page number for record data. */
 876   unsigned nBytes;             /* Maximum record bytes as of current page. */
 877   int rc;
 878   RecoverOverflow *pFirstOverflow;  /* First in linked list of pages. */
 879   RecoverOverflow *pLastOverflow;   /* End of linked list. */
 880
 881   /* Calculations from the "Table B-Tree Leaf Cell" part of section
 882    * 1.5 of http://www.sqlite.org/fileformat2.html .  maxLocal and
 883    * minLocal to match naming in btree.c.
 884    */
 885   const unsigned maxLocal = nPageSize - 35;
 886   const unsigned minLocal = ((nPageSize-12)*32/255)-23;  /* m */
 887
 888   /* Always fit anything smaller than maxLocal. */
 889   if( nRecordBytes<=maxLocal ){
 890     *pnLocalRecordBytes = nRecordBytes;
 891     *ppOverflow = NULL;
 892     return SQLITE_OK;
 893   }
 894
 895   /* Calculate the remainder after accounting for minLocal on the leaf
 896    * page and what packs evenly into overflow pages.  If the remainder
 897    * does not fit into maxLocal, then a partially-full overflow page
 898    * will be required in any case, so store as little as possible locally.
 899    */
 900   nLocalRecordBytes = minLocal+((nRecordBytes-minLocal)%(nPageSize-4));
 901   if( maxLocal<nLocalRecordBytes ){
 902     nLocalRecordBytes = minLocal;
 903   }
 904
 905   /* Don't read off the end of the page. */
 906   if( iRecordOffset+nLocalRecordBytes+4>nPageSize ){
 907     return SQLITE_CORRUPT;
 908   }
 909
 910   /* First overflow page number is after the local bytes. */
 911   iNextPage =
 912       decodeUnsigned32(PageData(pPage, iRecordOffset + nLocalRecordBytes));
 913   nBytes = nLocalRecordBytes;
 914
 915   /* While there are more pages to read, and more bytes are needed,
 916    * get another page.
 917    */
 918   pFirstOverflow = pLastOverflow = NULL;
 919   rc = SQLITE_OK;
 920   while( iNextPage && nBytes<nRecordBytes ){
 921     RecoverOverflow *pOverflow;  /* New overflow page for the list. */
 922
 923     rc = sqlite3PagerAcquire(pPage->pPager, iNextPage, &pPage, 0);
 924     if( rc!=SQLITE_OK ){
 925       break;
 926     }
 927
 928     pOverflow = sqlite3_malloc(sizeof(RecoverOverflow));
 929     if( !pOverflow ){
 930       sqlite3PagerUnref(pPage);
 931       rc = SQLITE_NOMEM;
 932       break;
 933     }
 934     memset(pOverflow, 0, sizeof(*pOverflow));
 935     pOverflow->pPage = pPage;
 936     pOverflow->nPageSize = nPageSize;
 937
 938     if( !pFirstOverflow ){
 939       pFirstOverflow = pOverflow;
 940     }else{
 941       pLastOverflow->pNextOverflow = pOverflow;
 942     }
 943     pLastOverflow = pOverflow;
 944
 945     iNextPage = decodeUnsigned32(pPage->pData);
 946     nBytes += nPageSize-4;
 947
 948     /* Avoid loops. */
 949     if( overflowPageInUse(pFirstOverflow, iNextPage) ){
 950       fprintf(stderr, "Overflow loop detected at %d\n", iNextPage);
 951       rc = SQLITE_CORRUPT;
 952       break;
 953     }
 954   }
 955
 956   /* If there were not enough pages, or too many, things are corrupt.
 957    * Not having enough pages is an obvious problem, all the data
 958    * cannot be read.  Too many pages means that the contents of the
 959    * row between the main page and the overflow page(s) is
 960    * inconsistent (most likely one or more of the overflow pages does
 961    * not really belong to this row).
 962    */
 963   if( rc==SQLITE_OK && (nBytes<nRecordBytes || iNextPage) ){
 964     rc = SQLITE_CORRUPT;
 965   }
 966
 967   if( rc==SQLITE_OK ){
 968     *ppOverflow = pFirstOverflow;
 969     *pnLocalRecordBytes = nLocalRecordBytes;
 970   }else if( pFirstOverflow ){
 971     overflowDestroy(pFirstOverflow);
 972   }
 973   return rc;
 974 }
 975
 976 /* Use in concert with overflowMaybeCreate() to efficiently read parts
 977  * of a potentially-overflowing record.  pPage and iRecordOffset are
 978  * the values passed into overflowMaybeCreate(), nLocalRecordBytes and
 979  * pOverflow are the values returned by that call.
 980  *
 981  * On SQLITE_OK, *ppBase points to nRequestBytes of data at
 982  * iRequestOffset within the record.  If the data exists contiguously
 983  * in a page, a direct pointer is returned, otherwise a buffer from
 984  * sqlite3_malloc() is returned with the data.  *pbFree is set true if
 985  * sqlite3_free() should be called on *ppBase.
 986  */
 987 /* Operation of this function is subtle.  At any time, pPage is the
 988  * current page, with iRecordOffset and nLocalRecordBytes being record
 989  * data within pPage, and pOverflow being the overflow page after
 990  * pPage.  This allows the code to handle both the initial leaf page
 991  * and overflow pages consistently by adjusting the values
 992  * appropriately.
 993  */
 994 static int overflowGetSegment(DbPage *pPage, unsigned iRecordOffset,
 995                               unsigned nLocalRecordBytes,
 996                               RecoverOverflow *pOverflow,
 997                               unsigned iRequestOffset, unsigned nRequestBytes,
 998                               unsigned char **ppBase, int *pbFree){
 999   unsigned nBase;         /* Amount of data currently collected. */
1000   unsigned char *pBase;   /* Buffer to collect record data into. */
1001
1002   /* Skip to the page containing the start of the data. */
1003   while( iRequestOffset>=nLocalRecordBytes && pOverflow ){
1004     /* Factor out current page's contribution. */
1005     iRequestOffset -= nLocalRecordBytes;
1006
1007     /* Move forward to the next page in the list. */
1008     pPage = pOverflow->pPage;
1009     iRecordOffset = 4;
1010     nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1011     pOverflow = pOverflow->pNextOverflow;
1012   }
1013
1014   /* If the requested data is entirely within this page, return a
1015    * pointer into the page.
1016    */
1017   if( iRequestOffset+nRequestBytes<=nLocalRecordBytes ){
1018     /* TODO(shess): "assignment discards qualifiers from pointer target type"
1019      * Having ppBase be const makes sense, but sqlite3_free() takes non-const.
1020      */
1021     *ppBase = (unsigned char *)PageData(pPage, iRecordOffset + iRequestOffset);
1022     *pbFree = 0;
1023     return SQLITE_OK;
1024   }
1025
1026   /* The data range would require additional pages. */
1027   if( !pOverflow ){
1028     /* Should never happen, the range is outside the nRecordBytes
1029      * passed to overflowMaybeCreate().
1030      */
1031     assert(NULL);  /* NOTREACHED */
1032     return SQLITE_ERROR;
1033   }
1034
1035   /* Get a buffer to construct into. */
1036   nBase = 0;
1037   pBase = sqlite3_malloc(nRequestBytes);
1038   if( !pBase ){
1039     return SQLITE_NOMEM;
1040   }
1041   while( nBase<nRequestBytes ){
1042     /* Copy over data present on this page. */
1043     unsigned nCopyBytes = nRequestBytes - nBase;
1044     if( nLocalRecordBytes-iRequestOffset<nCopyBytes ){
1045       nCopyBytes = nLocalRecordBytes - iRequestOffset;
1046     }
1047     memcpy(pBase + nBase, PageData(pPage, iRecordOffset + iRequestOffset),
1048            nCopyBytes);
1049     nBase += nCopyBytes;
1050
1051     if( pOverflow ){
1052       /* Copy from start of record data in future pages. */
1053       iRequestOffset = 0;
1054
1055       /* Move forward to the next page in the list.  Should match
1056        * first while() loop.
1057        */
1058       pPage = pOverflow->pPage;
1059       iRecordOffset = 4;
1060       nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1061       pOverflow = pOverflow->pNextOverflow;
1062     }else if( nBase<nRequestBytes ){
1063       /* Ran out of overflow pages with data left to deliver.  Not
1064        * possible if the requested range fits within nRecordBytes
1065        * passed to overflowMaybeCreate() when creating pOverflow.
1066        */
1067       assert(NULL);  /* NOTREACHED */
1068       sqlite3_free(pBase);
1069       return SQLITE_ERROR;
1070     }
1071   }
1072   assert( nBase==nRequestBytes );
1073   *ppBase = pBase;
1074   *pbFree = 1;
1075   return SQLITE_OK;
1076 }
1077
1078 /* Primary structure for iterating the contents of a table.
1079  *
1080  * leafCursorDestroy - release all resources associated with the cursor.
1081  * leafCursorCreate - create a cursor to iterate items from tree at
1082  *                    the provided root page.
1083  * leafCursorNextValidCell - get the cursor ready to access data from
1084  *                           the next valid cell in the table.
1085  * leafCursorCellRowid - get the current cell's rowid.
1086  * leafCursorCellColumns - get current cell's column count.
1087  * leafCursorCellColInfo - get type and data for a column in current cell.
1088  *
1089  * leafCursorNextValidCell skips cells which fail simple integrity
1090  * checks, such as overlapping other cells, or being located at
1091  * impossible offsets, or where header data doesn't correctly describe
1092  * payload data.  Returns SQLITE_ROW if a valid cell is found,
1093  * SQLITE_DONE if all pages in the tree were exhausted.
1094  *
1095  * leafCursorCellColInfo() accounts for overflow pages in the style of
1096  * overflowGetSegment().
1097  */
1098 typedef struct RecoverLeafCursor RecoverLeafCursor;
1099 struct RecoverLeafCursor {
1100   RecoverInteriorCursor *pParent;  /* Parent node to this node. */
1101   DbPage *pPage;                   /* Reference to leaf page. */
1102   unsigned nPageSize;              /* Size of pPage. */
1103   unsigned nCells;                 /* Number of cells in pPage. */
1104   unsigned iCell;                  /* Current cell. */
1105
1106   /* Info parsed from data in iCell. */
1107   i64 iRowid;                      /* rowid parsed. */
1108   unsigned nRecordCols;            /* how many items in the record. */
1109   u64 iRecordOffset;               /* offset to record data. */
1110   /* TODO(shess): nRecordBytes and nRecordHeaderBytes are used in
1111    * leafCursorCellColInfo() to prevent buffer overruns.
1112    * leafCursorCellDecode() already verified that the cell is valid, so
1113    * those checks should be redundant.
1114    */
1115   u64 nRecordBytes;                /* Size of record data. */
1116   unsigned nLocalRecordBytes;      /* Amount of record data in-page. */
1117   unsigned nRecordHeaderBytes;     /* Size of record header data. */
1118   unsigned char *pRecordHeader;    /* Pointer to record header data. */
1119   int bFreeRecordHeader;           /* True if record header requires free. */
1120   RecoverOverflow *pOverflow;      /* Cell overflow info, if needed. */
1121 };
1122
1123 /* Internal helper shared between next-page and create-cursor.  If
1124  * pPage is a leaf page, it will be stored in the cursor and state
1125  * initialized for reading cells.
1126  *
1127  * If pPage is an interior page, a new parent cursor is created and
1128  * injected on the stack.  This is necessary to handle trees with
1129  * uneven depth, but also is used during initial setup.
1130  *
1131  * If pPage is not a table page at all, it is discarded.
1132  *
1133  * If SQLITE_OK is returned, the caller no longer owns pPage,
1134  * otherwise the caller is responsible for discarding it.
1135  */
1136 static int leafCursorLoadPage(RecoverLeafCursor *pCursor, DbPage *pPage){
1137   const unsigned char *pPageHeader;  /* Header of *pPage */
1138
1139   /* Release the current page. */
1140   if( pCursor->pPage ){
1141     sqlite3PagerUnref(pCursor->pPage);
1142     pCursor->pPage = NULL;
1143     pCursor->iCell = pCursor->nCells = 0;
1144   }
1145
1146   /* If the page is an unexpected interior node, inject a new stack
1147    * layer and try again from there.
1148    */
1149   pPageHeader = PageHeader(pPage);
1150   if( pPageHeader[kiPageTypeOffset]==kTableInteriorPage ){
1151     RecoverInteriorCursor *pParent;
1152     int rc = interiorCursorCreate(pCursor->pParent, pPage, pCursor->nPageSize,
1153                                   &pParent);
1154     if( rc!=SQLITE_OK ){
1155       return rc;
1156     }
1157     pCursor->pParent = pParent;
1158     return SQLITE_OK;
1159   }
1160
1161   /* Not a leaf page, skip it. */
1162   if( pPageHeader[kiPageTypeOffset]!=kTableLeafPage ){
1163     sqlite3PagerUnref(pPage);
1164     return SQLITE_OK;
1165   }
1166
1167   /* Take ownership of the page and start decoding. */
1168   pCursor->pPage = pPage;
1169   pCursor->iCell = 0;
1170   pCursor->nCells = decodeUnsigned16(pPageHeader + kiPageCellCountOffset);
1171   return SQLITE_OK;
1172 }
1173
1174 /* Get the next leaf-level page in the tree.  Returns SQLITE_ROW when
1175  * a leaf page is found, SQLITE_DONE when no more leaves exist, or any
1176  * error which occurred.
1177  */
1178 static int leafCursorNextPage(RecoverLeafCursor *pCursor){
1179   if( !pCursor->pParent ){
1180     return SQLITE_DONE;
1181   }
1182
1183   /* Repeatedly load the parent's next child page until a leaf is found. */
1184   do {
1185     DbPage *pNextPage;
1186     int rc = interiorCursorNextPage(&pCursor->pParent, &pNextPage);
1187     if( rc!=SQLITE_ROW ){
1188       assert( rc==SQLITE_DONE );
1189       return rc;
1190     }
1191
1192     rc = leafCursorLoadPage(pCursor, pNextPage);
1193     if( rc!=SQLITE_OK ){
1194       sqlite3PagerUnref(pNextPage);
1195       return rc;
1196     }
1197   } while( !pCursor->pPage );
1198
1199   return SQLITE_ROW;
1200 }
1201
1202 static void leafCursorDestroyCellData(RecoverLeafCursor *pCursor){
1203   if( pCursor->bFreeRecordHeader ){
1204     sqlite3_free(pCursor->pRecordHeader);
1205   }
1206   pCursor->bFreeRecordHeader = 0;
1207   pCursor->pRecordHeader = NULL;
1208
1209   if( pCursor->pOverflow ){
1210     overflowDestroy(pCursor->pOverflow);
1211     pCursor->pOverflow = NULL;
1212   }
1213 }
1214
1215 static void leafCursorDestroy(RecoverLeafCursor *pCursor){
1216   leafCursorDestroyCellData(pCursor);
1217
1218   if( pCursor->pParent ){
1219     interiorCursorDestroy(pCursor->pParent);
1220     pCursor->pParent = NULL;
1221   }
1222
1223   if( pCursor->pPage ){
1224     sqlite3PagerUnref(pCursor->pPage);
1225     pCursor->pPage = NULL;
1226   }
1227
1228   memset(pCursor, 0xA5, sizeof(*pCursor));
1229   sqlite3_free(pCursor);
1230 }
1231
1232 /* Create a cursor to iterate the rows from the leaf pages of a table
1233  * rooted at iRootPage.
1234  */
1235 /* TODO(shess): recoverOpen() calls this to setup the cursor, and I
1236  * think that recoverFilter() may make a hard assumption that the
1237  * cursor returned will turn up at least one valid cell.
1238  *
1239  * The cases I can think of which break this assumption are:
1240  * - pPage is a valid leaf page with no valid cells.
1241  * - pPage is a valid interior page with no valid leaves.
1242  * - pPage is a valid interior page who's leaves contain no valid cells.
1243  * - pPage is not a valid leaf or interior page.
1244  */
1245 static int leafCursorCreate(Pager *pPager, unsigned nPageSize,
1246                             u32 iRootPage, RecoverLeafCursor **ppCursor){
1247   DbPage *pPage;               /* Reference to page at iRootPage. */
1248   RecoverLeafCursor *pCursor;  /* Leaf cursor being constructed. */
1249   int rc;
1250
1251   /* Start out with the root page. */
1252   rc = sqlite3PagerAcquire(pPager, iRootPage, &pPage, 0);
1253   if( rc!=SQLITE_OK ){
1254     return rc;
1255   }
1256
1257   pCursor = sqlite3_malloc(sizeof(RecoverLeafCursor));
1258   if( !pCursor ){
1259     sqlite3PagerUnref(pPage);
1260     return SQLITE_NOMEM;
1261   }
1262   memset(pCursor, 0, sizeof(*pCursor));
1263
1264   pCursor->nPageSize = nPageSize;
1265
1266   rc = leafCursorLoadPage(pCursor, pPage);
1267   if( rc!=SQLITE_OK ){
1268     sqlite3PagerUnref(pPage);
1269     leafCursorDestroy(pCursor);
1270     return rc;
1271   }
1272
1273   /* pPage wasn't a leaf page, find the next leaf page. */
1274   if( !pCursor->pPage ){
1275     rc = leafCursorNextPage(pCursor);
1276     if( rc!=SQLITE_DONE && rc!=SQLITE_ROW ){
1277       leafCursorDestroy(pCursor);
1278       return rc;
1279     }
1280   }
1281
1282   *ppCursor = pCursor;
1283   return SQLITE_OK;
1284 }
1285
1286 /* Useful for setting breakpoints. */
1287 static int ValidateError(){
1288   return SQLITE_ERROR;
1289 }
1290
1291 /* Setup the cursor for reading the information from cell iCell. */
1292 static int leafCursorCellDecode(RecoverLeafCursor *pCursor){
1293   const unsigned char *pPageHeader;  /* Header of current page. */
1294   const unsigned char *pPageEnd;     /* Byte after end of current page. */
1295   const unsigned char *pCellOffsets; /* Pointer to page's cell offsets. */
1296   unsigned iCellOffset;              /* Offset of current cell (iCell). */
1297   const unsigned char *pCell;        /* Pointer to data at iCellOffset. */
1298   unsigned nCellMaxBytes;            /* Maximum local size of iCell. */
1299   unsigned iEndOffset;               /* End of iCell's in-page data. */
1300   u64 nRecordBytes;                  /* Expected size of cell, w/overflow. */
1301   u64 iRowid;                        /* iCell's rowid (in table). */
1302   unsigned nRead;                    /* Amount of cell read. */
1303   unsigned nRecordHeaderRead;        /* Header data read. */
1304   u64 nRecordHeaderBytes;            /* Header size expected. */
1305   unsigned nRecordCols;              /* Columns read from header. */
1306   u64 nRecordColBytes;               /* Bytes in payload for those columns. */
1307   unsigned i;
1308   int rc;
1309
1310   assert( pCursor->iCell<pCursor->nCells );
1311
1312   leafCursorDestroyCellData(pCursor);
1313
1314   /* Find the offset to the row. */
1315   pPageHeader = PageHeader(pCursor->pPage);
1316   pCellOffsets = pPageHeader + knPageLeafHeaderBytes;
1317   pPageEnd = PageData(pCursor->pPage, pCursor->nPageSize);
1318   if( pCellOffsets + pCursor->iCell*2 + 2 > pPageEnd ){
1319     return ValidateError();
1320   }
1321   iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iCell*2);
1322   if( iCellOffset>=pCursor->nPageSize ){
1323     return ValidateError();
1324   }
1325
1326   pCell = PageData(pCursor->pPage, iCellOffset);
1327   nCellMaxBytes = pCursor->nPageSize - iCellOffset;
1328
1329   /* B-tree leaf cells lead with varint record size, varint rowid and
1330    * varint header size.
1331    */
1332   /* TODO(shess): The smallest page size is 512 bytes, which has an m
1333    * of 39.  Three varints need at most 27 bytes to encode.  I think.
1334    */
1335   if( !checkVarints(pCell, nCellMaxBytes, 3) ){
1336     return ValidateError();
1337   }
1338
1339   nRead = getVarint(pCell, &nRecordBytes);
1340   assert( iCellOffset+nRead<=pCursor->nPageSize );
1341   pCursor->nRecordBytes = nRecordBytes;
1342
1343   nRead += getVarint(pCell + nRead, &iRowid);
1344   assert( iCellOffset+nRead<=pCursor->nPageSize );
1345   pCursor->iRowid = (i64)iRowid;
1346
1347   pCursor->iRecordOffset = iCellOffset + nRead;
1348
1349   /* Start overflow setup here because nLocalRecordBytes is needed to
1350    * check cell overlap.
1351    */
1352   rc = overflowMaybeCreate(pCursor->pPage, pCursor->nPageSize,
1353                            pCursor->iRecordOffset, pCursor->nRecordBytes,
1354                            &pCursor->nLocalRecordBytes,
1355                            &pCursor->pOverflow);
1356   if( rc!=SQLITE_OK ){
1357     return ValidateError();
1358   }
1359
1360   /* Check that no other cell starts within this cell. */
1361   iEndOffset = pCursor->iRecordOffset + pCursor->nLocalRecordBytes;
1362   for( i=0; i<pCursor->nCells && pCellOffsets + i*2 + 2 <= pPageEnd; ++i ){
1363     const unsigned iOtherOffset = decodeUnsigned16(pCellOffsets + i*2);
1364     if( iOtherOffset>iCellOffset && iOtherOffset<iEndOffset ){
1365       return ValidateError();
1366     }
1367   }
1368
1369   nRecordHeaderRead = getVarint(pCell + nRead, &nRecordHeaderBytes);
1370   assert( nRecordHeaderBytes<=nRecordBytes );
1371   pCursor->nRecordHeaderBytes = nRecordHeaderBytes;
1372
1373   /* Large headers could overflow if pages are small. */
1374   rc = overflowGetSegment(pCursor->pPage,
1375                           pCursor->iRecordOffset, pCursor->nLocalRecordBytes,
1376                           pCursor->pOverflow, 0, nRecordHeaderBytes,
1377                           &pCursor->pRecordHeader, &pCursor->bFreeRecordHeader);
1378   if( rc!=SQLITE_OK ){
1379     return ValidateError();
1380   }
1381
1382   /* Tally up the column count and size of data. */
1383   nRecordCols = 0;
1384   nRecordColBytes = 0;
1385   while( nRecordHeaderRead<nRecordHeaderBytes ){
1386     u64 iSerialType;  /* Type descriptor for current column. */
1387     if( !checkVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1388                      nRecordHeaderBytes - nRecordHeaderRead) ){
1389       return ValidateError();
1390     }
1391     nRecordHeaderRead += getVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1392                                    &iSerialType);
1393     if( iSerialType==10 || iSerialType==11 ){
1394       return ValidateError();
1395     }
1396     nRecordColBytes += SerialTypeLength(iSerialType);
1397     nRecordCols++;
1398   }
1399   pCursor->nRecordCols = nRecordCols;
1400
1401   /* Parsing the header used as many bytes as expected. */
1402   if( nRecordHeaderRead!=nRecordHeaderBytes ){
1403     return ValidateError();
1404   }
1405
1406   /* Calculated record is size of expected record. */
1407   if( nRecordHeaderBytes+nRecordColBytes!=nRecordBytes ){
1408     return ValidateError();
1409   }
1410
1411   return SQLITE_OK;
1412 }
1413
1414 static i64 leafCursorCellRowid(RecoverLeafCursor *pCursor){
1415   return pCursor->iRowid;
1416 }
1417
1418 static unsigned leafCursorCellColumns(RecoverLeafCursor *pCursor){
1419   return pCursor->nRecordCols;
1420 }
1421
1422 /* Get the column info for the cell.  Pass NULL for ppBase to prevent
1423  * retrieving the data segment.  If *pbFree is true, *ppBase must be
1424  * freed by the caller using sqlite3_free().
1425  */
1426 static int leafCursorCellColInfo(RecoverLeafCursor *pCursor,
1427                                  unsigned iCol, u64 *piColType,
1428                                  unsigned char **ppBase, int *pbFree){
1429   const unsigned char *pRecordHeader;  /* Current cell's header. */
1430   u64 nRecordHeaderBytes;              /* Bytes in pRecordHeader. */
1431   unsigned nRead;                      /* Bytes read from header. */
1432   u64 iColEndOffset;                   /* Offset to end of column in cell. */
1433   unsigned nColsSkipped;               /* Count columns as procesed. */
1434   u64 iSerialType;                     /* Type descriptor for current column. */
1435
1436   /* Implicit NULL for columns past the end.  This case happens when
1437    * rows have not been updated since an ALTER TABLE added columns.
1438    * It is more convenient to address here than in callers.
1439    */
1440   if( iCol>=pCursor->nRecordCols ){
1441     *piColType = 0;
1442     if( ppBase ){
1443       *ppBase = 0;
1444       *pbFree = 0;
1445     }
1446     return SQLITE_OK;
1447   }
1448
1449   /* Must be able to decode header size. */
1450   pRecordHeader = pCursor->pRecordHeader;
1451   if( !checkVarint(pRecordHeader, pCursor->nRecordHeaderBytes) ){
1452     return SQLITE_CORRUPT;
1453   }
1454
1455   /* Rather than caching the header size and how many bytes it took,
1456    * decode it every time.
1457    */
1458   nRead = getVarint(pRecordHeader, &nRecordHeaderBytes);
1459   assert( nRecordHeaderBytes==pCursor->nRecordHeaderBytes );
1460
1461   /* Scan forward to the indicated column.  Scans to _after_ column
1462    * for later range checking.
1463    */
1464   /* TODO(shess): This could get expensive for very wide tables.  An
1465    * array of iSerialType could be built in leafCursorCellDecode(), but
1466    * the number of columns is dynamic per row, so it would add memory
1467    * management complexity.  Enough info to efficiently forward
1468    * iterate could be kept, if all clients forward iterate
1469    * (recoverColumn() may not).
1470    */
1471   iColEndOffset = 0;
1472   nColsSkipped = 0;
1473   while( nColsSkipped<=iCol && nRead<nRecordHeaderBytes ){
1474     if( !checkVarint(pRecordHeader + nRead, nRecordHeaderBytes - nRead) ){
1475       return SQLITE_CORRUPT;
1476     }
1477     nRead += getVarint(pRecordHeader + nRead, &iSerialType);
1478     iColEndOffset += SerialTypeLength(iSerialType);
1479     nColsSkipped++;
1480   }
1481
1482   /* Column's data extends past record's end. */
1483   if( nRecordHeaderBytes+iColEndOffset>pCursor->nRecordBytes ){
1484     return SQLITE_CORRUPT;
1485   }
1486
1487   *piColType = iSerialType;
1488   if( ppBase ){
1489     const u32 nColBytes = SerialTypeLength(iSerialType);
1490
1491     /* Offset from start of record to beginning of column. */
1492     const unsigned iColOffset = nRecordHeaderBytes+iColEndOffset-nColBytes;
1493
1494     return overflowGetSegment(pCursor->pPage, pCursor->iRecordOffset,
1495                               pCursor->nLocalRecordBytes, pCursor->pOverflow,
1496                               iColOffset, nColBytes, ppBase, pbFree);
1497   }
1498   return SQLITE_OK;
1499 }
1500
1501 static int leafCursorNextValidCell(RecoverLeafCursor *pCursor){
1502   while( 1 ){
1503     int rc;
1504
1505     /* Move to the next cell. */
1506     pCursor->iCell++;
1507
1508     /* No more cells, get the next leaf. */
1509     if( pCursor->iCell>=pCursor->nCells ){
1510       rc = leafCursorNextPage(pCursor);
1511       if( rc!=SQLITE_ROW ){
1512         return rc;
1513       }
1514       assert( pCursor->iCell==0 );
1515     }
1516
1517     /* If the cell is valid, indicate that a row is available. */
1518     rc = leafCursorCellDecode(pCursor);
1519     if( rc==SQLITE_OK ){
1520       return SQLITE_ROW;
1521     }
1522
1523     /* Iterate until done or a valid row is found. */
1524     /* TODO(shess): Remove debugging output. */
1525     fprintf(stderr, "Skipping invalid cell\n");
1526   }
1527   return SQLITE_ERROR;
1528 }
1529
1530 typedef struct Recover Recover;
1531 struct Recover {
1532   sqlite3_vtab base;
1533   sqlite3 *db;                /* Host database connection */
1534   char *zDb;                  /* Database containing target table */
1535   char *zTable;               /* Target table */
1536   unsigned nCols;             /* Number of columns in target table */
1537   unsigned char *pTypes;      /* Types of columns in target table */
1538 };
1539
1540 /* Internal helper for deleting the module. */
1541 static void recoverRelease(Recover *pRecover){
1542   sqlite3_free(pRecover->zDb);
1543   sqlite3_free(pRecover->zTable);
1544   sqlite3_free(pRecover->pTypes);
1545   memset(pRecover, 0xA5, sizeof(*pRecover));
1546   sqlite3_free(pRecover);
1547 }
1548
1549 /* Helper function for initializing the module.  Forward-declared so
1550  * recoverCreate() and recoverConnect() can see it.
1551  */
1552 static int recoverInit(
1553   sqlite3 *, void *, int, const char *const*, sqlite3_vtab **, char **
1554 );
1555
1556 static int recoverCreate(
1557   sqlite3 *db,
1558   void *pAux,
1559   int argc, const char *const*argv,
1560   sqlite3_vtab **ppVtab,
1561   char **pzErr
1562 ){
1563   FNENTRY();
1564   return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1565 }
1566
1567 /* This should never be called. */
1568 static int recoverConnect(
1569   sqlite3 *db,
1570   void *pAux,
1571   int argc, const char *const*argv,
1572   sqlite3_vtab **ppVtab,
1573   char **pzErr
1574 ){
1575   FNENTRY();
1576   return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1577 }
1578
1579 /* No indices supported. */
1580 static int recoverBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
1581   FNENTRY();
1582   return SQLITE_OK;
1583 }
1584
1585 /* Logically, this should never be called. */
1586 static int recoverDisconnect(sqlite3_vtab *pVtab){
1587   FNENTRY();
1588   recoverRelease((Recover*)pVtab);
1589   return SQLITE_OK;
1590 }
1591
1592 static int recoverDestroy(sqlite3_vtab *pVtab){
1593   FNENTRY();
1594   recoverRelease((Recover*)pVtab);
1595   return SQLITE_OK;
1596 }
1597
1598 typedef struct RecoverCursor RecoverCursor;
1599 struct RecoverCursor {
1600   sqlite3_vtab_cursor base;
1601   RecoverLeafCursor *pLeafCursor;
1602   int iEncoding;
1603   int bEOF;
1604 };
1605
1606 static int recoverOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
1607   Recover *pRecover = (Recover*)pVTab;
1608   u32 iRootPage;                   /* Root page of the backing table. */
1609   int iEncoding;                   /* UTF encoding for backing database. */
1610   unsigned nPageSize;              /* Size of pages in backing database. */
1611   Pager *pPager;                   /* Backing database pager. */
1612   RecoverLeafCursor *pLeafCursor;  /* Cursor to read table's leaf pages. */
1613   RecoverCursor *pCursor;          /* Cursor to read rows from leaves. */
1614   int rc;
1615
1616   FNENTRY();
1617
1618   iRootPage = 0;
1619   rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable,
1620                    &iRootPage);
1621   if( rc!=SQLITE_OK ){
1622     return rc;
1623   }
1624
1625   iEncoding = 0;
1626   rc = getEncoding(pRecover->db, pRecover->zDb, &iEncoding);
1627   if( rc!=SQLITE_OK ){
1628     return rc;
1629   }
1630
1631   rc = GetPager(pRecover->db, pRecover->zDb, &pPager, &nPageSize);
1632   if( rc!=SQLITE_OK ){
1633     return rc;
1634   }
1635
1636   rc = leafCursorCreate(pPager, nPageSize, iRootPage, &pLeafCursor);
1637   if( rc!=SQLITE_OK ){
1638     return rc;
1639   }
1640
1641   pCursor = sqlite3_malloc(sizeof(RecoverCursor));
1642   if( !pCursor ){
1643     leafCursorDestroy(pLeafCursor);
1644     return SQLITE_NOMEM;
1645   }
1646   memset(pCursor, 0, sizeof(*pCursor));
1647   pCursor->base.pVtab = pVTab;
1648   pCursor->pLeafCursor = pLeafCursor;
1649   pCursor->iEncoding = iEncoding;
1650
1651   /* If no leaf pages were found, empty result set. */
1652   /* TODO(shess): leafCursorNextValidCell() would return SQLITE_ROW or
1653    * SQLITE_DONE to indicate whether there is further data to consider.
1654    */
1655   pCursor->bEOF = (pLeafCursor->pPage==NULL);
1656
1657   *ppCursor = (sqlite3_vtab_cursor*)pCursor;
1658   return SQLITE_OK;
1659 }
1660
1661 static int recoverClose(sqlite3_vtab_cursor *cur){
1662   RecoverCursor *pCursor = (RecoverCursor*)cur;
1663   FNENTRY();
1664   if( pCursor->pLeafCursor ){
1665     leafCursorDestroy(pCursor->pLeafCursor);
1666     pCursor->pLeafCursor = NULL;
1667   }
1668   memset(pCursor, 0xA5, sizeof(*pCursor));
1669   sqlite3_free(cur);
1670   return SQLITE_OK;
1671 }
1672
1673 /* Helpful place to set a breakpoint. */
1674 static int RecoverInvalidCell(){
1675   return SQLITE_ERROR;
1676 }
1677
1678 /* Returns SQLITE_OK if the cell has an appropriate number of columns
1679  * with the appropriate types of data.
1680  */
1681 static int recoverValidateLeafCell(Recover *pRecover, RecoverCursor *pCursor){
1682   unsigned i;
1683
1684   /* If the row's storage has too many columns, skip it. */
1685   if( leafCursorCellColumns(pCursor->pLeafCursor)>pRecover->nCols ){
1686     return RecoverInvalidCell();
1687   }
1688
1689   /* Skip rows with unexpected types. */
1690   for( i=0; i<pRecover->nCols; ++i ){
1691     u64 iType;  /* Storage type of column i. */
1692     int rc;
1693
1694     /* ROWID alias. */
1695     if( (pRecover->pTypes[i]&MASK_ROWID) ){
1696       continue;
1697     }
1698
1699     rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iType, NULL, NULL);
1700     assert( rc==SQLITE_OK );
1701     if( rc!=SQLITE_OK || !SerialTypeIsCompatible(iType, pRecover->pTypes[i]) ){
1702       return RecoverInvalidCell();
1703     }
1704   }
1705
1706   return SQLITE_OK;
1707 }
1708
1709 static int recoverNext(sqlite3_vtab_cursor *pVtabCursor){
1710   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1711   Recover *pRecover = (Recover*)pCursor->base.pVtab;
1712   int rc;
1713
1714   FNENTRY();
1715
1716   /* Scan forward to the next cell with valid storage, then check that
1717    * the stored data matches the schema.
1718    */
1719   while( (rc = leafCursorNextValidCell(pCursor->pLeafCursor))==SQLITE_ROW ){
1720     if( recoverValidateLeafCell(pRecover, pCursor)==SQLITE_OK ){
1721       return SQLITE_OK;
1722     }
1723   }
1724
1725   if( rc==SQLITE_DONE ){
1726     pCursor->bEOF = 1;
1727     return SQLITE_OK;
1728   }
1729
1730   assert( rc!=SQLITE_OK );
1731   return rc;
1732 }
1733
1734 static int recoverFilter(
1735   sqlite3_vtab_cursor *pVtabCursor,
1736   int idxNum, const char *idxStr,
1737   int argc, sqlite3_value **argv
1738 ){
1739   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1740   Recover *pRecover = (Recover*)pCursor->base.pVtab;
1741   int rc;
1742
1743   FNENTRY();
1744
1745   /* There were no valid leaf pages in the table. */
1746   if( pCursor->bEOF ){
1747     return SQLITE_OK;
1748   }
1749
1750   /* Load the first cell, and iterate forward if it's not valid.  If no cells at
1751    * all are valid, recoverNext() sets bEOF and returns appropriately.
1752    */
1753   rc = leafCursorCellDecode(pCursor->pLeafCursor);
1754   if( rc!=SQLITE_OK || recoverValidateLeafCell(pRecover, pCursor)!=SQLITE_OK ){
1755     return recoverNext(pVtabCursor);
1756   }
1757
1758   return SQLITE_OK;
1759 }
1760
1761 static int recoverEof(sqlite3_vtab_cursor *pVtabCursor){
1762   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1763   FNENTRY();
1764   return pCursor->bEOF;
1765 }
1766
1767 static int recoverColumn(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
1768   RecoverCursor *pCursor = (RecoverCursor*)cur;
1769   Recover *pRecover = (Recover*)pCursor->base.pVtab;
1770   u64 iColType;             /* Storage type of column i. */
1771   unsigned char *pColData;  /* Column i's data. */
1772   int shouldFree;           /* Non-zero if pColData should be freed. */
1773   int rc;
1774
1775   FNENTRY();
1776
1777   if( i>=pRecover->nCols ){
1778     return SQLITE_ERROR;
1779   }
1780
1781   /* ROWID alias. */
1782   if( (pRecover->pTypes[i]&MASK_ROWID) ){
1783     sqlite3_result_int64(ctx, leafCursorCellRowid(pCursor->pLeafCursor));
1784     return SQLITE_OK;
1785   }
1786
1787   pColData = NULL;
1788   shouldFree = 0;
1789   rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iColType,
1790                              &pColData, &shouldFree);
1791   if( rc!=SQLITE_OK ){
1792     return rc;
1793   }
1794   /* recoverValidateLeafCell() should guarantee that this will never
1795    * occur.
1796    */
1797   if( !SerialTypeIsCompatible(iColType, pRecover->pTypes[i]) ){
1798     if( shouldFree ){
1799       sqlite3_free(pColData);
1800     }
1801     return SQLITE_ERROR;
1802   }
1803
1804   switch( iColType ){
1805     case 0 : sqlite3_result_null(ctx); break;
1806     case 1 : sqlite3_result_int64(ctx, decodeSigned(pColData, 1)); break;
1807     case 2 : sqlite3_result_int64(ctx, decodeSigned(pColData, 2)); break;
1808     case 3 : sqlite3_result_int64(ctx, decodeSigned(pColData, 3)); break;
1809     case 4 : sqlite3_result_int64(ctx, decodeSigned(pColData, 4)); break;
1810     case 5 : sqlite3_result_int64(ctx, decodeSigned(pColData, 6)); break;
1811     case 6 : sqlite3_result_int64(ctx, decodeSigned(pColData, 8)); break;
1812     case 7 : sqlite3_result_double(ctx, decodeFloat64(pColData)); break;
1813     case 8 : sqlite3_result_int(ctx, 0); break;
1814     case 9 : sqlite3_result_int(ctx, 1); break;
1815     case 10 : assert( iColType!=10 ); break;
1816     case 11 : assert( iColType!=11 ); break;
1817
1818     default : {
1819       u32 l = SerialTypeLength(iColType);
1820
1821       /* If pColData was already allocated, arrange to pass ownership. */
1822       sqlite3_destructor_type pFn = SQLITE_TRANSIENT;
1823       if( shouldFree ){
1824         pFn = sqlite3_free;
1825         shouldFree = 0;
1826       }
1827
1828       if( SerialTypeIsBlob(iColType) ){
1829         sqlite3_result_blob(ctx, pColData, l, pFn);
1830       }else{
1831         if( pCursor->iEncoding==SQLITE_UTF16LE ){
1832           sqlite3_result_text16le(ctx, (const void*)pColData, l, pFn);
1833         }else if( pCursor->iEncoding==SQLITE_UTF16BE ){
1834           sqlite3_result_text16be(ctx, (const void*)pColData, l, pFn);
1835         }else{
1836           sqlite3_result_text(ctx, (const char*)pColData, l, pFn);
1837         }
1838       }
1839     } break;
1840   }
1841   if( shouldFree ){
1842     sqlite3_free(pColData);
1843   }
1844   return SQLITE_OK;
1845 }
1846
1847 static int recoverRowid(sqlite3_vtab_cursor *pVtabCursor, sqlite_int64 *pRowid){
1848   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1849   FNENTRY();
1850   *pRowid = leafCursorCellRowid(pCursor->pLeafCursor);
1851   return SQLITE_OK;
1852 }
1853
1854 static sqlite3_module recoverModule = {
1855   0,                         /* iVersion */
1856   recoverCreate,             /* xCreate - create a table */
1857   recoverConnect,            /* xConnect - connect to an existing table */
1858   recoverBestIndex,          /* xBestIndex - Determine search strategy */
1859   recoverDisconnect,         /* xDisconnect - Disconnect from a table */
1860   recoverDestroy,            /* xDestroy - Drop a table */
1861   recoverOpen,               /* xOpen - open a cursor */
1862   recoverClose,              /* xClose - close a cursor */
1863   recoverFilter,             /* xFilter - configure scan constraints */
1864   recoverNext,               /* xNext - advance a cursor */
1865   recoverEof,                /* xEof */
1866   recoverColumn,             /* xColumn - read data */
1867   recoverRowid,              /* xRowid - read data */
1868   0,                         /* xUpdate - write data */
1869   0,                         /* xBegin - begin transaction */
1870   0,                         /* xSync - sync transaction */
1871   0,                         /* xCommit - commit transaction */
1872   0,                         /* xRollback - rollback transaction */
1873   0,                         /* xFindFunction - function overloading */
1874   0,                         /* xRename - rename the table */
1875 };
1876
1877 int recoverVtableInit(sqlite3 *db){
1878   return sqlite3_create_module_v2(db, "recover", &recoverModule, NULL, 0);
1879 }
1880
1881 /* This section of code is for parsing the create input and
1882  * initializing the module.
1883  */
1884
1885 /* Find the next word in zText and place the endpoints in pzWord*.
1886  * Returns true if the word is non-empty.  "Word" is defined as
1887  * ASCII alphanumeric plus '_' at this time.
1888  */
1889 static int findWord(const char *zText,
1890                     const char **pzWordStart, const char **pzWordEnd){
1891   int r;
1892   while( ascii_isspace(*zText) ){
1893     zText++;
1894   }
1895   *pzWordStart = zText;
1896   while( ascii_isalnum(*zText) || *zText=='_' ){
1897     zText++;
1898   }
1899   r = zText>*pzWordStart;  /* In case pzWordStart==pzWordEnd */
1900   *pzWordEnd = zText;
1901   return r;
1902 }
1903
1904 /* Return true if the next word in zText is zWord, also setting
1905  * *pzContinue to the character after the word.
1906  */
1907 static int expectWord(const char *zText, const char *zWord,
1908                       const char **pzContinue){
1909   const char *zWordStart, *zWordEnd;
1910   if( findWord(zText, &zWordStart, &zWordEnd) &&
1911       ascii_strncasecmp(zWord, zWordStart, zWordEnd - zWordStart)==0 ){
1912     *pzContinue = zWordEnd;
1913     return 1;
1914   }
1915   return 0;
1916 }
1917
1918 /* Parse the name and type information out of parameter.  In case of
1919  * success, *pzNameStart/End contain the name of the column,
1920  * *pzTypeStart/End contain the top-level type, and *pTypeMask has the
1921  * type mask to use for the column.
1922  */
1923 static int findNameAndType(const char *parameter,
1924                            const char **pzNameStart, const char **pzNameEnd,
1925                            const char **pzTypeStart, const char **pzTypeEnd,
1926                            unsigned char *pTypeMask){
1927   unsigned nNameLen;   /* Length of found name. */
1928   const char *zEnd;    /* Current end of parsed column information. */
1929   int bNotNull;        /* Non-zero if NULL is not allowed for name. */
1930   int bStrict;         /* Non-zero if column requires exact type match. */
1931   const char *zDummy;  /* Dummy parameter, result unused. */
1932   unsigned i;
1933
1934   /* strictMask is used for STRICT, strictMask|otherMask if STRICT is
1935    * not supplied.  zReplace provides an alternate type to expose to
1936    * the caller.
1937    */
1938   static struct {
1939     const char *zName;
1940     unsigned char strictMask;
1941     unsigned char otherMask;
1942     const char *zReplace;
1943   } kTypeInfo[] = {
1944     { "ANY",
1945       MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL,
1946       0, "",
1947     },
1948     { "ROWID",   MASK_INTEGER | MASK_ROWID,             0, "INTEGER", },
1949     { "INTEGER", MASK_INTEGER | MASK_NULL,              0, NULL, },
1950     { "FLOAT",   MASK_FLOAT | MASK_NULL,                MASK_INTEGER, NULL, },
1951     { "NUMERIC", MASK_INTEGER | MASK_FLOAT | MASK_NULL, MASK_TEXT, NULL, },
1952     { "TEXT",    MASK_TEXT | MASK_NULL,                 MASK_BLOB, NULL, },
1953     { "BLOB",    MASK_BLOB | MASK_NULL,                 0, NULL, },
1954   };
1955
1956   if( !findWord(parameter, pzNameStart, pzNameEnd) ){
1957     return SQLITE_MISUSE;
1958   }
1959
1960   /* Manifest typing, accept any storage type. */
1961   if( !findWord(*pzNameEnd, pzTypeStart, pzTypeEnd) ){
1962     *pzTypeEnd = *pzTypeStart = "";
1963     *pTypeMask = MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL;
1964     return SQLITE_OK;
1965   }
1966
1967   nNameLen = *pzTypeEnd - *pzTypeStart;
1968   for( i=0; i<ArraySize(kTypeInfo); ++i ){
1969     if( ascii_strncasecmp(kTypeInfo[i].zName, *pzTypeStart, nNameLen)==0 ){
1970       break;
1971     }
1972   }
1973   if( i==ArraySize(kTypeInfo) ){
1974     return SQLITE_MISUSE;
1975   }
1976
1977   zEnd = *pzTypeEnd;
1978   bStrict = 0;
1979   if( expectWord(zEnd, "STRICT", &zEnd) ){
1980     /* TODO(shess): Ick.  But I don't want another single-purpose
1981      * flag, either.
1982      */
1983     if( kTypeInfo[i].zReplace && !kTypeInfo[i].zReplace[0] ){
1984       return SQLITE_MISUSE;
1985     }
1986     bStrict = 1;
1987   }
1988
1989   bNotNull = 0;
1990   if( expectWord(zEnd, "NOT", &zEnd) ){
1991     if( expectWord(zEnd, "NULL", &zEnd) ){
1992       bNotNull = 1;
1993     }else{
1994       /* Anything other than NULL after NOT is an error. */
1995       return SQLITE_MISUSE;
1996     }
1997   }
1998
1999   /* Anything else is an error. */
2000   if( findWord(zEnd, &zDummy, &zDummy) ){
2001     return SQLITE_MISUSE;
2002   }
2003
2004   *pTypeMask = kTypeInfo[i].strictMask;
2005   if( !bStrict ){
2006     *pTypeMask |= kTypeInfo[i].otherMask;
2007   }
2008   if( bNotNull ){
2009     *pTypeMask &= ~MASK_NULL;
2010   }
2011   if( kTypeInfo[i].zReplace ){
2012     *pzTypeStart = kTypeInfo[i].zReplace;
2013     *pzTypeEnd = *pzTypeStart + strlen(*pzTypeStart);
2014   }
2015   return SQLITE_OK;
2016 }
2017
2018 /* Parse the arguments, placing type masks in *pTypes and the exposed
2019  * schema in *pzCreateSql (for sqlite3_declare_vtab).
2020  */
2021 static int ParseColumnsAndGenerateCreate(unsigned nCols,
2022                                          const char *const *pCols,
2023                                          char **pzCreateSql,
2024                                          unsigned char *pTypes,
2025                                          char **pzErr){
2026   unsigned i;
2027   char *zCreateSql = sqlite3_mprintf("CREATE TABLE x(");
2028   if( !zCreateSql ){
2029     return SQLITE_NOMEM;
2030   }
2031
2032   for( i=0; i<nCols; i++ ){
2033     const char *zSep = (i < nCols - 1 ? ", " : ")");
2034     const char *zNotNull = "";
2035     const char *zNameStart, *zNameEnd;
2036     const char *zTypeStart, *zTypeEnd;
2037     int rc = findNameAndType(pCols[i],
2038                              &zNameStart, &zNameEnd,
2039                              &zTypeStart, &zTypeEnd,
2040                              &pTypes[i]);
2041     if( rc!=SQLITE_OK ){
2042       *pzErr = sqlite3_mprintf("unable to parse column %d", i);
2043       sqlite3_free(zCreateSql);
2044       return rc;
2045     }
2046
2047     if( !(pTypes[i]&MASK_NULL) ){
2048       zNotNull = " NOT NULL";
2049     }
2050
2051     /* Add name and type to the create statement. */
2052     zCreateSql = sqlite3_mprintf("%z%.*s %.*s%s%s",
2053                                  zCreateSql,
2054                                  zNameEnd - zNameStart, zNameStart,
2055                                  zTypeEnd - zTypeStart, zTypeStart,
2056                                  zNotNull, zSep);
2057     if( !zCreateSql ){
2058       return SQLITE_NOMEM;
2059     }
2060   }
2061
2062   *pzCreateSql = zCreateSql;
2063   return SQLITE_OK;
2064 }
2065
2066 /* Helper function for initializing the module. */
2067 /* argv[0] module name
2068  * argv[1] db name for virtual table
2069  * argv[2] virtual table name
2070  * argv[3] backing table name
2071  * argv[4] columns
2072  */
2073 /* TODO(shess): Since connect isn't supported, could inline into
2074  * recoverCreate().
2075  */
2076 /* TODO(shess): Explore cases where it would make sense to set *pzErr. */
2077 static int recoverInit(
2078   sqlite3 *db,                        /* Database connection */
2079   void *pAux,                         /* unused */
2080   int argc, const char *const*argv,   /* Parameters to CREATE TABLE statement */
2081   sqlite3_vtab **ppVtab,              /* OUT: New virtual table */
2082   char **pzErr                        /* OUT: Error message, if any */
2083 ){
2084   const unsigned kTypeCol = 4;  /* First argument with column type info. */
2085   Recover *pRecover;            /* Virtual table structure being created. */
2086   char *zDot;                   /* Any dot found in "db.table" backing. */
2087   u32 iRootPage;                /* Root page of backing table. */
2088   char *zCreateSql;             /* Schema of created virtual table. */
2089   int rc;
2090
2091   /* Require to be in the temp database. */
2092   if( ascii_strcasecmp(argv[1], "temp")!=0 ){
2093     *pzErr = sqlite3_mprintf("recover table must be in temp database");
2094     return SQLITE_MISUSE;
2095   }
2096
2097   /* Need the backing table and at least one column. */
2098   if( argc<=kTypeCol ){
2099     *pzErr = sqlite3_mprintf("no columns specified");
2100     return SQLITE_MISUSE;
2101   }
2102
2103   pRecover = sqlite3_malloc(sizeof(Recover));
2104   if( !pRecover ){
2105     return SQLITE_NOMEM;
2106   }
2107   memset(pRecover, 0, sizeof(*pRecover));
2108   pRecover->base.pModule = &recoverModule;
2109   pRecover->db = db;
2110
2111   /* Parse out db.table, assuming main if no dot. */
2112   zDot = strchr(argv[3], '.');
2113   if( !zDot ){
2114     pRecover->zDb = sqlite3_strdup(db->aDb[0].zName);
2115     pRecover->zTable = sqlite3_strdup(argv[3]);
2116   }else if( zDot>argv[3] && zDot[1]!='\0' ){
2117     pRecover->zDb = sqlite3_strndup(argv[3], zDot - argv[3]);
2118     pRecover->zTable = sqlite3_strdup(zDot + 1);
2119   }else{
2120     /* ".table" or "db." not allowed. */
2121     *pzErr = sqlite3_mprintf("ill-formed table specifier");
2122     recoverRelease(pRecover);
2123     return SQLITE_ERROR;
2124   }
2125
2126   pRecover->nCols = argc - kTypeCol;
2127   pRecover->pTypes = sqlite3_malloc(pRecover->nCols);
2128   if( !pRecover->zDb || !pRecover->zTable || !pRecover->pTypes ){
2129     recoverRelease(pRecover);
2130     return SQLITE_NOMEM;
2131   }
2132
2133   /* Require the backing table to exist. */
2134   /* TODO(shess): Be more pedantic about the form of the descriptor
2135    * string.  This already fails for poorly-formed strings, simply
2136    * because there won't be a root page, but it would make more sense
2137    * to be explicit.
2138    */
2139   rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable, &iRootPage);
2140   if( rc!=SQLITE_OK ){
2141     *pzErr = sqlite3_mprintf("unable to find backing table");
2142     recoverRelease(pRecover);
2143     return rc;
2144   }
2145
2146   /* Parse the column definitions. */
2147   rc = ParseColumnsAndGenerateCreate(pRecover->nCols, argv + kTypeCol,
2148                                      &zCreateSql, pRecover->pTypes, pzErr);
2149   if( rc!=SQLITE_OK ){
2150     recoverRelease(pRecover);
2151     return rc;
2152   }
2153
2154   rc = sqlite3_declare_vtab(db, zCreateSql);
2155   sqlite3_free(zCreateSql);
2156   if( rc!=SQLITE_OK ){
2157     recoverRelease(pRecover);
2158     return rc;
2159   }
2160
2161   *ppVtab = (sqlite3_vtab *)pRecover;
2162   return SQLITE_OK;
2163 }