lang/sql/sqlite/test/fts3rnd.test

   1 # 2009 December 03
   2 #
   3 #    May you do good and not evil.
   4 #    May you find forgiveness for yourself and forgive others.
   5 #    May you share freely, never taking more than you give.
   6 #
   7 #***********************************************************************
   8 #
   9 # Brute force (random data) tests for FTS3.
  10 #
  11
  12 #-------------------------------------------------------------------------
  13 #
  14 # The FTS3 tests implemented in this file focus on testing that FTS3
  15 # returns the correct set of documents for various types of full-text
  16 # query. This is done using pseudo-randomly generated data and queries.
  17 # The expected result of each query is calculated using Tcl code.
  18 #
  19 #   1. The database is initialized to contain a single table with three
  20 #      columns. 100 rows are inserted into the table. Each of the three
  21 #      values in each row is a document consisting of between 0 and 100
  22 #      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
  23 #
  24 #   2. The following is performed 100 times:
  25 #
  26 #      a. A row is inserted into the database. The row contents are
  27 #         generated as in step 1. The docid is a pseudo-randomly selected
  28 #         value between 0 and 1000000.
  29 #
  30 #      b. A psuedo-randomly selected row is updated. One of its columns is
  31 #         set to contain a new document generated in the same way as the
  32 #         documents in step 1.
  33 #
  34 #      c. A psuedo-randomly selected row is deleted.
  35 #
  36 #      d. For each of several types of fts3 queries, 10 SELECT queries
  37 #         of the form:
  38 #
  39 #           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
  40 #
  41 #         are evaluated. The results are compared to those calculated by
  42 #         Tcl code in this file. The patterns used for the different query
  43 #         types are:
  44 #
  45 #           1.  query = <term>
  46 #           2.  query = <prefix>
  47 #           3.  query = "<term> <term>"
  48 #           4.  query = "<term> <term> <term>"
  49 #           5.  query = "<prefix> <prefix> <prefix>"
  50 #           6.  query = <term> NEAR <term>
  51 #           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
  52 #           8.  query = <term> OR <term>
  53 #           9.  query = <term> NOT <term>
  54 #           10. query = <term> AND <term>
  55 #           11. query = <term> NEAR <term> OR <term> NEAR <term>
  56 #           12. query = <term> NEAR <term> NOT <term> NEAR <term>
  57 #           13. query = <term> NEAR <term> AND <term> NEAR <term>
  58 #
  59 #         where <term> is a term psuedo-randomly selected from the vocabulary
  60 #         and prefix is the first 2 characters of such a term followed by
  61 #         a "*" character.
  62 #
  63 #      Every second iteration, steps (a) through (d) above are performed
  64 #      within a single transaction. This forces the queries in (d) to
  65 #      read data from both the database and the in-memory hash table
  66 #      that caches the full-text index entries created by steps (a), (b)
  67 #      and (c) until the transaction is committed.
  68 #
  69 # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
  70 # 500, 1000 and 2000 bytes.
  71 #
  72 # After the test using an advisory node-size of 50, an OOM test is run using
  73 # the database. This test is similar to step (d) above, except that it tests
  74 # the effects of transient and persistent OOM conditions encountered while
  75 # executing each query.
  76 #
  77
  78 set testdir [file dirname $argv0]
  79 source $testdir/tester.tcl
  80
  81 # If this build does not include FTS3, skip the tests in this file.
  82 #
  83 ifcapable !fts3 { finish_test ; return }
  84 source $testdir/fts3_common.tcl
  85 source $testdir/malloc_common.tcl
  86
  87 set G(nVocab) 100
  88
  89 set nVocab 100
  90 set lVocab [list]
  91
  92 expr srand(0)
  93
  94 # Generate a vocabulary of nVocab words. Each word is 3 characters long.
  95 #
  96 set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
  97 for {set i 0} {$i < $nVocab} {incr i} {
  98   set len [expr int(rand()*3)+2]
  99   set    word [lindex $lChar [expr int(rand()*26)]]
 100   append word [lindex $lChar [expr int(rand()*26)]]
 101   if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
 102   if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
 103   lappend lVocab $word
 104 }
 105
 106 proc random_term {} {
 107   lindex $::lVocab [expr {int(rand()*$::nVocab)}]
 108 }
 109
 110 # Return a document consisting of $nWord arbitrarily selected terms
 111 # from the $::lVocab list.
 112 #
 113 proc generate_doc {nWord} {
 114   set doc [list]
 115   for {set i 0} {$i < $nWord} {incr i} {
 116     lappend doc [random_term]
 117   }
 118   return $doc
 119 }
 120
 121
 122
 123 # Primitives to update the table.
 124 #
 125 unset -nocomplain t1
 126 proc insert_row {rowid} {
 127   set a [generate_doc [expr int((rand()*100))]]
 128   set b [generate_doc [expr int((rand()*100))]]
 129   set c [generate_doc [expr int((rand()*100))]]
 130   execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
 131   set ::t1($rowid) [list $a $b $c]
 132 }
 133 proc delete_row {rowid} {
 134   execsql { DELETE FROM t1 WHERE rowid = $rowid }
 135   catch {unset ::t1($rowid)}
 136 }
 137 proc update_row {rowid} {
 138   set cols {a b c}
 139   set iCol [expr int(rand()*3)]
 140   set doc  [generate_doc [expr int((rand()*100))]]
 141   lset ::t1($rowid) $iCol $doc
 142   execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
 143 }
 144
 145 proc simple_phrase {zPrefix} {
 146   set ret [list]
 147
 148   set reg [string map {* {[^ ]*}} $zPrefix]
 149   set reg " $reg "
 150
 151   foreach key [lsort -integer [array names ::t1]] {
 152     set value $::t1($key)
 153     set cnt [list]
 154     foreach col $value {
 155       if {[regexp $reg " $col "]} { lappend ret $key ; break }
 156     }
 157   }
 158
 159   #lsort -uniq -integer $ret
 160   set ret
 161 }
 162
 163 # This [proc] is used to test the FTS3 matchinfo() function.
 164 #
 165 proc simple_token_matchinfo {zToken} {
 166
 167   set nDoc(0) 0
 168   set nDoc(1) 0
 169   set nDoc(2) 0
 170   set nHit(0) 0
 171   set nHit(1) 0
 172   set nHit(2) 0
 173
 174
 175   foreach key [array names ::t1] {
 176     set value $::t1($key)
 177     set a($key) [list]
 178     foreach i {0 1 2} col $value {
 179       set hit [llength [lsearch -all $col $zToken]]
 180       lappend a($key) $hit
 181       incr nHit($i) $hit
 182       if {$hit>0} { incr nDoc($i) }
 183     }
 184   }
 185
 186   set ret [list]
 187   foreach docid [lsort -integer [array names a]] {
 188     if { [lindex [lsort -integer $a($docid)] end] } {
 189       set matchinfo [list 1 3]
 190       foreach i {0 1 2} hit $a($docid) {
 191         lappend matchinfo $hit $nHit($i) $nDoc($i)
 192       }
 193       lappend ret $docid $matchinfo
 194     }
 195   }
 196
 197   set ret
 198 }
 199
 200 proc simple_near {termlist nNear} {
 201   set ret [list]
 202
 203   foreach {key value} [array get ::t1] {
 204     foreach v $value {
 205
 206       set l [lsearch -exact -all $v [lindex $termlist 0]]
 207       foreach T [lrange $termlist 1 end] {
 208         set l2 [list]
 209         foreach i $l {
 210           set iStart [expr $i - $nNear - 1]
 211           set iEnd [expr $i + $nNear + 1]
 212           if {$iStart < 0} {set iStart 0}
 213           foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
 214             incr i2 $iStart
 215             if {$i2 != $i} { lappend l2 $i2 }
 216           }
 217         }
 218         set l [lsort -uniq -integer $l2]
 219       }
 220
 221       if {[llength $l]} {
 222 #puts "MATCH($key): $v"
 223         lappend ret $key
 224       }
 225     }
 226   }
 227
 228   lsort -unique -integer $ret
 229 }
 230
 231 # The following three procs:
 232 #
 233 #   setup_not A B
 234 #   setup_or  A B
 235 #   setup_and A B
 236 #
 237 # each take two arguments. Both arguments must be lists of integer values
 238 # sorted by value. The return value is the list produced by evaluating
 239 # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
 240 # AND.
 241 #
 242 proc setop_not {A B} {
 243   foreach b $B { set n($b) {} }
 244   set ret [list]
 245   foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
 246   return $ret
 247 }
 248 proc setop_or {A B} {
 249   lsort -integer -uniq [concat $A $B]
 250 }
 251 proc setop_and {A B} {
 252   foreach b $B { set n($b) {} }
 253   set ret [list]
 254   foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
 255   return $ret
 256 }
 257
 258 proc mit {blob} {
 259   set scan(littleEndian) i*
 260   set scan(bigEndian) I*
 261   binary scan $blob $scan($::tcl_platform(byteOrder)) r
 262   return $r
 263 }
 264 db func mit mit
 265
 266 set sqlite_fts3_enable_parentheses 1
 267
 268 foreach nodesize {50 500 1000 2000} {
 269   catch { array unset ::t1 }
 270
 271   # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
 272   #
 273   db transaction {
 274     catchsql { DROP TABLE t1 }
 275     execsql "CREATE VIRTUAL TABLE t1 USING fts3(a, b, c)"
 276     execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
 277     for {set i 0} {$i < 100} {incr i} { insert_row $i }
 278   }
 279
 280   for {set iTest 1} {$iTest <= 100} {incr iTest} {
 281     catchsql COMMIT
 282
 283     set DO_MALLOC_TEST 0
 284     set nRep 10
 285     if {$iTest==100 && $nodesize==50} {
 286       set DO_MALLOC_TEST 1
 287       set nRep 2
 288     }
 289
 290     # Delete one row, update one row and insert one row.
 291     #
 292     set rows [array names ::t1]
 293     set nRow [llength $rows]
 294     set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
 295     set iDelete $iUpdate
 296     while {$iDelete == $iUpdate} {
 297       set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
 298     }
 299     set iInsert $iUpdate
 300     while {[info exists ::t1($iInsert)]} {
 301       set iInsert [expr {int(rand()*1000000)}]
 302     }
 303     execsql BEGIN
 304       insert_row $iInsert
 305       update_row $iUpdate
 306       delete_row $iDelete
 307     if {0==($iTest%2)} { execsql COMMIT }
 308
 309     if {0==($iTest%2)} {
 310       do_test fts3rnd-1.$nodesize.$iTest.0 { fts3_integrity_check t1 } ok
 311     }
 312
 313     # Pick 10 terms from the vocabulary. Check that the results of querying
 314     # the database for the set of documents containing each of these terms
 315     # is the same as the result obtained by scanning the contents of the Tcl
 316     # array for each term.
 317     #
 318     for {set i 0} {$i < 10} {incr i} {
 319       set term [random_term]
 320       do_select_test fts3rnd-1.$nodesize.$iTest.1.$i {
 321         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
 322       } [simple_token_matchinfo $term]
 323     }
 324
 325     # This time, use the first two characters of each term as a term prefix
 326     # to query for. Test that querying the Tcl array produces the same results
 327     # as querying the FTS3 table for the prefix.
 328     #
 329     for {set i 0} {$i < $nRep} {incr i} {
 330       set prefix [string range [random_term] 0 end-1]
 331       set match "${prefix}*"
 332       do_select_test fts3rnd-1.$nodesize.$iTest.2.$i {
 333         SELECT docid FROM t1 WHERE t1 MATCH $match
 334       } [simple_phrase $match]
 335     }
 336
 337     # Similar to the above, except for phrase queries.
 338     #
 339     for {set i 0} {$i < $nRep} {incr i} {
 340       set term [list [random_term] [random_term]]
 341       set match "\"$term\""
 342       do_select_test fts3rnd-1.$nodesize.$iTest.3.$i {
 343         SELECT docid FROM t1 WHERE t1 MATCH $match
 344       } [simple_phrase $term]
 345     }
 346
 347     # Three word phrases.
 348     #
 349     for {set i 0} {$i < $nRep} {incr i} {
 350       set term [list [random_term] [random_term] [random_term]]
 351       set match "\"$term\""
 352       do_select_test fts3rnd-1.$nodesize.$iTest.4.$i {
 353         SELECT docid FROM t1 WHERE t1 MATCH $match
 354       } [simple_phrase $term]
 355     }
 356
 357     # Three word phrases made up of term-prefixes.
 358     #
 359     for {set i 0} {$i < $nRep} {incr i} {
 360       set    query "[string range [random_term] 0 end-1]* "
 361       append query "[string range [random_term] 0 end-1]* "
 362       append query "[string range [random_term] 0 end-1]*"
 363
 364       set match "\"$query\""
 365       do_select_test fts3rnd-1.$nodesize.$iTest.5.$i {
 366         SELECT docid FROM t1 WHERE t1 MATCH $match
 367       } [simple_phrase $query]
 368     }
 369
 370     # A NEAR query with terms as the arguments.
 371     #
 372     for {set i 0} {$i < $nRep} {incr i} {
 373       set terms [list [random_term] [random_term]]
 374       set match [join $terms " NEAR "]
 375       do_select_test fts3rnd-1.$nodesize.$iTest.6.$i {
 376         SELECT docid FROM t1 WHERE t1 MATCH $match
 377       } [simple_near $terms 10]
 378     }
 379
 380     # A 3-way NEAR query with terms as the arguments.
 381     #
 382     for {set i 0} {$i < $nRep} {incr i} {
 383       set terms [list [random_term] [random_term] [random_term]]
 384       set nNear 11
 385       set match [join $terms " NEAR/$nNear "]
 386       do_select_test fts3rnd-1.$nodesize.$iTest.7.$i {
 387         SELECT docid FROM t1 WHERE t1 MATCH $match
 388       } [simple_near $terms $nNear]
 389     }
 390
 391     # Set operations on simple term queries.
 392     #
 393     foreach {tn op proc} {
 394       8  OR  setop_or
 395       9  NOT setop_not
 396       10 AND setop_and
 397     } {
 398       for {set i 0} {$i < $nRep} {incr i} {
 399         set term1 [random_term]
 400         set term2 [random_term]
 401         set match "$term1 $op $term2"
 402         do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i {
 403           SELECT docid FROM t1 WHERE t1 MATCH $match
 404         } [$proc [simple_phrase $term1] [simple_phrase $term2]]
 405       }
 406     }
 407
 408     # Set operations on NEAR queries.
 409     #
 410     foreach {tn op proc} {
 411       8  OR  setop_or
 412       9  NOT setop_not
 413       10 AND setop_and
 414     } {
 415       for {set i 0} {$i < $nRep} {incr i} {
 416         set term1 [random_term]
 417         set term2 [random_term]
 418         set term3 [random_term]
 419         set term4 [random_term]
 420         set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
 421         do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i {
 422           SELECT docid FROM t1 WHERE t1 MATCH $match
 423         } [$proc                                  \
 424             [simple_near [list $term1 $term2] 10] \
 425             [simple_near [list $term3 $term4] 10]
 426           ]
 427       }
 428     }
 429
 430     catchsql COMMIT
 431   }
 432 }
 433
 434 finish_test