doc/index.py

   1 #!/usr/bin/python -u
   2 #
   3 # imports the API description and fills up a database with
   4 # name relevance to modules, functions or web pages
   5 #
   6 # Operation needed:
   7 # =================
   8 #
   9 # install mysqld, the python wrappers for mysql and libxml2, start mysqld
  10 # Change the root passwd of mysql:
  11 #    mysqladmin -u root password new_password
  12 # Create the new database xmlsoft
  13 #    mysqladmin -p create xmlsoft
   14 # Create a database user 'veillard' and give him password access
  15 # change veillard and abcde with the right user name and passwd
  16 #    mysql -p
  17 #    password:
  18 #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
  19 #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
  20 #
  21 # As the user check the access:
  22 #    mysql -p xmlsoft
  23 #    Enter password:
  24 #    Welcome to the MySQL monitor....
  25 #    mysql> use xmlsoft
  26 #    Database changed
  27 #    mysql> quit
  28 #    Bye
  29 #
   30 # Then run the script in the doc subdir; it will create the symbols and
   31 # word tables and populate them with information extracted from
   32 # the libxml2-api.xml API description, and make them accessible read-only
   33 # by nobody@localhost, the user Apache is expected to run as
  34 #
  35 # On the Apache configuration, make sure you have php support enabled
  36 #
  37
  38 import MySQLdb
  39 import libxml2
  40 import sys
  41 import string
  42 import os
  43
  44 #
  45 # We are not interested in parsing errors here
  46 #
  47 def callback(ctx, str):
  48     return
  49 libxml2.registerErrorHandler(callback, None)
  50
  51 #
   52 # The dictionary of tables required and the SQL command needed
  53 # to create them
  54 #
  55 TABLES={
  56   "symbols" : """CREATE TABLE symbols (
  57            name varchar(255) BINARY NOT NULL,
  58            module varchar(255) BINARY NOT NULL,
  59            type varchar(25) NOT NULL,
  60            descr varchar(255),
  61            UNIQUE KEY name (name),
  62            KEY module (module))""",
  63   "words" : """CREATE TABLE words (
  64            name varchar(50) BINARY NOT NULL,
  65            symbol varchar(255) BINARY NOT NULL,
  66            relevance int,
  67            KEY name (name),
  68            KEY symbol (symbol),
  69            UNIQUE KEY ID (name, symbol))""",
  70   "wordsHTML" : """CREATE TABLE wordsHTML (
  71            name varchar(50) BINARY NOT NULL,
  72            resource varchar(255) BINARY NOT NULL,
  73            section varchar(255),
  74            id varchar(50),
  75            relevance int,
  76            KEY name (name),
  77            KEY resource (resource),
  78            UNIQUE KEY ref (name, resource))""",
  79   "wordsArchive" : """CREATE TABLE wordsArchive (
  80            name varchar(50) BINARY NOT NULL,
  81            ID int(11) NOT NULL,
  82            relevance int,
  83            KEY name (name),
  84            UNIQUE KEY ref (name, ID))""",
  85   "pages" : """CREATE TABLE pages (
  86            resource varchar(255) BINARY NOT NULL,
  87            title varchar(255) BINARY NOT NULL,
  88            UNIQUE KEY name (resource))""",
  89   "archives" : """CREATE TABLE archives (
  90            ID int(11) NOT NULL auto_increment,
  91            resource varchar(255) BINARY NOT NULL,
  92            title varchar(255) BINARY NOT NULL,
  93            UNIQUE KEY id (ID,resource(255)),
  94            INDEX (ID),
  95            INDEX (resource))""",
  96   "Queries" : """CREATE TABLE Queries (
  97            ID int(11) NOT NULL auto_increment,
  98            Value varchar(50) NOT NULL,
  99            Count int(11) NOT NULL,
 100            UNIQUE KEY id (ID,Value(35)),
 101            INDEX (ID))""",
 102   "AllQueries" : """CREATE TABLE AllQueries (
 103            ID int(11) NOT NULL auto_increment,
 104            Value varchar(50) NOT NULL,
 105            Count int(11) NOT NULL,
 106            UNIQUE KEY id (ID,Value(35)),
 107            INDEX (ID))""",
 108 }
 109
 110 #
 111 # The XML API description file to parse
 112 #
 113 API="libxml2-api.xml"
 114 DB=None
 115
 116 #########################################################################
 117 #                                                                       #
 118 #                  MySQL database interfaces                            #
 119 #                                                                       #
 120 #########################################################################
 121 def createTable(db, name):
 122     global TABLES
 123
 124     if db == None:
 125         return -1
 126     if name == None:
 127         return -1
 128     c = db.cursor()
 129
 130     ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
 131     if ret == 1:
 132         print "Removed table %s" % (name)
 133     print "Creating table %s" % (name)
 134     try:
 135         ret = c.execute(TABLES[name])
 136     except:
 137         print "Failed to create table %s" % (name)
 138         return -1
 139     return ret
 140
 141 def checkTables(db, verbose = 1):
 142     global TABLES
 143
 144     if db == None:
 145         return -1
 146     c = db.cursor()
 147     nbtables = c.execute("show tables")
 148     if verbose:
 149         print "Found %d tables" % (nbtables)
 150     tables = {}
 151     i = 0
 152     while i < nbtables:
 153         l = c.fetchone()
 154         name = l[0]
 155         tables[name] = {}
 156         i = i + 1
 157
 158     for table in TABLES.keys():
 159         if not tables.has_key(table):
 160             print "table %s missing" % (table)
 161             createTable(db, table)
 162         try:
 163             ret = c.execute("SELECT count(*) from %s" % table);
 164             row = c.fetchone()
 165             if verbose:
 166                 print "Table %s contains %d records" % (table, row[0])
 167         except:
 168             print "Troubles with table %s : repairing" % (table)
 169             ret = c.execute("repair table %s" % table);
 170             print "repairing returned %d" % (ret)
 171             ret = c.execute("SELECT count(*) from %s" % table);
 172             row = c.fetchone()
 173             print "Table %s contains %d records" % (table, row[0])
 174     if verbose:
 175         print "checkTables finished"
 176
 177     # make sure apache can access the tables read-only
 178     try:
 179         ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
 180         ret = c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
 181     except:
 182         pass
 183     return 0
 184
 185 def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
 186     global DB
 187
 188     if passwd == None:
 189         try:
 190             passwd = os.environ["MySQL_PASS"]
 191         except:
 192             print "No password available, set environment MySQL_PASS"
 193             sys.exit(1)
 194
 195     DB = MySQLdb.connect(passwd=passwd, db=db)
 196     if DB == None:
 197         return -1
 198     ret = checkTables(DB, verbose)
 199     return ret
 200
 201 def updateWord(name, symbol, relevance):
 202     global DB
 203
 204     if DB == None:
 205         openMySQL()
 206     if DB == None:
 207         return -1
 208     if name == None:
 209         return -1
 210     if symbol == None:
 211         return -1
 212
 213     c = DB.cursor()
 214     try:
 215         ret = c.execute(
 216 """INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
 217                 (name, symbol, relevance))
 218     except:
 219         try:
 220             ret = c.execute(
 221     """UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
 222                     (relevance, name, symbol))
 223         except:
 224             print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
 225             print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
 226             print sys.exc_type, sys.exc_value
 227             return -1
 228
 229     return ret
 230
 231 def updateSymbol(name, module, type, desc):
 232     global DB
 233
 234     updateWord(name, name, 50)
 235     if DB == None:
 236         openMySQL()
 237     if DB == None:
 238         return -1
 239     if name == None:
 240         return -1
 241     if module == None:
 242         return -1
 243     if type == None:
 244         return -1
 245
 246     try:
 247         desc = string.replace(desc, "'", " ")
 248         l = string.split(desc, ".")
 249         desc = l[0]
 250         desc = desc[0:99]
 251     except:
 252         desc = ""
 253
 254     c = DB.cursor()
 255     try:
 256         ret = c.execute(
 257 """INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
 258                     (name, module, type, desc))
 259     except:
 260         try:
 261             ret = c.execute(
 262 """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
 263                     (module, type, desc, name))
 264         except:
 265             print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 266             print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
 267             print sys.exc_type, sys.exc_value
 268             return -1
 269
 270     return ret
 271
 272 def addFunction(name, module, desc = ""):
 273     return updateSymbol(name, module, 'function', desc)
 274
 275 def addMacro(name, module, desc = ""):
 276     return updateSymbol(name, module, 'macro', desc)
 277
 278 def addEnum(name, module, desc = ""):
 279     return updateSymbol(name, module, 'enum', desc)
 280
 281 def addStruct(name, module, desc = ""):
 282     return updateSymbol(name, module, 'struct', desc)
 283
 284 def addConst(name, module, desc = ""):
 285     return updateSymbol(name, module, 'const', desc)
 286
 287 def addType(name, module, desc = ""):
 288     return updateSymbol(name, module, 'type', desc)
 289
 290 def addFunctype(name, module, desc = ""):
 291     return updateSymbol(name, module, 'functype', desc)
 292
 293 def addPage(resource, title):
 294     global DB
 295
 296     if DB == None:
 297         openMySQL()
 298     if DB == None:
 299         return -1
 300     if resource == None:
 301         return -1
 302
 303     c = DB.cursor()
 304     try:
 305         ret = c.execute(
 306             """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
 307                     (resource, title))
 308     except:
 309         try:
 310             ret = c.execute(
 311                 """UPDATE pages SET title='%s' WHERE resource='%s'""" %
 312                     (title, resource))
 313         except:
 314             print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 315             print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
 316             print sys.exc_type, sys.exc_value
 317             return -1
 318
 319     return ret
 320
 321 def updateWordHTML(name, resource, desc, id, relevance):
 322     global DB
 323
 324     if DB == None:
 325         openMySQL()
 326     if DB == None:
 327         return -1
 328     if name == None:
 329         return -1
 330     if resource == None:
 331         return -1
 332     if id == None:
 333         id = ""
 334     if desc == None:
 335         desc = ""
 336     else:
 337         try:
 338             desc = string.replace(desc, "'", " ")
 339             desc = desc[0:99]
 340         except:
 341             desc = ""
 342
 343     c = DB.cursor()
 344     try:
 345         ret = c.execute(
 346 """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
 347                     (name, resource, desc, id, relevance))
 348     except:
 349         try:
 350             ret = c.execute(
 351 """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
 352                     (desc, id, relevance, name, resource))
 353         except:
 354             print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
 355             print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
 356             print sys.exc_type, sys.exc_value
 357             return -1
 358
 359     return ret
 360
 361 def checkXMLMsgArchive(url):
 362     global DB
 363
 364     if DB == None:
 365         openMySQL()
 366     if DB == None:
 367         return -1
 368     if url == None:
 369         return -1
 370
 371     c = DB.cursor()
 372     try:
 373         ret = c.execute(
 374             """SELECT ID FROM archives WHERE resource='%s'""" % (url))
 375         row = c.fetchone()
 376         if row == None:
 377             return -1
 378     except:
 379         return -1
 380
 381     return row[0]
 382
 383 def addXMLMsgArchive(url, title):
 384     global DB
 385
 386     if DB == None:
 387         openMySQL()
 388     if DB == None:
 389         return -1
 390     if url == None:
 391         return -1
 392     if title == None:
 393         title = ""
 394     else:
 395         title = string.replace(title, "'", " ")
 396         title = title[0:99]
 397
 398     c = DB.cursor()
 399     try:
 400         cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
 401         ret = c.execute(cmd)
 402         cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
 403         ret = c.execute(cmd)
 404         row = c.fetchone()
 405         if row == None:
 406             print "addXMLMsgArchive failed to get the ID: %s" % (url)
 407             return -1
 408     except:
 409         print "addXMLMsgArchive failed command: %s" % (cmd)
 410         return -1
 411
 412     return((int)(row[0]))
 413
 414 def updateWordArchive(name, id, relevance):
 415     global DB
 416
 417     if DB == None:
 418         openMySQL()
 419     if DB == None:
 420         return -1
 421     if name == None:
 422         return -1
 423     if id == None:
 424         return -1
 425
 426     c = DB.cursor()
 427     try:
 428         ret = c.execute(
 429 """INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
 430                     (name, id, relevance))
 431     except:
 432         try:
 433             ret = c.execute(
 434 """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
 435                     (relevance, name, id))
 436         except:
 437             print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
 438             print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
 439             print sys.exc_type, sys.exc_value
 440             return -1
 441
 442     return ret
 443
 444 #########################################################################
 445 #                                                                       #
  446 #                  Word dictionary and analysis routines                #
 447 #                                                                       #
 448 #########################################################################
 449
 450 #
  451 # Top 100 English words, minus those shorter than 3 letters, plus our own set
 452 #
 453 dropWords = {
 454     'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
 455     'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
 456     'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
 457     'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
 458     'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
 459     'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
 460     'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
 461     'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
 462     'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
 463     'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
 464     'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
 465     'down':0,
 466     'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
 467 }
 468
 469 wordsDict = {}
 470 wordsDictHTML = {}
 471 wordsDictArchive = {}
 472
 473 def cleanupWordsString(str):
 474     str = string.replace(str, ".", " ")
 475     str = string.replace(str, "!", " ")
 476     str = string.replace(str, "?", " ")
 477     str = string.replace(str, ",", " ")
 478     str = string.replace(str, "'", " ")
 479     str = string.replace(str, '"', " ")
 480     str = string.replace(str, ";", " ")
 481     str = string.replace(str, "(", " ")
 482     str = string.replace(str, ")", " ")
 483     str = string.replace(str, "{", " ")
 484     str = string.replace(str, "}", " ")
 485     str = string.replace(str, "<", " ")
 486     str = string.replace(str, ">", " ")
 487     str = string.replace(str, "=", " ")
 488     str = string.replace(str, "/", " ")
 489     str = string.replace(str, "*", " ")
 490     str = string.replace(str, ":", " ")
 491     str = string.replace(str, "#", " ")
 492     str = string.replace(str, "\\", " ")
 493     str = string.replace(str, "\n", " ")
 494     str = string.replace(str, "\r", " ")
 495     str = string.replace(str, "\xc2", " ")
 496     str = string.replace(str, "\xa0", " ")
 497     return str
 498
 499 def cleanupDescrString(str):
 500     str = string.replace(str, "'", " ")
 501     str = string.replace(str, "\n", " ")
 502     str = string.replace(str, "\r", " ")
 503     str = string.replace(str, "\xc2", " ")
 504     str = string.replace(str, "\xa0", " ")
 505     l = string.split(str)
 506     str = string.join(str)
 507     return str
 508
 509 def splitIdentifier(str):
 510     ret = []
 511     while str != "":
 512         cur = string.lower(str[0])
 513         str = str[1:]
 514         if ((cur < 'a') or (cur > 'z')):
 515             continue
 516         while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
 517             cur = cur + string.lower(str[0])
 518             str = str[1:]
 519         while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
 520             cur = cur + str[0]
 521             str = str[1:]
 522         while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
 523             str = str[1:]
 524         ret.append(cur)
 525     return ret
 526
 527 def addWord(word, module, symbol, relevance):
 528     global wordsDict
 529
 530     if word == None or len(word) < 3:
 531         return -1
 532     if module == None or symbol == None:
 533         return -1
 534     if dropWords.has_key(word):
 535         return 0
 536     if ord(word[0]) > 0x80:
 537         return 0
 538
 539     if wordsDict.has_key(word):
 540         d = wordsDict[word]
 541         if d == None:
 542             return 0
 543         if len(d) > 500:
 544             wordsDict[word] = None
 545             return 0
 546         try:
 547             relevance = relevance + d[(module, symbol)]
 548         except:
 549             pass
 550     else:
 551         wordsDict[word] = {}
 552     wordsDict[word][(module, symbol)] = relevance
 553     return relevance
 554
 555 def addString(str, module, symbol, relevance):
 556     if str == None or len(str) < 3:
 557         return -1
 558     ret = 0
 559     str = cleanupWordsString(str)
 560     l = string.split(str)
 561     for word in l:
 562         if len(word) > 2:
 563             ret = ret + addWord(word, module, symbol, 5)
 564
 565     return ret
 566
 567 def addWordHTML(word, resource, id, section, relevance):
 568     global wordsDictHTML
 569
 570     if word == None or len(word) < 3:
 571         return -1
 572     if resource == None or section == None:
 573         return -1
 574     if dropWords.has_key(word):
 575         return 0
 576     if ord(word[0]) > 0x80:
 577         return 0
 578
 579     section = cleanupDescrString(section)
 580
 581     if wordsDictHTML.has_key(word):
 582         d = wordsDictHTML[word]
 583         if d == None:
 584             print "skipped %s" % (word)
 585             return 0
 586         try:
 587             (r,i,s) = d[resource]
 588             if i != None:
 589                 id = i
 590             if s != None:
 591                 section = s
 592             relevance = relevance + r
 593         except:
 594             pass
 595     else:
 596         wordsDictHTML[word] = {}
 597     d = wordsDictHTML[word];
 598     d[resource] = (relevance, id, section)
 599     return relevance
 600
 601 def addStringHTML(str, resource, id, section, relevance):
 602     if str == None or len(str) < 3:
 603         return -1
 604     ret = 0
 605     str = cleanupWordsString(str)
 606     l = string.split(str)
 607     for word in l:
 608         if len(word) > 2:
 609             try:
 610                 r = addWordHTML(word, resource, id, section, relevance)
 611                 if r < 0:
 612                     print "addWordHTML failed: %s %s" % (word, resource)
 613                 ret = ret + r
 614             except:
 615                 print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
 616                 print sys.exc_type, sys.exc_value
 617
 618     return ret
 619
 620 def addWordArchive(word, id, relevance):
 621     global wordsDictArchive
 622
 623     if word == None or len(word) < 3:
 624         return -1
 625     if id == None or id == -1:
 626         return -1
 627     if dropWords.has_key(word):
 628         return 0
 629     if ord(word[0]) > 0x80:
 630         return 0
 631
 632     if wordsDictArchive.has_key(word):
 633         d = wordsDictArchive[word]
 634         if d == None:
 635             print "skipped %s" % (word)
 636             return 0
 637         try:
 638             r = d[id]
 639             relevance = relevance + r
 640         except:
 641             pass
 642     else:
 643         wordsDictArchive[word] = {}
 644     d = wordsDictArchive[word];
 645     d[id] = relevance
 646     return relevance
 647
 648 def addStringArchive(str, id, relevance):
 649     if str == None or len(str) < 3:
 650         return -1
 651     ret = 0
 652     str = cleanupWordsString(str)
 653     l = string.split(str)
 654     for word in l:
 655         i = len(word)
 656         if i > 2:
 657             try:
 658                 r = addWordArchive(word, id, relevance)
 659                 if r < 0:
 660                     print "addWordArchive failed: %s %s" % (word, id)
 661                 else:
 662                     ret = ret + r
 663             except:
 664                 print "addWordArchive failed: %s %s %d" % (word, id, relevance)
 665                 print sys.exc_type, sys.exc_value
 666     return ret
 667
 668 #########################################################################
 669 #                                                                       #
 670 #                  XML API description analysis                         #
 671 #                                                                       #
 672 #########################################################################
 673
 674 def loadAPI(filename):
 675     doc = libxml2.parseFile(filename)
 676     print "loaded %s" % (filename)
 677     return doc
 678
 679 def foundExport(file, symbol):
 680     if file == None:
 681         return 0
 682     if symbol == None:
 683         return 0
 684     addFunction(symbol, file)
 685     l = splitIdentifier(symbol)
 686     for word in l:
 687         addWord(word, file, symbol, 10)
 688     return 1
 689
 690 def analyzeAPIFile(top):
 691     count = 0
 692     name = top.prop("name")
 693     cur = top.children
 694     while cur != None:
 695         if cur.type == 'text':
 696             cur = cur.next
 697             continue
 698         if cur.name == "exports":
 699             count = count + foundExport(name, cur.prop("symbol"))
 700         else:
 701             print "unexpected element %s in API doc <file name='%s'>" % (name)
 702         cur = cur.next
 703     return count
 704
 705 def analyzeAPIFiles(top):
 706     count = 0
 707     cur = top.children
 708
 709     while cur != None:
 710         if cur.type == 'text':
 711             cur = cur.next
 712             continue
 713         if cur.name == "file":
 714             count = count + analyzeAPIFile(cur)
 715         else:
 716             print "unexpected element %s in API doc <files>" % (cur.name)
 717         cur = cur.next
 718     return count
 719
 720 def analyzeAPIEnum(top):
 721     file = top.prop("file")
 722     if file == None:
 723         return 0
 724     symbol = top.prop("name")
 725     if symbol == None:
 726         return 0
 727
 728     addEnum(symbol, file)
 729     l = splitIdentifier(symbol)
 730     for word in l:
 731         addWord(word, file, symbol, 10)
 732
 733     return 1
 734
 735 def analyzeAPIConst(top):
 736     file = top.prop("file")
 737     if file == None:
 738         return 0
 739     symbol = top.prop("name")
 740     if symbol == None:
 741         return 0
 742
 743     addConst(symbol, file)
 744     l = splitIdentifier(symbol)
 745     for word in l:
 746         addWord(word, file, symbol, 10)
 747
 748     return 1
 749
 750 def analyzeAPIType(top):
 751     file = top.prop("file")
 752     if file == None:
 753         return 0
 754     symbol = top.prop("name")
 755     if symbol == None:
 756         return 0
 757
 758     addType(symbol, file)
 759     l = splitIdentifier(symbol)
 760     for word in l:
 761         addWord(word, file, symbol, 10)
 762     return 1
 763
 764 def analyzeAPIFunctype(top):
 765     file = top.prop("file")
 766     if file == None:
 767         return 0
 768     symbol = top.prop("name")
 769     if symbol == None:
 770         return 0
 771
 772     addFunctype(symbol, file)
 773     l = splitIdentifier(symbol)
 774     for word in l:
 775         addWord(word, file, symbol, 10)
 776     return 1
 777
 778 def analyzeAPIStruct(top):
 779     file = top.prop("file")
 780     if file == None:
 781         return 0
 782     symbol = top.prop("name")
 783     if symbol == None:
 784         return 0
 785
 786     addStruct(symbol, file)
 787     l = splitIdentifier(symbol)
 788     for word in l:
 789         addWord(word, file, symbol, 10)
 790
 791     info = top.prop("info")
 792     if info != None:
 793         info = string.replace(info, "'", " ")
 794         info = string.strip(info)
 795         l = string.split(info)
 796         for word in l:
 797             if len(word) > 2:
 798                 addWord(word, file, symbol, 5)
 799     return 1
 800
 801 def analyzeAPIMacro(top):
 802     file = top.prop("file")
 803     if file == None:
 804         return 0
 805     symbol = top.prop("name")
 806     if symbol == None:
 807         return 0
 808     symbol = string.replace(symbol, "'", " ")
 809     symbol = string.strip(symbol)
 810
 811     info = None
 812     cur = top.children
 813     while cur != None:
 814         if cur.type == 'text':
 815             cur = cur.next
 816             continue
 817         if cur.name == "info":
 818             info = cur.content
 819             break
 820         cur = cur.next
 821
 822     l = splitIdentifier(symbol)
 823     for word in l:
 824         addWord(word, file, symbol, 10)
 825
 826     if info == None:
 827         addMacro(symbol, file)
 828         print "Macro %s description has no <info>" % (symbol)
 829         return 0
 830
 831     info = string.replace(info, "'", " ")
 832     info = string.strip(info)
 833     addMacro(symbol, file, info)
 834     l = string.split(info)
 835     for word in l:
 836         if len(word) > 2:
 837             addWord(word, file, symbol, 5)
 838     return 1
 839
 840 def analyzeAPIFunction(top):
 841     file = top.prop("file")
 842     if file == None:
 843         return 0
 844     symbol = top.prop("name")
 845     if symbol == None:
 846         return 0
 847
 848     symbol = string.replace(symbol, "'", " ")
 849     symbol = string.strip(symbol)
 850     info = None
 851     cur = top.children
 852     while cur != None:
 853         if cur.type == 'text':
 854             cur = cur.next
 855             continue
 856         if cur.name == "info":
 857             info = cur.content
 858         elif cur.name == "return":
 859             rinfo = cur.prop("info")
 860             if rinfo != None:
 861                 rinfo = string.replace(rinfo, "'", " ")
 862                 rinfo = string.strip(rinfo)
 863                 addString(rinfo, file, symbol, 7)
 864         elif cur.name == "arg":
 865             ainfo = cur.prop("info")
 866             if ainfo != None:
 867                 ainfo = string.replace(ainfo, "'", " ")
 868                 ainfo = string.strip(ainfo)
 869                 addString(ainfo, file, symbol, 5)
 870             name = cur.prop("name")
 871             if name != None:
 872                 name = string.replace(name, "'", " ")
 873                 name = string.strip(name)
 874                 addWord(name, file, symbol, 7)
 875         cur = cur.next
 876     if info == None:
 877         print "Function %s description has no <info>" % (symbol)
 878         addFunction(symbol, file, "")
 879     else:
 880         info = string.replace(info, "'", " ")
 881         info = string.strip(info)
 882         addFunction(symbol, file, info)
 883         addString(info, file, symbol, 5)
 884
 885     l = splitIdentifier(symbol)
 886     for word in l:
 887         addWord(word, file, symbol, 10)
 888
 889     return 1
 890
 891 def analyzeAPISymbols(top):
 892     count = 0
 893     cur = top.children
 894
 895     while cur != None:
 896         if cur.type == 'text':
 897             cur = cur.next
 898             continue
 899         if cur.name == "macro":
 900             count = count + analyzeAPIMacro(cur)
 901         elif cur.name == "function":
 902             count = count + analyzeAPIFunction(cur)
 903         elif cur.name == "const":
 904             count = count + analyzeAPIConst(cur)
 905         elif cur.name == "typedef":
 906             count = count + analyzeAPIType(cur)
 907         elif cur.name == "struct":
 908             count = count + analyzeAPIStruct(cur)
 909         elif cur.name == "enum":
 910             count = count + analyzeAPIEnum(cur)
 911         elif cur.name == "functype":
 912             count = count + analyzeAPIFunctype(cur)
 913         else:
 914             print "unexpected element %s in API doc <files>" % (cur.name)
 915         cur = cur.next
 916     return count
 917
 918 def analyzeAPI(doc):
 919     count = 0
 920     if doc == None:
 921         return -1
 922     root = doc.getRootElement()
 923     if root.name != "api":
 924         print "Unexpected root name"
 925         return -1
 926     cur = root.children
 927     while cur != None:
 928         if cur.type == 'text':
 929             cur = cur.next
 930             continue
 931         if cur.name == "files":
 932             pass
 933 #           count = count + analyzeAPIFiles(cur)
 934         elif cur.name == "symbols":
 935             count = count + analyzeAPISymbols(cur)
 936         else:
 937             print "unexpected element %s in API doc" % (cur.name)
 938         cur = cur.next
 939     return count
 940
 941 #########################################################################
 942 #                                                                       #
 943 #                  Web pages parsing and analysis                       #
 944 #                                                                       #
 945 #########################################################################
 946
 947 import glob
 948
 949 def analyzeHTMLText(doc, resource, p, section, id):
 950     words = 0
 951     try:
 952         content = p.content
 953         words = words + addStringHTML(content, resource, id, section, 5)
 954     except:
 955         return -1
 956     return words
 957
 958 def analyzeHTMLPara(doc, resource, p, section, id):
 959     words = 0
 960     try:
 961         content = p.content
 962         words = words + addStringHTML(content, resource, id, section, 5)
 963     except:
 964         return -1
 965     return words
 966
 967 def analyzeHTMLPre(doc, resource, p, section, id):
 968     words = 0
 969     try:
 970         content = p.content
 971         words = words + addStringHTML(content, resource, id, section, 5)
 972     except:
 973         return -1
 974     return words
 975
 976 def analyzeHTML(doc, resource, p, section, id):
 977     words = 0
 978     try:
 979         content = p.content
 980         words = words + addStringHTML(content, resource, id, section, 5)
 981     except:
 982         return -1
 983     return words
 984
 985 def analyzeHTML(doc, resource):
 986     para = 0;
 987     ctxt = doc.xpathNewContext()
 988     try:
 989         res = ctxt.xpathEval("//head/title")
 990         title = res[0].content
 991     except:
 992         title = "Page %s" % (resource)
 993     addPage(resource, title)
 994     try:
 995         items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
 996         section = title
 997         id = ""
 998         for item in items:
 999             if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
1000                 section = item.content
1001                 if item.prop("id"):
1002                     id = item.prop("id")
1003                 elif item.prop("name"):
1004                     id = item.prop("name")
1005             elif item.type == 'text':
1006                 analyzeHTMLText(doc, resource, item, section, id)
1007                 para = para + 1
1008             elif item.name == 'p':
1009                 analyzeHTMLPara(doc, resource, item, section, id)
1010                 para = para + 1
1011             elif item.name == 'pre':
1012                 analyzeHTMLPre(doc, resource, item, section, id)
1013                 para = para + 1
1014             else:
1015                 print "Page %s, unexpected %s element" % (resource, item.name)
1016     except:
1017         print "Page %s: problem analyzing" % (resource)
1018         print sys.exc_type, sys.exc_value
1019
1020     return para
1021
def analyzeHTMLPages():
    """Index the HTML pages of the current directory and of tutorial/.

    Files whose path starts with "API" and the file "xml.html" are
    skipped.  Every other file is parsed as XML first and, if that fails,
    with the HTML parser, then handed to analyzeHTML().  The parsed
    document is freed once the page has been processed.

    Returns the number of pages successfully analyzed; pages that cannot
    be parsed or analyzed are reported on stdout and skipped.
    """
    ret = 0
    for html in glob.glob("*.html") + glob.glob("tutorial/*.html"):
        if html[0:3] == "API" or html == "xml.html":
            continue

        try:
            doc = libxml2.parseFile(html)
        except Exception:
            doc = None
        if doc is None:
            # not well-formed XML: retry with the more forgiving HTML parser
            try:
                doc = libxml2.htmlParseFile(html, None)
            except Exception:
                doc = None
        if doc is None:
            print("could not parse %s" % (html))
            continue

        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # libxml2 documents are not garbage collected by the bindings
            doc.freeDoc()
    return ret
1041
1042 #########################################################################
1043 #                                                                       #
1044 #                  Mail archives parsing and analysis                   #
1045 #                                                                       #
1046 #########################################################################
1047
1048 import time
1049
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list archive.

    t is a timestamp in seconds since the epoch; when it is None the
    current time is used.  The month and year are taken from the UTC
    breakdown of that timestamp.
    """
    if t is None:
        stamp = time.gmtime(time.time())
    else:
        stamp = time.gmtime(t)
    # stamp[0] is the year; %B expands to the full month name
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1058
def scanXMLMsgArchive(url, title, force = 0):
    """Load one archived message and index its title and body text.

    url is the address of the message page and title its subject.  A
    message already known to checkXMLMsgArchive() is left alone unless
    force is non-zero; an unknown one is first registered through
    addXMLMsgArchive().  The title is passed to addStringArchive() with
    the value 20 and every text node below a <pre> element with the value
    5 (presumably relevance weights -- see addStringArchive).

    Returns 1 when the message was loaded and indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if ID != -1:
        if force == 0:
            return 0
    else:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # Documents and XPath contexts must be released explicitly, otherwise
    # every scanned message stays in memory for the rest of the run.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1088
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from one monthly archive index page.

    t selects the month through getXMLDateArchive() and force is passed
    on to scanXMLMsgArchive().  The global wordsDictArchive is reset to an
    empty dictionary before scanning.  Only links whose href starts with
    "msg" are followed; a leading "Re: " and then a leading "[xml] " are
    stripped from the link text to form the message title.

    Returns the sum of the scanXMLMsgArchive() results, or -1 when the
    index page cannot be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and the XPath context must be released explicitly.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # one bad message must not abort the month, but say so
                    print("Failed to index message %s" % (href))
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126
1127
1128 #########################################################################
1129 #                                                                       #
1130 #          Main code: open the DB, the API XML and analyze it           #
1131 #                                                                       #
1132 #########################################################################
def analyzeArchives(t = None, force = 0):
    """Scan one month of the mail archive and store the resulting index.

    t and force are passed unchanged to scanXMLDateArchive(), which fills
    the global wordsDictArchive.  Every (word, message id) pair found
    there is then written out through updateWordArchive(); words whose
    entry is None are skipped.  Progress is reported on stdout.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    if ret < 0:
        # The index page could not be loaded (scanXMLDateArchive already
        # said so); there is nothing to store.
        return
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for msgid in refs.keys():
            updateWordArchive(word, msgid, refs[msgid])
            i = i + 1

    print("Found %d associations in archive pages" % (i))
1152
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word associations.

    Runs analyzeHTMLPages(), which fills the global wordsDictHTML, then
    writes each (word, resource) entry out through updateWordHTML().
    Words whose entry is None are skipped.  Progress is reported on
    stdout.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        # each entry is a (relevance, id, section) triple
        for resource, (relevance, anchor, section) in refs.items():
            updateWordHTML(word, resource, section, anchor, relevance)
            stored = stored + 1

    print("Found %d associations in HTML pages" % (stored))
1172
def analyzeAPITop():
    """Load the API description, index it and store the word associations.

    The document named by the global API is loaded with loadAPI(),
    analyzed with analyzeAPI() and freed.  Any failure in that phase is
    reported on stdout and ends the program with exit status 1.  Each
    (module, symbol) key recorded for a word in the global wordsDict is
    then written out through updateWord(); words whose entry is None are
    counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        blocks = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocks))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    stored = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            stored = stored + 1

    print("Found %d associations, skipped %d words" % (stored, skipped))
1200
def usage():
    """Print the command-line synopsis on stdout and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    raise SystemExit(1)
1204
def main():
    """Command-line entry point: open the database and run the requested steps.

    Exits with status 1 if the database cannot be opened.  The arguments
    are processed from left to right:

      --force                set the force flag for the archive options
                             that follow it
      --archive              index the archive of the current month
      --archive-year YEAR    index the twelve monthly archives of YEAR
      --archive-month Y-M    index one month, e.g. 2003-March
      --API                  index the API description
      --docs                 index the HTML pages

    No argument at all, an unknown argument, or an option missing its
    value prints the usage message and exits.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg in ('--archive-year', '--archive-month'):
            i = i + 1
            if i >= len(args):
                # the option was given without its value
                usage()
            if arg == '--archive-year':
                months = ["January", "February", "March", "April", "May",
                          "June", "July", "August", "September", "October",
                          "November", "December"]
                stamps = ["%s-%s" % (args[i], month) for month in months]
            else:
                stamps = [args[i]]
            for stamp in stamps:
                try:
                    T = time.strptime(stamp, "%Y-%B")
                    # Aim 10 days into the month: mktime() works in local
                    # time while the archive URL is built from UTC, so the
                    # very first instant could fall in the previous month.
                    t = time.mktime(T) + 3600 * 24 * 10
                    analyzeArchives(t, force)
                except Exception:
                    print("Failed to index month archive:")
                    print("%s %s" % sys.exc_info()[:2])
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
1256
# Run the indexer only when this file is executed as a script, so that
# importing it does not touch the database or the command line.
if __name__ == "__main__":
    main()