doc/index.py

   1 #!/usr/bin/python -u
   2 #
   3 # imports the API description and fills up a database with
   4 # name relevance to modules, functions or web pages
   5 #
   6 # Operation needed:
   7 # =================
   8 #
   9 # install mysqld, the python wrappers for mysql and libxml2, start mysqld
  10 # Change the root passwd of mysql:
  11 #    mysqladmin -u root password new_password
  12 # Create the new database xmlsoft
  13 #    mysqladmin -p create xmlsoft
  14 # Create a database user 'veillard' and give him password access
  15 # change veillard and abcde with the right user name and passwd
  16 #    mysql -p
  17 #    password:
  18 #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
  19 #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
  20 #
  21 # As the user check the access:
  22 #    mysql -p xmlsoft
  23 #    Enter password:
  24 #    Welcome to the MySQL monitor....
  25 #    mysql> use xmlsoft
  26 #    Database changed
  27 #    mysql> quit
  28 #    Bye
  29 #
  30 # Then run the script in the doc subdir; it will create the XSLTsymbols and
  31 # word tables and populate them with information extracted from
  32 # the libxslt-api.xml API description, and make them accessible read-only
  33 # by nobody@localhost, the user Apache is expected to run as
  34 #
  35 # On the Apache configuration, make sure you have php support enabled
  36 #
  37
  38 import MySQLdb
  39 import libxml2
  40 import sys
  41 import string
  42 import os
  43
  44 #
  45 # We are not interested in parsing errors here
  46 #
  47 def callback(ctx, str):
  48     return
# Install the no-op handler so parsing errors are not reported; the second
# argument (None) is presumably the context value handed back to the callback.
libxml2.registerErrorHandler(callback, None)
  50
  51 #
  52 # The dictionary of tables required and the SQL command needed
  53 # to create them
  54 #
TABLES={
  # One row per API symbol; written by updateSymbol().
  "XSLTsymbols" : """CREATE TABLE XSLTsymbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
  # (word, symbol) -> relevance; written by updateWord().
  "XSLTwords" : """CREATE TABLE XSLTwords (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
  # (word, web page) -> section/id/relevance; written by updateWordHTML().
  "XSLTwordsHTML" : """CREATE TABLE XSLTwordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
  # (word, archive ID) -> relevance; written by updateWordArchive().
  "XSLTwordsArchive" : """CREATE TABLE XSLTwordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
  # Web page resource -> title; written by addPage().
  "XSLTpages" : """CREATE TABLE XSLTpages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
  # Archived messages; written by addXMLMsgArchive() and looked up by
  # checkXMLMsgArchive().
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
  # Not filled by the functions above; checkTables() grants nobody@localhost
  # INSERT, SELECT and UPDATE on it.
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}
 103
 104 #
 105 # The XML API description file to parse
 106 #
API="libxslt-api.xml"
# Shared MySQL connection used by the database helpers below; it stays None
# until openMySQL() assigns it.
DB=None
 109
 110 #########################################################################
 111 #                                                                       #
 112 #                  MySQL database interfaces                            #
 113 #                                                                       #
 114 #########################################################################
 115 def createTable(db, name):
 116     global TABLES
 117
 118     if db == None:
 119         return -1
 120     if name == None:
 121         return -1
 122     c = db.cursor()
 123
 124     ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
 125     if ret == 1:
 126         print "Removed table %s" % (name)
 127     print "Creating table %s" % (name)
 128     try:
 129         ret = c.execute(TABLES[name])
 130     except:
 131         print "Failed to create table %s" % (name)
 132         return -1
 133     return ret
 134
 135 def checkTables(db):
 136     global TABLES
 137
 138     if db == None:
 139         return -1
 140     c = db.cursor()
 141     nbtables = c.execute("show tables")
 142     print "Found %d tables" % (nbtables)
 143     tables = {}
 144     i = 0
 145     while i < nbtables:
 146         l = c.fetchone()
 147         name = l[0]
 148         tables[name] = {}
 149         i = i + 1
 150
 151     for table in TABLES.keys():
 152         if not tables.has_key(table):
 153             print "table %s missing" % (table)
 154             createTable(db, table)
 155         try:
 156             ret = c.execute("SELECT count(*) from %s" % table);
 157             row = c.fetchone()
 158             print "Table %s contains %d records" % (table, row[0])
 159         except:
 160             print "Troubles with table %s : repairing" % (table)
 161             ret = c.execute("repair table %s" % table);
 162             print "repairing returned %d" % (ret)
 163             ret = c.execute("SELECT count(*) from %s" % table);
 164             row = c.fetchone()
 165             print "Table %s contains %d records" % (table, row[0])
 166     print "checkTables finished"
 167
 168     # make sure apache can access the tables read-only
 169     try:
 170         ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
 171         ret = c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
 172     except:
 173         pass
 174     return 0
 175
 176 def openMySQL(db="xmlsoft", passwd=None):
 177     global DB
 178
 179     if passwd == None:
 180         try:
 181             passwd = os.environ["MySQL_PASS"]
 182         except:
 183             print "No password available, set environment MySQL_PASS"
 184             sys.exit(1)
 185
 186     DB = MySQLdb.connect(passwd=passwd, db=db)
 187     if DB == None:
 188         return -1
 189     ret = checkTables(DB)
 190     return ret
 191
 192 def updateWord(name, symbol, relevance):
 193     global DB
 194
 195     if DB == None:
 196         openMySQL()
 197     if DB == None:
 198         return -1
 199     if name == None:
 200         return -1
 201     if symbol == None:
 202         return -1
 203
 204     c = DB.cursor()
 205     try:
 206         ret = c.execute(
 207 """INSERT INTO XSLTwords (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
 208                 (name, symbol, relevance))
 209     except:
 210         try:
 211             ret = c.execute(
 212     """UPDATE XSLTwords SET relevance = %d where name = '%s' and symbol = '%s'""" %
 213                     (relevance, name, symbol))
 214         except:
 215             print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
 216             print "UPDATE XSLTwords SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
 217             print sys.exc_type, sys.exc_value
 218             return -1
 219
 220     return ret
 221
 222 def updateSymbol(name, module, type, desc):
 223     global DB
 224
 225     updateWord(name, name, 50)
 226     if DB == None:
 227         openMySQL()
 228     if DB == None:
 229         return -1
 230     if name == None:
 231         return -1
 232     if module == None:
 233         return -1
 234     if type == None:
 235         return -1
 236
 237     try:
 238         desc = string.replace(desc, "'", " ")
 239         l = string.split(desc, ".")
 240         desc = l[0]
 241         desc = desc[0:99]
 242     except:
 243         desc = ""
 244
 245     c = DB.cursor()
 246     try:
 247         ret = c.execute(
 248 """INSERT INTO XSLTsymbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
 249                     (name, module, type, desc))
 250     except:
 251         try:
 252             ret = c.execute(
 253 """UPDATE XSLTsymbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
 254                     (module, type, desc, name))
 255         except:
 256             print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 257             print """UPDATE XSLTsymbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
 258             print sys.exc_type, sys.exc_value
 259             return -1
 260
 261     return ret
 262
 263 def addFunction(name, module, desc = ""):
 264     return updateSymbol(name, module, 'function', desc)
 265
 266 def addMacro(name, module, desc = ""):
 267     return updateSymbol(name, module, 'macro', desc)
 268
 269 def addEnum(name, module, desc = ""):
 270     return updateSymbol(name, module, 'enum', desc)
 271
 272 def addStruct(name, module, desc = ""):
 273     return updateSymbol(name, module, 'struct', desc)
 274
 275 def addConst(name, module, desc = ""):
 276     return updateSymbol(name, module, 'const', desc)
 277
 278 def addType(name, module, desc = ""):
 279     return updateSymbol(name, module, 'type', desc)
 280
 281 def addFunctype(name, module, desc = ""):
 282     return updateSymbol(name, module, 'functype', desc)
 283
 284 def addPage(resource, title):
 285     global DB
 286
 287     if DB == None:
 288         openMySQL()
 289     if DB == None:
 290         return -1
 291     if resource == None:
 292         return -1
 293
 294     c = DB.cursor()
 295     try:
 296         ret = c.execute(
 297             """INSERT INTO XSLTpages (resource, title) VALUES ('%s','%s')""" %
 298                     (resource, title))
 299     except:
 300         try:
 301             ret = c.execute(
 302                 """UPDATE XSLTpages SET title='%s' WHERE resource='%s'""" %
 303                     (title, resource))
 304         except:
 305             print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 306             print """UPDATE XSLTpages SET title='%s' WHERE resource='%s'""" % (title, resource)
 307             print sys.exc_type, sys.exc_value
 308             return -1
 309
 310     return ret
 311
 312 def updateWordHTML(name, resource, desc, id, relevance):
 313     global DB
 314
 315     if DB == None:
 316         openMySQL()
 317     if DB == None:
 318         return -1
 319     if name == None:
 320         return -1
 321     if resource == None:
 322         return -1
 323     if id == None:
 324         id = ""
 325     if desc == None:
 326         desc = ""
 327     else:
 328         try:
 329             desc = string.replace(desc, "'", " ")
 330             desc = desc[0:99]
 331         except:
 332             desc = ""
 333
 334     c = DB.cursor()
 335     try:
 336         ret = c.execute(
 337 """INSERT INTO XSLTwordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
 338                     (name, resource, desc, id, relevance))
 339     except:
 340         try:
 341             ret = c.execute(
 342 """UPDATE XSLTwordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
 343                     (desc, id, relevance, name, resource))
 344         except:
 345             print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
 346             print """UPDATE XSLTwordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
 347             print sys.exc_type, sys.exc_value
 348             return -1
 349
 350     return ret
 351
 352 def checkXMLMsgArchive(url):
 353     global DB
 354
 355     if DB == None:
 356         openMySQL()
 357     if DB == None:
 358         return -1
 359     if url == None:
 360         return -1
 361
 362     c = DB.cursor()
 363     try:
 364         ret = c.execute(
 365             """SELECT ID FROM archives WHERE resource='%s'""" % (url))
 366         row = c.fetchone()
 367         if row == None:
 368             return -1
 369     except:
 370         return -1
 371
 372     return row[0]
 373
 374 def addXMLMsgArchive(url, title):
 375     global DB
 376
 377     if DB == None:
 378         openMySQL()
 379     if DB == None:
 380         return -1
 381     if url == None:
 382         return -1
 383     if title == None:
 384         title = ""
 385     else:
 386         title = string.replace(title, "'", " ")
 387         title = title[0:99]
 388
 389     c = DB.cursor()
 390     try:
 391         cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
 392         ret = c.execute(cmd)
 393         cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
 394         ret = c.execute(cmd)
 395         row = c.fetchone()
 396         if row == None:
 397             print "addXMLMsgArchive failed to get the ID: %s" % (url)
 398             return -1
 399     except:
 400         print "addXMLMsgArchive failed command: %s" % (cmd)
 401         return -1
 402
 403     return((int)(row[0]))
 404
 405 def updateWordArchive(name, id, relevance):
 406     global DB
 407
 408     if DB == None:
 409         openMySQL()
 410     if DB == None:
 411         return -1
 412     if name == None:
 413         return -1
 414     if id == None:
 415         return -1
 416
 417     c = DB.cursor()
 418     try:
 419         ret = c.execute(
 420 """INSERT INTO XSLTwordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
 421                     (name, id, relevance))
 422     except:
 423         try:
 424             ret = c.execute(
 425 """UPDATE XSLTwordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
 426                     (relevance, name, id))
 427         except:
 428             print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
 429             print """UPDATE XSLTwordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
 430             print sys.exc_type, sys.exc_value
 431             return -1
 432
 433     return ret
 434
 435 #########################################################################
 436 #                                                                       #
 437 #                  Word dictionary and analysis routines                #
 438 #                                                                       #
 439 #########################################################################
 440
 441 #
 442 # top 100 English words, minus those shorter than 3 letters, plus our own set
 443 #
 444 dropWords = {
 445     'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
 446     'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
 447     'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
 448     'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
 449     'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
 450     'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
 451     'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
 452     'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
 453     'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
 454     'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
 455     'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
 456     'down':0,
 457     'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
 458 }
 459
# word -> {(module, symbol): relevance}; addWord() replaces the inner dict
# with None once a word has collected more than 500 entries.
wordsDict = {}
# word -> {resource: (relevance, id, section)}; filled by addWordHTML().
wordsDictHTML = {}
# word -> {archive id: relevance}; filled by addWordArchive().
wordsDictArchive = {}
 463
 464 def cleanupWordsString(str):
 465     str = string.replace(str, ".", " ")
 466     str = string.replace(str, "!", " ")
 467     str = string.replace(str, "?", " ")
 468     str = string.replace(str, ",", " ")
 469     str = string.replace(str, "'", " ")
 470     str = string.replace(str, '"', " ")
 471     str = string.replace(str, ";", " ")
 472     str = string.replace(str, "(", " ")
 473     str = string.replace(str, ")", " ")
 474     str = string.replace(str, "{", " ")
 475     str = string.replace(str, "}", " ")
 476     str = string.replace(str, "<", " ")
 477     str = string.replace(str, ">", " ")
 478     str = string.replace(str, "=", " ")
 479     str = string.replace(str, "/", " ")
 480     str = string.replace(str, "*", " ")
 481     str = string.replace(str, ":", " ")
 482     str = string.replace(str, "#", " ")
 483     str = string.replace(str, "\\", " ")
 484     str = string.replace(str, "\n", " ")
 485     str = string.replace(str, "\r", " ")
 486     str = string.replace(str, "\xc2", " ")
 487     str = string.replace(str, "\xa0", " ")
 488     return str
 489
 490 def cleanupDescrString(str):
 491     str = string.replace(str, "'", " ")
 492     str = string.replace(str, "\n", " ")
 493     str = string.replace(str, "\r", " ")
 494     str = string.replace(str, "\xc2", " ")
 495     str = string.replace(str, "\xa0", " ")
 496     l = string.split(str)
 497     str = string.join(str)
 498     return str
 499
 500 def splitIdentifier(str):
 501     ret = []
 502     while str != "":
 503         cur = string.lower(str[0])
 504         str = str[1:]
 505         if ((cur < 'a') or (cur > 'z')):
 506             continue
 507         while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
 508             cur = cur + string.lower(str[0])
 509             str = str[1:]
 510         while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
 511             cur = cur + str[0]
 512             str = str[1:]
 513         while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
 514             str = str[1:]
 515         ret.append(cur)
 516     return ret
 517
 518 def addWord(word, module, symbol, relevance):
 519     global wordsDict
 520
 521     if word == None or len(word) < 3:
 522         return -1
 523     if module == None or symbol == None:
 524         return -1
 525     if dropWords.has_key(word):
 526         return 0
 527     if ord(word[0]) > 0x80:
 528         return 0
 529
 530     if wordsDict.has_key(word):
 531         d = wordsDict[word]
 532         if d == None:
 533             return 0
 534         if len(d) > 500:
 535             wordsDict[word] = None
 536             return 0
 537         try:
 538             relevance = relevance + d[(module, symbol)]
 539         except:
 540             pass
 541     else:
 542         wordsDict[word] = {}
 543     wordsDict[word][(module, symbol)] = relevance
 544     return relevance
 545
 546 def addString(str, module, symbol, relevance):
 547     if str == None or len(str) < 3:
 548         return -1
 549     ret = 0
 550     str = cleanupWordsString(str)
 551     l = string.split(str)
 552     for word in l:
 553         if len(word) > 2:
 554             ret = ret + addWord(word, module, symbol, 5)
 555
 556     return ret
 557
 558 def addWordHTML(word, resource, id, section, relevance):
 559     global wordsDictHTML
 560
 561     if word == None or len(word) < 3:
 562         return -1
 563     if resource == None or section == None:
 564         return -1
 565     if dropWords.has_key(word):
 566         return 0
 567     if ord(word[0]) > 0x80:
 568         return 0
 569
 570     section = cleanupDescrString(section)
 571
 572     if wordsDictHTML.has_key(word):
 573         d = wordsDictHTML[word]
 574         if d == None:
 575             print "skipped %s" % (word)
 576             return 0
 577         try:
 578             (r,i,s) = d[resource]
 579             if i != None:
 580                 id = i
 581             if s != None:
 582                 section = s
 583             relevance = relevance + r
 584         except:
 585             pass
 586     else:
 587         wordsDictHTML[word] = {}
 588     d = wordsDictHTML[word];
 589     d[resource] = (relevance, id, section)
 590     return relevance
 591
 592 def addStringHTML(str, resource, id, section, relevance):
 593     if str == None or len(str) < 3:
 594         return -1
 595     ret = 0
 596     str = cleanupWordsString(str)
 597     l = string.split(str)
 598     for word in l:
 599         if len(word) > 2:
 600             try:
 601                 r = addWordHTML(word, resource, id, section, relevance)
 602                 if r < 0:
 603                     print "addWordHTML failed: %s %s" % (word, resource)
 604                 ret = ret + r
 605             except:
 606                 print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
 607                 print sys.exc_type, sys.exc_value
 608
 609     return ret
 610
 611 def addWordArchive(word, id, relevance):
 612     global wordsDictArchive
 613
 614     if word == None or len(word) < 3:
 615         return -1
 616     if id == None or id == -1:
 617         return -1
 618     if dropWords.has_key(word):
 619         return 0
 620     if ord(word[0]) > 0x80:
 621         return 0
 622
 623     if wordsDictArchive.has_key(word):
 624         d = wordsDictArchive[word]
 625         if d == None:
 626             print "skipped %s" % (word)
 627             return 0
 628         try:
 629             r = d[id]
 630             relevance = relevance + r
 631         except:
 632             pass
 633     else:
 634         wordsDictArchive[word] = {}
 635     d = wordsDictArchive[word];
 636     d[id] = relevance
 637     return relevance
 638
 639 def addStringArchive(str, id, relevance):
 640     if str == None or len(str) < 3:
 641         return -1
 642     ret = 0
 643     str = cleanupWordsString(str)
 644     l = string.split(str)
 645     for word in l:
 646         i = len(word)
 647         if i > 2:
 648             try:
 649                 r = addWordArchive(word, id, relevance)
 650                 if r < 0:
 651                     print "addWordArchive failed: %s %s" % (word, id)
 652                 else:
 653                     ret = ret + r
 654             except:
 655                 print "addWordArchive failed: %s %s %d" % (word, id, relevance)
 656                 print sys.exc_type, sys.exc_value
 657     return ret
 658
 659 #########################################################################
 660 #                                                                       #
 661 #                  XML API description analysis                         #
 662 #                                                                       #
 663 #########################################################################
 664
 665 def loadAPI(filename):
 666     doc = libxml2.parseFile(filename)
 667     print "loaded %s" % (filename)
 668     return doc
 669
 670 def foundExport(file, symbol):
 671     if file == None:
 672         return 0
 673     if symbol == None:
 674         return 0
 675     addFunction(symbol, file)
 676     l = splitIdentifier(symbol)
 677     for word in l:
 678         addWord(word, file, symbol, 10)
 679     return 1
 680
 681 def analyzeAPIFile(top):
 682     count = 0
 683     name = top.prop("name")
 684     cur = top.children
 685     while cur != None:
 686         if cur.type == 'text':
 687             cur = cur.next
 688             continue
 689         if cur.name == "exports":
 690             count = count + foundExport(name, cur.prop("symbol"))
 691         else:
 692             print "unexpected element %s in API doc <file name='%s'>" % (name)
 693         cur = cur.next
 694     return count
 695
 696 def analyzeAPIFiles(top):
 697     count = 0
 698     cur = top.children
 699
 700     while cur != None:
 701         if cur.type == 'text':
 702             cur = cur.next
 703             continue
 704         if cur.name == "file":
 705             count = count + analyzeAPIFile(cur)
 706         else:
 707             print "unexpected element %s in API doc <files>" % (cur.name)
 708         cur = cur.next
 709     return count
 710
 711 def analyzeAPIEnum(top):
 712     file = top.prop("file")
 713     if file == None:
 714         return 0
 715     symbol = top.prop("name")
 716     if symbol == None:
 717         return 0
 718
 719     addEnum(symbol, file)
 720     l = splitIdentifier(symbol)
 721     for word in l:
 722         addWord(word, file, symbol, 10)
 723
 724     return 1
 725
 726 def analyzeAPIConst(top):
 727     file = top.prop("file")
 728     if file == None:
 729         return 0
 730     symbol = top.prop("name")
 731     if symbol == None:
 732         return 0
 733
 734     addConst(symbol, file)
 735     l = splitIdentifier(symbol)
 736     for word in l:
 737         addWord(word, file, symbol, 10)
 738
 739     return 1
 740
 741 def analyzeAPIType(top):
 742     file = top.prop("file")
 743     if file == None:
 744         return 0
 745     symbol = top.prop("name")
 746     if symbol == None:
 747         return 0
 748
 749     addType(symbol, file)
 750     l = splitIdentifier(symbol)
 751     for word in l:
 752         addWord(word, file, symbol, 10)
 753     return 1
 754
 755 def analyzeAPIFunctype(top):
 756     file = top.prop("file")
 757     if file == None:
 758         return 0
 759     symbol = top.prop("name")
 760     if symbol == None:
 761         return 0
 762
 763     addFunctype(symbol, file)
 764     l = splitIdentifier(symbol)
 765     for word in l:
 766         addWord(word, file, symbol, 10)
 767     return 1
 768
 769 def analyzeAPIStruct(top):
 770     file = top.prop("file")
 771     if file == None:
 772         return 0
 773     symbol = top.prop("name")
 774     if symbol == None:
 775         return 0
 776
 777     addStruct(symbol, file)
 778     l = splitIdentifier(symbol)
 779     for word in l:
 780         addWord(word, file, symbol, 10)
 781
 782     info = top.prop("info")
 783     if info != None:
 784         info = string.replace(info, "'", " ")
 785         info = string.strip(info)
 786         l = string.split(info)
 787         for word in l:
 788             if len(word) > 2:
 789                 addWord(word, file, symbol, 5)
 790     return 1
 791
 792 def analyzeAPIMacro(top):
 793     file = top.prop("file")
 794     if file == None:
 795         return 0
 796     symbol = top.prop("name")
 797     if symbol == None:
 798         return 0
 799     symbol = string.replace(symbol, "'", " ")
 800     symbol = string.strip(symbol)
 801
 802     info = None
 803     cur = top.children
 804     while cur != None:
 805         if cur.type == 'text':
 806             cur = cur.next
 807             continue
 808         if cur.name == "info":
 809             info = cur.content
 810             break
 811         cur = cur.next
 812
 813     l = splitIdentifier(symbol)
 814     for word in l:
 815         addWord(word, file, symbol, 10)
 816
 817     if info == None:
 818         addMacro(symbol, file)
 819         print "Macro %s description has no <info>" % (symbol)
 820         return 0
 821
 822     info = string.replace(info, "'", " ")
 823     info = string.strip(info)
 824     addMacro(symbol, file, info)
 825     l = string.split(info)
 826     for word in l:
 827         if len(word) > 2:
 828             addWord(word, file, symbol, 5)
 829     return 1
 830
 831 def analyzeAPIFunction(top):
 832     file = top.prop("file")
 833     if file == None:
 834         return 0
 835     symbol = top.prop("name")
 836     if symbol == None:
 837         return 0
 838
 839     symbol = string.replace(symbol, "'", " ")
 840     symbol = string.strip(symbol)
 841     info = None
 842     cur = top.children
 843     while cur != None:
 844         if cur.type == 'text':
 845             cur = cur.next
 846             continue
 847         if cur.name == "info":
 848             info = cur.content
 849         elif cur.name == "return":
 850             rinfo = cur.prop("info")
 851             if rinfo != None:
 852                 rinfo = string.replace(rinfo, "'", " ")
 853                 rinfo = string.strip(rinfo)
 854                 addString(rinfo, file, symbol, 7)
 855         elif cur.name == "arg":
 856             ainfo = cur.prop("info")
 857             if ainfo != None:
 858                 ainfo = string.replace(ainfo, "'", " ")
 859                 ainfo = string.strip(ainfo)
 860                 addString(ainfo, file, symbol, 5)
 861             name = cur.prop("name")
 862             if name != None:
 863                 name = string.replace(name, "'", " ")
 864                 name = string.strip(name)
 865                 addWord(name, file, symbol, 7)
 866         cur = cur.next
 867     if info == None:
 868         print "Function %s description has no <info>" % (symbol)
 869         addFunction(symbol, file, "")
 870     else:
 871         info = string.replace(info, "'", " ")
 872         info = string.strip(info)
 873         addFunction(symbol, file, info)
 874         addString(info, file, symbol, 5)
 875
 876     l = splitIdentifier(symbol)
 877     for word in l:
 878         addWord(word, file, symbol, 10)
 879
 880     return 1
 881
 882 def analyzeAPISymbols(top):
 883     count = 0
 884     cur = top.children
 885
 886     while cur != None:
 887         if cur.type == 'text':
 888             cur = cur.next
 889             continue
 890         if cur.name == "macro":
 891             count = count + analyzeAPIMacro(cur)
 892         elif cur.name == "function":
 893             count = count + analyzeAPIFunction(cur)
 894         elif cur.name == "const":
 895             count = count + analyzeAPIConst(cur)
 896         elif cur.name == "typedef":
 897             count = count + analyzeAPIType(cur)
 898         elif cur.name == "struct":
 899             count = count + analyzeAPIStruct(cur)
 900         elif cur.name == "enum":
 901             count = count + analyzeAPIEnum(cur)
 902         elif cur.name == "functype":
 903             count = count + analyzeAPIFunctype(cur)
 904         else:
 905             print "unexpected element %s in API doc <files>" % (cur.name)
 906         cur = cur.next
 907     return count
 908
 909 def analyzeAPI(doc):
 910     count = 0
 911     if doc == None:
 912         return -1
 913     root = doc.getRootElement()
 914     if root.name != "api":
 915         print "Unexpected root name"
 916         return -1
 917     cur = root.children
 918     while cur != None:
 919         if cur.type == 'text':
 920             cur = cur.next
 921             continue
 922         if cur.name == "files":
 923             pass
 924 #           count = count + analyzeAPIFiles(cur)
 925         elif cur.name == "symbols":
 926             count = count + analyzeAPISymbols(cur)
 927         else:
 928             print "unexpected element %s in API doc" % (cur.name)
 929         cur = cur.next
 930     return count
 931
 932 #########################################################################
 933 #                                                                       #
 934 #                  Web pages parsing and analysis                       #
 935 #                                                                       #
 936 #########################################################################
 937
 938 import glob
 939
 940 def analyzeHTMLText(doc, resource, p, section, id):
 941     words = 0
 942     try:
 943         content = p.content
 944         words = words + addStringHTML(content, resource, id, section, 5)
 945     except:
 946         return -1
 947     return words
 948
 949 def analyzeHTMLPara(doc, resource, p, section, id):
 950     words = 0
 951     try:
 952         content = p.content
 953         words = words + addStringHTML(content, resource, id, section, 5)
 954     except:
 955         return -1
 956     return words
 957
 958 def analyzeHTMLPre(doc, resource, p, section, id):
 959     words = 0
 960     try:
 961         content = p.content
 962         words = words + addStringHTML(content, resource, id, section, 5)
 963     except:
 964         return -1
 965     return words
 966
 967 def analyzeHTML(doc, resource, p, section, id):
 968     words = 0
 969     try:
 970         content = p.content
 971         words = words + addStringHTML(content, resource, id, section, 5)
 972     except:
 973         return -1
 974     return words
 975
 976 def analyzeHTML(doc, resource):
 977     para = 0;
 978     ctxt = doc.xpathNewContext()
 979     try:
 980         res = ctxt.xpathEval("//head/title")
 981         title = res[0].content
 982     except:
 983         title = "Page %s" % (resource)
 984     addPage(resource, title)
 985     try:
 986         items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
 987         section = title
 988         id = ""
 989         for item in items:
 990             if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
 991                 section = item.content
 992                 if item.prop("id"):
 993                     id = item.prop("id")
 994                 elif item.prop("name"):
 995                     id = item.prop("name")
 996             elif item.type == 'text':
 997                 analyzeHTMLText(doc, resource, item, section, id)
 998                 para = para + 1
 999             elif item.name == 'p':
1000                 analyzeHTMLPara(doc, resource, item, section, id)
1001                 para = para + 1
1002             elif item.name == 'pre':
1003                 analyzeHTMLPre(doc, resource, item, section, id)
1004                 para = para + 1
1005             else:
1006                 print "Page %s, unexpected %s element" % (resource, item.name)
1007     except:
1008         print "Page %s: problem analyzing" % (resource)
1009         print sys.exc_type, sys.exc_value
1010
1011     return para
1012
def analyzeHTMLPages():
    """Parse and index the HTML pages of the documentation.

    Looks at "*.html" in the current directory and in "tutorial/".  Files
    whose path starts with "API" and the file "xslt.html" are skipped.
    Each remaining file is parsed with libxml2's HTML parser and passed to
    analyzeHTML(); the parsed tree is released afterwards.  A file that
    cannot be processed is reported on stdout and otherwise ignored.

    Returns the number of pages successfully processed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xslt.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
            try:
                res = analyzeHTML(doc, html)
            finally:
                # the bindings do not free documents automatically
                doc.freeDoc()
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except:
            print("could not parse %s" % (html))
    return ret
1029
1030 #########################################################################
1031 #                                                                       #
1032 #                  Mail archives parsing and analysis                   #
1033 #                                                                       #
1034 #########################################################################
1035
1036 import time
1037
def getXMLDateArchive(t = None):
    """Build the URL of the by-date index of a monthly mail archive.

    t is a time in seconds since the epoch; when it is None the current
    time is used.  The year and the full month name are taken from the
    UTC breakdown of that time.

    Returns the URL as a string.
    """
    when = t
    if when is None:
        when = time.time()
    stamp = time.gmtime(when)
    return "http://mail.gnome.org/archives/xslt/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1046
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived mail message and index its words.

    url is the location of the message page and title its subject.  The
    message is looked up with checkXMLMsgArchive(); an ID of -1 is treated
    as "not registered yet" and the message is then registered through
    addXMLMsgArchive().  A message that is already registered is only
    processed again when force is non-zero.

    The title is indexed with the constant 20 and every text node found
    below a <pre> element with the constant 5 (presumably relevance
    weights -- confirm against addStringArchive).  The parsed document and
    the XPath context are released before returning.

    Returns 1 when the message was indexed, 0 when it was skipped, could
    not be registered or could not be parsed.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            # XPath contexts must be released explicitly
            ctxt.xpathFreeContext()
    finally:
        # so must documents; one is parsed per message
        doc.freeDoc()

    return 1
1076
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed in one monthly archive index page.

    t selects the month (see getXMLDateArchive(); None means now) and
    force is passed on to scanXMLMsgArchive().  The global dictionary
    wordsDictArchive is reset to an empty dictionary first.  Every link of
    the index page whose href starts with "msg" is resolved against the
    page URL and scanned; a leading 'Re: ', '[xml] ' and '[xslt] ' are
    stripped from the link text, in that order, to obtain the title.

    A message that fails is reported on stdout and does not stop the scan.
    The parsed index page and its XPath context are released before
    returning.

    Returns the sum of the scanXMLMsgArchive() results, or -1 when the
    index page cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None:
                        for prefix in ('Re: ', '[xml] ', '[xslt] '):
                            if title[0:len(prefix)] == prefix:
                                title = title[len(prefix):]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except:
                    # keep going, but do not hide the failure
                    print("Failed to index message %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1116
1117
1118 #########################################################################
1119 #                                                                       #
1120 #          Main code: open the DB, the API XML and analyze it           #
1121 #                                                                       #
1122 #########################################################################
# The database connection is opened as soon as the module is loaded;
# without it nothing below can work, so any failure ends the script.
try:
    openMySQL()
except:
    failure = sys.exc_info()[:2]
    print("Failed to open the database")
    print("%s %s" % failure)
    sys.exit(1)
1129
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the result.

    t and force are passed unchanged to scanXMLDateArchive(), which fills
    the global dictionary wordsDictArchive.  Each (word, id, relevance)
    association found there is then written out with updateWordArchive();
    words whose entry is None are ignored.  Progress is printed on stdout.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1

    # this used to say "in HTML pages", copied from analyzeHTMLTop()
    print("Found %d associations in archive pages" % (i))
1149
def analyzeHTMLTop():
    """Index the HTML documentation pages and store the result.

    Runs analyzeHTMLPages(), then writes every association held in the
    global dictionary wordsDictHTML with updateWordHTML().  Words whose
    entry is None are ignored.  Progress is printed on stdout.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, entry in refs.items():
            # each entry is a (relevance, id, section) triple
            relevance, id, section = entry
            updateWordHTML(word, resource, section, id, relevance)
            stored = stored + 1

    print("Found %d associations in HTML pages" % (stored))
1169
def analyzeAPITop():
    """Index the API description and store the result.

    Loads the file named by the global API with loadAPI(), analyzes it
    with analyzeAPI() and frees the document.  Any failure in that phase
    is reported on stdout and terminates the script with exit status 1.
    Every association held in the global dictionary wordsDict is then
    written with updateWord(); words whose entry is None are counted as
    skipped.  Progress is printed on stdout.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        blocks = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocks))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    stored = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        # keys are (module, symbol) pairs; only the symbol is stored
        for key, relevance in refs.items():
            updateWord(word, key[1], relevance)
            stored = stored + 1

    print("Found %d associations, skipped %d words" % (stored, skipped))
1197
def usage():
    """Print the command line synopsis and terminate with exit status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1201
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month given as "<year>-<month name>".

    Failures (bad date, indexing error) are reported on stdout and not
    propagated.
    """
    try:
        T = time.strptime(spec, "%Y-%B")
        # move ten days into the month, presumably so that the UTC date
        # computed later still falls in the requested month
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Process the command line options in the order they are given.

    --force               re-index already known messages; only affects
                          the options that follow it
    --archive             index the mail archive of the current month
    --archive-year YEAR   index the twelve monthly archives of YEAR
    --archive-month SPEC  index one month, SPEC being "<year>-<month name>"
    --API                 index the API description
    --docs                index the HTML pages

    Without arguments, with an unknown option, or when --archive-year or
    --archive-month is not followed by a value, the usage message is
    printed and the script exits with status 1.
    """
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    args = sys.argv[1:]
    if not args:
        usage()

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # option without its value; usage() exits the script
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()