/ libxml2 / doc / index.py
index.py
   1  #!/usr/bin/python -u
   2  #
   3  # imports the API description and fills up a database with
   4  # name relevance to modules, functions or web pages
   5  #
   6  # Operation needed:
   7  # =================
   8  #
   9  # install mysqld, the python wrappers for mysql and libxml2, start mysqld
  10  # Change the root passwd of mysql:
  11  #    mysqladmin -u root password new_password
  12  # Create the new database xmlsoft
  13  #    mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
  15  # change veillard and abcde with the right user name and passwd
  16  #    mysql -p
  17  #    password:
  18  #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
  19  #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
  20  #
  21  # As the user check the access:
  22  #    mysql -p xmlsoft
  23  #    Enter password:
  24  #    Welcome to the MySQL monitor....
  25  #    mysql> use xmlsoft
  26  #    Database changed
  27  #    mysql> quit
  28  #    Bye
  29  #
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
  34  #
  35  # On the Apache configuration, make sure you have php support enabled
  36  #
  37  
  38  import MySQLdb
  39  import libxml2
  40  import sys
  41  import string
  42  import os
  43  
  44  #
  45  # We are not interested in parsing errors here
  46  #
  47  def callback(ctx, str):
  48      return
# Hand libxml2 the do-nothing callback as its error handler; the second
# argument is presumably the ctx value passed back to it (unused here).
libxml2.registerErrorHandler(callback, None)
  50  
  51  #
  52  # The dictionary of tables required and the SQL command needed
  53  # to create them
  54  #
  55  TABLES={
  56    "symbols" : """CREATE TABLE symbols (
  57             name varchar(255) BINARY NOT NULL,
  58  	   module varchar(255) BINARY NOT NULL,
  59             type varchar(25) NOT NULL,
  60  	   descr varchar(255),
  61  	   UNIQUE KEY name (name),
  62  	   KEY module (module))""",
  63    "words" : """CREATE TABLE words (
  64             name varchar(50) BINARY NOT NULL,
  65  	   symbol varchar(255) BINARY NOT NULL,
  66             relevance int,
  67  	   KEY name (name),
  68  	   KEY symbol (symbol),
  69  	   UNIQUE KEY ID (name, symbol))""",
  70    "wordsHTML" : """CREATE TABLE wordsHTML (
  71             name varchar(50) BINARY NOT NULL,
  72  	   resource varchar(255) BINARY NOT NULL,
  73  	   section varchar(255),
  74  	   id varchar(50),
  75             relevance int,
  76  	   KEY name (name),
  77  	   KEY resource (resource),
  78  	   UNIQUE KEY ref (name, resource))""",
  79    "wordsArchive" : """CREATE TABLE wordsArchive (
  80             name varchar(50) BINARY NOT NULL,
  81  	   ID int(11) NOT NULL,
  82             relevance int,
  83  	   KEY name (name),
  84  	   UNIQUE KEY ref (name, ID))""",
  85    "pages" : """CREATE TABLE pages (
  86             resource varchar(255) BINARY NOT NULL,
  87  	   title varchar(255) BINARY NOT NULL,
  88  	   UNIQUE KEY name (resource))""",
  89    "archives" : """CREATE TABLE archives (
  90             ID int(11) NOT NULL auto_increment,
  91             resource varchar(255) BINARY NOT NULL,
  92  	   title varchar(255) BINARY NOT NULL,
  93  	   UNIQUE KEY id (ID,resource(255)),
  94  	   INDEX (ID),
  95  	   INDEX (resource))""",
  96    "Queries" : """CREATE TABLE Queries (
  97             ID int(11) NOT NULL auto_increment,
  98  	   Value varchar(50) NOT NULL,
  99  	   Count int(11) NOT NULL,
 100  	   UNIQUE KEY id (ID,Value(35)),
 101  	   INDEX (ID))""",
 102    "AllQueries" : """CREATE TABLE AllQueries (
 103             ID int(11) NOT NULL auto_increment,
 104  	   Value varchar(50) NOT NULL,
 105  	   Count int(11) NOT NULL,
 106  	   UNIQUE KEY id (ID,Value(35)),
 107  	   INDEX (ID))""",
 108  }
 109  
 110  #
 111  # The XML API description file to parse
 112  #
 113  API="libxml2-api.xml"
 114  DB=None
 115  
 116  #########################################################################
 117  #									#
 118  #                  MySQL database interfaces				#
 119  #									#
 120  #########################################################################
 121  def createTable(db, name):
 122      global TABLES
 123  
 124      if db == None:
 125          return -1
 126      if name == None:
 127          return -1
 128      c = db.cursor()
 129  
 130      ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
 131      if ret == 1:
 132          print "Removed table %s" % (name)
 133      print "Creating table %s" % (name)
 134      try:
 135          ret = c.execute(TABLES[name])
 136      except:
 137          print "Failed to create table %s" % (name)
 138  	return -1
 139      return ret
 140  
 141  def checkTables(db, verbose = 1):
 142      global TABLES
 143  
 144      if db == None:
 145          return -1
 146      c = db.cursor()
 147      nbtables = c.execute("show tables")
 148      if verbose:
 149  	print "Found %d tables" % (nbtables)
 150      tables = {}
 151      i = 0
 152      while i < nbtables:
 153          l = c.fetchone()
 154  	name = l[0]
 155  	tables[name] = {}
 156          i = i + 1
 157  
 158      for table in TABLES.keys():
 159          if not tables.has_key(table):
 160  	    print "table %s missing" % (table)
 161  	    createTable(db, table)
 162  	try:
 163  	    ret = c.execute("SELECT count(*) from %s" % table);
 164  	    row = c.fetchone()
 165  	    if verbose:
 166  		print "Table %s contains %d records" % (table, row[0])
 167  	except:
 168  	    print "Troubles with table %s : repairing" % (table)
 169  	    ret = c.execute("repair table %s" % table);
 170  	    print "repairing returned %d" % (ret)
 171  	    ret = c.execute("SELECT count(*) from %s" % table);
 172  	    row = c.fetchone()
 173  	    print "Table %s contains %d records" % (table, row[0])
 174      if verbose:
 175  	print "checkTables finished"
 176  
 177      # make sure apache can access the tables read-only
 178      try:
 179  	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
 180  	ret = c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
 181      except:
 182          pass
 183      return 0
 184      
 185  def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
 186      global DB
 187  
 188      if passwd == None:
 189          try:
 190  	    passwd = os.environ["MySQL_PASS"]
 191  	except:
 192  	    print "No password available, set environment MySQL_PASS"
 193  	    sys.exit(1)
 194  
 195      DB = MySQLdb.connect(passwd=passwd, db=db)
 196      if DB == None:
 197          return -1
 198      ret = checkTables(DB, verbose)
 199      return ret
 200  
 201  def updateWord(name, symbol, relevance):
 202      global DB
 203  
 204      if DB == None:
 205          openMySQL()
 206      if DB == None:
 207          return -1
 208      if name == None:
 209          return -1
 210      if symbol == None:
 211          return -1
 212  
 213      c = DB.cursor()
 214      try:
 215  	ret = c.execute(
 216  """INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
 217  		(name, symbol, relevance))
 218      except:
 219          try:
 220  	    ret = c.execute(
 221      """UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
 222  		    (relevance, name, symbol))
 223  	except:
 224  	    print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
 225  	    print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
 226  	    print sys.exc_type, sys.exc_value
 227  	    return -1
 228  	     
 229      return ret
 230  
 231  def updateSymbol(name, module, type, desc):
 232      global DB
 233  
 234      updateWord(name, name, 50)
 235      if DB == None:
 236          openMySQL()
 237      if DB == None:
 238          return -1
 239      if name == None:
 240          return -1
 241      if module == None:
 242          return -1
 243      if type == None:
 244          return -1
 245  
 246      try:
 247  	desc = string.replace(desc, "'", " ")
 248  	l = string.split(desc, ".")
 249  	desc = l[0]
 250  	desc = desc[0:99]
 251      except:
 252          desc = ""
 253  
 254      c = DB.cursor()
 255      try:
 256  	ret = c.execute(
 257  """INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
 258                      (name, module, type, desc))
 259      except:
 260          try:
 261  	    ret = c.execute(
 262  """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
 263                      (module, type, desc, name))
 264          except:
 265  	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 266  	    print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
 267  	    print sys.exc_type, sys.exc_value
 268  	    return -1
 269  	     
 270      return ret
 271          
 272  def addFunction(name, module, desc = ""):
 273      return updateSymbol(name, module, 'function', desc)
 274  
 275  def addMacro(name, module, desc = ""):
 276      return updateSymbol(name, module, 'macro', desc)
 277  
 278  def addEnum(name, module, desc = ""):
 279      return updateSymbol(name, module, 'enum', desc)
 280  
 281  def addStruct(name, module, desc = ""):
 282      return updateSymbol(name, module, 'struct', desc)
 283  
 284  def addConst(name, module, desc = ""):
 285      return updateSymbol(name, module, 'const', desc)
 286  
 287  def addType(name, module, desc = ""):
 288      return updateSymbol(name, module, 'type', desc)
 289  
 290  def addFunctype(name, module, desc = ""):
 291      return updateSymbol(name, module, 'functype', desc)
 292  
 293  def addPage(resource, title):
 294      global DB
 295  
 296      if DB == None:
 297          openMySQL()
 298      if DB == None:
 299          return -1
 300      if resource == None:
 301          return -1
 302  
 303      c = DB.cursor()
 304      try:
 305  	ret = c.execute(
 306  	    """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
 307                      (resource, title))
 308      except:
 309          try:
 310  	    ret = c.execute(
 311  		"""UPDATE pages SET title='%s' WHERE resource='%s'""" %
 312                      (title, resource))
 313          except:
 314  	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
 315  	    print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
 316  	    print sys.exc_type, sys.exc_value
 317  	    return -1
 318  	     
 319      return ret
 320  
 321  def updateWordHTML(name, resource, desc, id, relevance):
 322      global DB
 323  
 324      if DB == None:
 325          openMySQL()
 326      if DB == None:
 327          return -1
 328      if name == None:
 329          return -1
 330      if resource == None:
 331          return -1
 332      if id == None:
 333          id = ""
 334      if desc == None:
 335          desc = ""
 336      else:
 337  	try:
 338  	    desc = string.replace(desc, "'", " ")
 339  	    desc = desc[0:99]
 340  	except:
 341  	    desc = ""
 342  
 343      c = DB.cursor()
 344      try:
 345  	ret = c.execute(
 346  """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
 347                      (name, resource, desc, id, relevance))
 348      except:
 349          try:
 350  	    ret = c.execute(
 351  """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
 352                      (desc, id, relevance, name, resource))
 353          except:
 354  	    print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
 355  	    print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
 356  	    print sys.exc_type, sys.exc_value
 357  	    return -1
 358  	     
 359      return ret
 360  
 361  def checkXMLMsgArchive(url):
 362      global DB
 363  
 364      if DB == None:
 365          openMySQL()
 366      if DB == None:
 367          return -1
 368      if url == None:
 369          return -1
 370  
 371      c = DB.cursor()
 372      try:
 373  	ret = c.execute(
 374  	    """SELECT ID FROM archives WHERE resource='%s'""" % (url))
 375  	row = c.fetchone()
 376  	if row == None:
 377  	    return -1
 378      except:
 379  	return -1
 380  	     
 381      return row[0]
 382      
 383  def addXMLMsgArchive(url, title):
 384      global DB
 385  
 386      if DB == None:
 387          openMySQL()
 388      if DB == None:
 389          return -1
 390      if url == None:
 391          return -1
 392      if title == None:
 393          title = ""
 394      else:
 395  	title = string.replace(title, "'", " ")
 396  	title = title[0:99]
 397  
 398      c = DB.cursor()
 399      try:
 400          cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
 401          ret = c.execute(cmd)
 402  	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
 403          ret = c.execute(cmd)
 404  	row = c.fetchone()
 405  	if row == None:
 406  	    print "addXMLMsgArchive failed to get the ID: %s" % (url)
 407  	    return -1
 408      except:
 409          print "addXMLMsgArchive failed command: %s" % (cmd)
 410  	return -1
 411  	     
 412      return((int)(row[0]))
 413  
 414  def updateWordArchive(name, id, relevance):
 415      global DB
 416  
 417      if DB == None:
 418          openMySQL()
 419      if DB == None:
 420          return -1
 421      if name == None:
 422          return -1
 423      if id == None:
 424          return -1
 425  
 426      c = DB.cursor()
 427      try:
 428  	ret = c.execute(
 429  """INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
 430                      (name, id, relevance))
 431      except:
 432          try:
 433  	    ret = c.execute(
 434  """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
 435                      (relevance, name, id))
 436          except:
 437  	    print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
 438  	    print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
 439  	    print sys.exc_type, sys.exc_value
 440  	    return -1
 441  
 442      return ret
 443  
 444  #########################################################################
 445  #									#
 446  #                  Word dictionary and analysis routines		#
 447  #									#
 448  #########################################################################
 449  
 450  #
# Top 100 English words, without those shorter than 3 letters, plus our own set
 452  #
 453  dropWords = {
 454      'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
 455      'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
 456      'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
 457      'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
 458      'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
 459      'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
 460      'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
 461      'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
 462      'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
 463      'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
 464      'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
 465      'down':0,
 466      'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
 467  }
 468  
 469  wordsDict = {}
 470  wordsDictHTML = {}
 471  wordsDictArchive = {}
 472  
 473  def cleanupWordsString(str):
 474      str = string.replace(str, ".", " ")
 475      str = string.replace(str, "!", " ")
 476      str = string.replace(str, "?", " ")
 477      str = string.replace(str, ",", " ")
 478      str = string.replace(str, "'", " ")
 479      str = string.replace(str, '"', " ")
 480      str = string.replace(str, ";", " ")
 481      str = string.replace(str, "(", " ")
 482      str = string.replace(str, ")", " ")
 483      str = string.replace(str, "{", " ")
 484      str = string.replace(str, "}", " ")
 485      str = string.replace(str, "<", " ")
 486      str = string.replace(str, ">", " ")
 487      str = string.replace(str, "=", " ")
 488      str = string.replace(str, "/", " ")
 489      str = string.replace(str, "*", " ")
 490      str = string.replace(str, ":", " ")
 491      str = string.replace(str, "#", " ")
 492      str = string.replace(str, "\\", " ")
 493      str = string.replace(str, "\n", " ")
 494      str = string.replace(str, "\r", " ")
 495      str = string.replace(str, "\xc2", " ")
 496      str = string.replace(str, "\xa0", " ")
 497      return str
 498      
 499  def cleanupDescrString(str):
 500      str = string.replace(str, "'", " ")
 501      str = string.replace(str, "\n", " ")
 502      str = string.replace(str, "\r", " ")
 503      str = string.replace(str, "\xc2", " ")
 504      str = string.replace(str, "\xa0", " ")
 505      l = string.split(str)
 506      str = string.join(str)
 507      return str
 508  
 509  def splitIdentifier(str):
 510      ret = []
 511      while str != "":
 512          cur = string.lower(str[0])
 513  	str = str[1:]
 514  	if ((cur < 'a') or (cur > 'z')):
 515  	    continue
 516  	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
 517  	    cur = cur + string.lower(str[0])
 518  	    str = str[1:]
 519  	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
 520  	    cur = cur + str[0]
 521  	    str = str[1:]
 522  	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
 523  	    str = str[1:]
 524  	ret.append(cur)
 525      return ret
 526  
 527  def addWord(word, module, symbol, relevance):
 528      global wordsDict
 529  
 530      if word == None or len(word) < 3:
 531          return -1
 532      if module == None or symbol == None:
 533          return -1
 534      if dropWords.has_key(word):
 535          return 0
 536      if ord(word[0]) > 0x80:
 537          return 0
 538  
 539      if wordsDict.has_key(word):
 540          d = wordsDict[word]
 541  	if d == None:
 542  	    return 0
 543  	if len(d) > 500:
 544  	    wordsDict[word] = None
 545  	    return 0
 546  	try:
 547  	    relevance = relevance + d[(module, symbol)]
 548  	except:
 549  	    pass
 550      else:
 551          wordsDict[word] = {}
 552      wordsDict[word][(module, symbol)] = relevance
 553      return relevance
 554      
 555  def addString(str, module, symbol, relevance):
 556      if str == None or len(str) < 3:
 557          return -1
 558      ret = 0
 559      str = cleanupWordsString(str)
 560      l = string.split(str)
 561      for word in l:
 562  	if len(word) > 2:
 563  	    ret = ret + addWord(word, module, symbol, 5)
 564  
 565      return ret
 566  
 567  def addWordHTML(word, resource, id, section, relevance):
 568      global wordsDictHTML
 569  
 570      if word == None or len(word) < 3:
 571          return -1
 572      if resource == None or section == None:
 573          return -1
 574      if dropWords.has_key(word):
 575          return 0
 576      if ord(word[0]) > 0x80:
 577          return 0
 578  
 579      section = cleanupDescrString(section)
 580  
 581      if wordsDictHTML.has_key(word):
 582          d = wordsDictHTML[word]
 583  	if d == None:
 584  	    print "skipped %s" % (word)
 585  	    return 0
 586  	try:
 587  	    (r,i,s) = d[resource]
 588  	    if i != None:
 589  	        id = i
 590  	    if s != None:
 591  	        section = s
 592  	    relevance = relevance + r
 593  	except:
 594  	    pass
 595      else:
 596          wordsDictHTML[word] = {}
 597      d = wordsDictHTML[word];
 598      d[resource] = (relevance, id, section)
 599      return relevance
 600      
 601  def addStringHTML(str, resource, id, section, relevance):
 602      if str == None or len(str) < 3:
 603          return -1
 604      ret = 0
 605      str = cleanupWordsString(str)
 606      l = string.split(str)
 607      for word in l:
 608  	if len(word) > 2:
 609  	    try:
 610  		r = addWordHTML(word, resource, id, section, relevance)
 611  		if r < 0:
 612  		    print "addWordHTML failed: %s %s" % (word, resource)
 613  		ret = ret + r
 614  	    except:
 615  		print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
 616  		print sys.exc_type, sys.exc_value
 617  
 618      return ret
 619  
 620  def addWordArchive(word, id, relevance):
 621      global wordsDictArchive
 622  
 623      if word == None or len(word) < 3:
 624          return -1
 625      if id == None or id == -1:
 626          return -1
 627      if dropWords.has_key(word):
 628          return 0
 629      if ord(word[0]) > 0x80:
 630          return 0
 631  
 632      if wordsDictArchive.has_key(word):
 633          d = wordsDictArchive[word]
 634  	if d == None:
 635  	    print "skipped %s" % (word)
 636  	    return 0
 637  	try:
 638  	    r = d[id]
 639  	    relevance = relevance + r
 640  	except:
 641  	    pass
 642      else:
 643          wordsDictArchive[word] = {}
 644      d = wordsDictArchive[word];
 645      d[id] = relevance
 646      return relevance
 647      
 648  def addStringArchive(str, id, relevance):
 649      if str == None or len(str) < 3:
 650          return -1
 651      ret = 0
 652      str = cleanupWordsString(str)
 653      l = string.split(str)
 654      for word in l:
 655          i = len(word)
 656  	if i > 2:
 657  	    try:
 658  		r = addWordArchive(word, id, relevance)
 659  		if r < 0:
 660  		    print "addWordArchive failed: %s %s" % (word, id)
 661  		else:
 662  		    ret = ret + r
 663  	    except:
 664  		print "addWordArchive failed: %s %s %d" % (word, id, relevance)
 665  		print sys.exc_type, sys.exc_value
 666      return ret
 667  
 668  #########################################################################
 669  #									#
 670  #                  XML API description analysis				#
 671  #									#
 672  #########################################################################
 673  
 674  def loadAPI(filename):
 675      doc = libxml2.parseFile(filename)
 676      print "loaded %s" % (filename)
 677      return doc
 678  
 679  def foundExport(file, symbol):
 680      if file == None:
 681          return 0
 682      if symbol == None:
 683          return 0
 684      addFunction(symbol, file)
 685      l = splitIdentifier(symbol)
 686      for word in l:
 687  	addWord(word, file, symbol, 10)
 688      return 1
 689       
 690  def analyzeAPIFile(top):
 691      count = 0
 692      name = top.prop("name")
 693      cur = top.children
 694      while cur != None:
 695          if cur.type == 'text':
 696  	    cur = cur.next
 697  	    continue
 698  	if cur.name == "exports":
 699  	    count = count + foundExport(name, cur.prop("symbol"))
 700  	else:
 701  	    print "unexpected element %s in API doc <file name='%s'>" % (name)
 702          cur = cur.next
 703      return count
 704  
 705  def analyzeAPIFiles(top):
 706      count = 0
 707      cur = top.children
 708          
 709      while cur != None:
 710          if cur.type == 'text':
 711  	    cur = cur.next
 712  	    continue
 713  	if cur.name == "file":
 714  	    count = count + analyzeAPIFile(cur)
 715  	else:
 716  	    print "unexpected element %s in API doc <files>" % (cur.name)
 717          cur = cur.next
 718      return count
 719  
 720  def analyzeAPIEnum(top):
 721      file = top.prop("file")
 722      if file == None:
 723          return 0
 724      symbol = top.prop("name")
 725      if symbol == None:
 726          return 0
 727  
 728      addEnum(symbol, file)
 729      l = splitIdentifier(symbol)
 730      for word in l:
 731  	addWord(word, file, symbol, 10)
 732  
 733      return 1
 734  
 735  def analyzeAPIConst(top):
 736      file = top.prop("file")
 737      if file == None:
 738          return 0
 739      symbol = top.prop("name")
 740      if symbol == None:
 741          return 0
 742  
 743      addConst(symbol, file)
 744      l = splitIdentifier(symbol)
 745      for word in l:
 746  	addWord(word, file, symbol, 10)
 747  
 748      return 1
 749  
 750  def analyzeAPIType(top):
 751      file = top.prop("file")
 752      if file == None:
 753          return 0
 754      symbol = top.prop("name")
 755      if symbol == None:
 756          return 0
 757  
 758      addType(symbol, file)
 759      l = splitIdentifier(symbol)
 760      for word in l:
 761  	addWord(word, file, symbol, 10)
 762      return 1
 763  
 764  def analyzeAPIFunctype(top):
 765      file = top.prop("file")
 766      if file == None:
 767          return 0
 768      symbol = top.prop("name")
 769      if symbol == None:
 770          return 0
 771  
 772      addFunctype(symbol, file)
 773      l = splitIdentifier(symbol)
 774      for word in l:
 775  	addWord(word, file, symbol, 10)
 776      return 1
 777  
 778  def analyzeAPIStruct(top):
 779      file = top.prop("file")
 780      if file == None:
 781          return 0
 782      symbol = top.prop("name")
 783      if symbol == None:
 784          return 0
 785  
 786      addStruct(symbol, file)
 787      l = splitIdentifier(symbol)
 788      for word in l:
 789  	addWord(word, file, symbol, 10)
 790  
 791      info = top.prop("info")
 792      if info != None:
 793  	info = string.replace(info, "'", " ")
 794  	info = string.strip(info)
 795  	l = string.split(info)
 796  	for word in l:
 797  	    if len(word) > 2:
 798  		addWord(word, file, symbol, 5)
 799      return 1
 800  
 801  def analyzeAPIMacro(top):
 802      file = top.prop("file")
 803      if file == None:
 804          return 0
 805      symbol = top.prop("name")
 806      if symbol == None:
 807          return 0
 808      symbol = string.replace(symbol, "'", " ")
 809      symbol = string.strip(symbol)
 810  
 811      info = None
 812      cur = top.children
 813      while cur != None:
 814          if cur.type == 'text':
 815  	    cur = cur.next
 816  	    continue
 817  	if cur.name == "info":
 818  	    info = cur.content
 819  	    break
 820          cur = cur.next
 821  
 822      l = splitIdentifier(symbol)
 823      for word in l:
 824  	addWord(word, file, symbol, 10)
 825  
 826      if info == None:
 827  	addMacro(symbol, file)
 828          print "Macro %s description has no <info>" % (symbol)
 829          return 0
 830  
 831      info = string.replace(info, "'", " ")
 832      info = string.strip(info)
 833      addMacro(symbol, file, info)
 834      l = string.split(info)
 835      for word in l:
 836  	if len(word) > 2:
 837  	    addWord(word, file, symbol, 5)
 838      return 1
 839  
 840  def analyzeAPIFunction(top):
 841      file = top.prop("file")
 842      if file == None:
 843          return 0
 844      symbol = top.prop("name")
 845      if symbol == None:
 846          return 0
 847  
 848      symbol = string.replace(symbol, "'", " ")
 849      symbol = string.strip(symbol)
 850      info = None
 851      cur = top.children
 852      while cur != None:
 853          if cur.type == 'text':
 854  	    cur = cur.next
 855  	    continue
 856  	if cur.name == "info":
 857  	    info = cur.content
 858  	elif cur.name == "return":
 859  	    rinfo = cur.prop("info")
 860  	    if rinfo != None:
 861  		rinfo = string.replace(rinfo, "'", " ")
 862  		rinfo = string.strip(rinfo)
 863  	        addString(rinfo, file, symbol, 7)
 864  	elif cur.name == "arg":
 865  	    ainfo = cur.prop("info")
 866  	    if ainfo != None:
 867  		ainfo = string.replace(ainfo, "'", " ")
 868  		ainfo = string.strip(ainfo)
 869  	        addString(ainfo, file, symbol, 5)
 870  	    name = cur.prop("name")
 871  	    if name != None:
 872  		name = string.replace(name, "'", " ")
 873  		name = string.strip(name)
 874  	        addWord(name, file, symbol, 7)
 875          cur = cur.next
 876      if info == None:
 877          print "Function %s description has no <info>" % (symbol)
 878  	addFunction(symbol, file, "")
 879      else:
 880          info = string.replace(info, "'", " ")
 881  	info = string.strip(info)
 882  	addFunction(symbol, file, info)
 883          addString(info, file, symbol, 5)
 884  
 885      l = splitIdentifier(symbol)
 886      for word in l:
 887  	addWord(word, file, symbol, 10)
 888  
 889      return 1
 890  
 891  def analyzeAPISymbols(top):
 892      count = 0
 893      cur = top.children
 894          
 895      while cur != None:
 896          if cur.type == 'text':
 897  	    cur = cur.next
 898  	    continue
 899  	if cur.name == "macro":
 900  	    count = count + analyzeAPIMacro(cur)
 901  	elif cur.name == "function":
 902  	    count = count + analyzeAPIFunction(cur)
 903  	elif cur.name == "const":
 904  	    count = count + analyzeAPIConst(cur)
 905  	elif cur.name == "typedef":
 906  	    count = count + analyzeAPIType(cur)
 907  	elif cur.name == "struct":
 908  	    count = count + analyzeAPIStruct(cur)
 909  	elif cur.name == "enum":
 910  	    count = count + analyzeAPIEnum(cur)
 911  	elif cur.name == "functype":
 912  	    count = count + analyzeAPIFunctype(cur)
 913  	else:
 914  	    print "unexpected element %s in API doc <files>" % (cur.name)
 915          cur = cur.next
 916      return count
 917  
 918  def analyzeAPI(doc):
 919      count = 0
 920      if doc == None:
 921          return -1
 922      root = doc.getRootElement()
 923      if root.name != "api":
 924          print "Unexpected root name"
 925          return -1
 926      cur = root.children
 927      while cur != None:
 928          if cur.type == 'text':
 929  	    cur = cur.next
 930  	    continue
 931  	if cur.name == "files":
 932  	    pass
 933  #	    count = count + analyzeAPIFiles(cur)
 934  	elif cur.name == "symbols":
 935  	    count = count + analyzeAPISymbols(cur)
 936  	else:
 937  	    print "unexpected element %s in API doc" % (cur.name)
 938          cur = cur.next
 939      return count
 940  
 941  #########################################################################
 942  #									#
 943  #                  Web pages parsing and analysis			#
 944  #									#
 945  #########################################################################
 946  
 947  import glob
 948  
 949  def analyzeHTMLText(doc, resource, p, section, id):
 950      words = 0
 951      try:
 952  	content = p.content
 953  	words = words + addStringHTML(content, resource, id, section, 5)
 954      except:
 955          return -1
 956      return words
 957  
 958  def analyzeHTMLPara(doc, resource, p, section, id):
 959      words = 0
 960      try:
 961  	content = p.content
 962  	words = words + addStringHTML(content, resource, id, section, 5)
 963      except:
 964          return -1
 965      return words
 966  
 967  def analyzeHTMLPre(doc, resource, p, section, id):
 968      words = 0
 969      try:
 970  	content = p.content
 971  	words = words + addStringHTML(content, resource, id, section, 5)
 972      except:
 973          return -1
 974      return words
 975  
 976  def analyzeHTML(doc, resource, p, section, id):
 977      words = 0
 978      try:
 979  	content = p.content
 980  	words = words + addStringHTML(content, resource, id, section, 5)
 981      except:
 982          return -1
 983      return words
 984  
 985  def analyzeHTML(doc, resource):
 986      para = 0;
 987      ctxt = doc.xpathNewContext()
 988      try:
 989  	res = ctxt.xpathEval("//head/title")
 990  	title = res[0].content
 991      except:
 992          title = "Page %s" % (resource)
 993      addPage(resource, title)
 994      try:
 995  	items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
 996  	section = title
 997  	id = ""
 998  	for item in items:
 999  	    if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
1000  	        section = item.content
1001  		if item.prop("id"):
1002  		    id = item.prop("id")
1003  		elif item.prop("name"):
1004  		    id = item.prop("name")
1005  	    elif item.type == 'text':
1006  	        analyzeHTMLText(doc, resource, item, section, id)
1007  		para = para + 1
1008  	    elif item.name == 'p':
1009  	        analyzeHTMLPara(doc, resource, item, section, id)
1010  		para = para + 1
1011  	    elif item.name == 'pre':
1012  	        analyzeHTMLPre(doc, resource, item, section, id)
1013  		para = para + 1
1014  	    else:
1015  	        print "Page %s, unexpected %s element" % (resource, item.name)
1016      except:
1017          print "Page %s: problem analyzing" % (resource)
1018  	print sys.exc_type, sys.exc_value
1019  
1020      return para
1021  
def _loadHTMLDoc(path):
    """Parse `path` as XML, then as HTML; return the document or None."""
    try:
        return libxml2.parseFile(path)
    except Exception:
        pass
    try:
        return libxml2.htmlParseFile(path, None)
    except Exception:
        return None

def analyzeHTMLPages():
    """Index every HTML page of the current and tutorial/ directories.

    Files whose name starts with "API" and the file "xml.html" are
    skipped.  Each remaining page is parsed (XML parser first, HTML
    parser as fallback), handed to analyzeHTML() and then freed.  A page
    which cannot be parsed or analyzed is reported on stdout and does not
    stop the scan.

    Returns the number of pages successfully analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html.startswith("API") or html == "xml.html":
            continue
        doc = _loadHTMLDoc(html)
        if doc is None:
            print("could not parse %s" % (html))
            continue
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # release the tree, as analyzeAPITop() does for the API document
            doc.freeDoc()
    return ret
1041  
1042  #########################################################################
1043  #									#
1044  #                  Mail archives parsing and analysis			#
1045  #									#
1046  #########################################################################
1047  
1048  import time
1049  
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list for one month.

    t -- time in seconds since the epoch; the current time when None

    The month and year are those of `t` expressed in UTC.  The month is
    spelled with a fixed English name instead of strftime("%B") so the
    URL does not change with the process locale.
    """
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    year = T[0]
    month = months[T[1] - 1]    # T[1] is the month number, 1..12
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url
1058  
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived mail message and index its title and body.

    url   -- location of the message page
    title -- subject of the message
    force -- when 0, a message for which checkXMLMsgArchive() returns
             something other than -1 is left alone

    The title is passed to addStringArchive() with the value 20 and
    every text node found below a <pre> element with the value 5
    (presumably relevance weights -- TODO confirm against
    addStringArchive).

    Returns 1 when the message was loaded and indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    # NOTE(review): the message is registered before it is fetched, so a
    # failed download presumably leaves an entry which later runs without
    # --force will skip -- confirm against addXMLMsgArchive.
    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        # one document is parsed per message: free it to bound memory use
        doc.freeDoc()

    return 1
1088  
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed in one monthly mail archive index.

    t     -- time selecting the month, passed to getXMLDateArchive()
    force -- passed unchanged to scanXMLMsgArchive()

    The global wordsDictArchive is reset to an empty dictionary first.
    Every link of the index page whose href starts with "msg" is resolved
    against the index URL and handed to scanXMLMsgArchive(), after one
    leading "Re: " and then one leading "[xml] " have been removed from
    its title.  A message which fails is reported on stdout and skipped.

    Returns the sum of the scanXMLMsgArchive() results, or -1 when the
    index page itself could not be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # keep going with the other messages, but say so
                    print("Failed to index message %s" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126      
1127  
1128  #########################################################################
1129  #									#
1130  #          Main code: open the DB, the API XML and analyze it		#
1131  #									#
1132  #########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the word associations.

    t     -- time selecting the month, passed to scanXMLDateArchive()
    force -- passed unchanged to scanXMLDateArchive()

    After the scan, every (word, id, relevance) triple held in the global
    wordsDictArchive is passed to updateWordArchive(); words mapped to
    None are ignored.  Progress is reported on stdout.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1

    # these associations come from the mail archives, not from HTML pages
    print("Found %d associations in archive pages" % (i))
1152  
def analyzeHTMLTop():
    """Index the HTML pages, then store every word association found.

    Runs analyzeHTMLPages() and passes each entry of the global
    wordsDictHTML to updateWordHTML(); words mapped to None are ignored.
    Progress is reported on stdout.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    for word, refs in wordsDictHTML.items():
        if refs == None:
            continue
        for resource, (relevance, id, section) in refs.items():
            updateWordHTML(word, resource, section, id, relevance)
            found = found + 1

    print("Found %d associations in HTML pages" % (found))
1172  
def analyzeAPITop():
    """Load the API description, analyze it and store the associations.

    The document named by the global API is loaded with loadAPI(),
    processed by analyzeAPI() and freed.  Any error in that phase is
    reported on stdout and ends the program with exit status 1.  Every
    (word, symbol) association held in the global wordsDict is then
    passed to updateWord(); words mapped to None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        # sys.exc_type / sys.exc_value are deprecated in favour of exc_info()
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
1200  
def usage():
    """Print the command line synopsis and end the program with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    # what sys.exit(1) does: raise SystemExit carrying the exit status
    raise SystemExit(1)
1204  
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month `spec`, written "<year>-<Month>".

    Failures are reported on stdout and do not propagate.
    """
    try:
        T = time.strptime(spec, "%Y-%B")
        # strptime() yields the first day of the month; the 10 days added
        # presumably keep the time well inside that month once it is
        # converted back with gmtime() in getXMLDateArchive()
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Open the database and run the actions named on the command line.

    Arguments are processed from left to right, so --force only applies
    to the actions which follow it.  An unknown argument, an empty
    command line, or --archive-year / --archive-month without a value
    prints the usage and exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option needs a value: do not die on an IndexError
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
1256  
# Run the indexer only when the file is executed as a script; importing
# it as a module just defines the functions above.
if __name__ == "__main__":
    main()