genUnicode.py

   1 #!/usr/bin/python -u
   2 #
   3 # Original script modified in November 2003 to take advantage of
   4 # the character-validation range routines, and updated to the
   5 # current Unicode information (Version 4.0.1)
   6 #
   7 # NOTE: there is an 'alias' facility for blocks which are not present in
   8 #       the current release, but are needed for ABI compatibility.  This
   9 #       must be accomplished MANUALLY!  Please see the comments below under
  10 #     'blockAliases'
  11 #
  12 import sys
  13 import string
  14 import time
  15
  16 webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
  17 sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
  18
  19 #
  20 # blockAliases is a small hack - it is used for mapping block names which
  21 # were were used in the 3.1 release, but are missing or changed in the current
  22 # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
  23 blockAliases = []
  24 blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
  25 blockAliases.append("Greek:GreekandCoptic")
  26 blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
  27         "SupplementaryPrivateUseArea-B")
  28
  29 # minTableSize gives the minimum number of ranges which must be present
  30 # before a range table is produced.  If there are less than this
  31 # number, inline comparisons are generated
  32 minTableSize = 8
  33
  34 (blockfile, catfile) = string.split(sources)
  35
  36
  37 #
  38 # Now process the "blocks" file, reducing it to a dictionary
  39 # indexed by blockname, containing a tuple with the applicable
  40 # block range
  41 #
  42 BlockNames = {}
  43 try:
  44     blocks = open(blockfile, "r")
  45 except:
  46     print "Missing %s, aborting ..." % blockfile
  47     sys.exit(1)
  48
  49 for line in blocks.readlines():
  50     if line[0] == '#':
  51         continue
  52     line = string.strip(line)
  53     if line == '':
  54         continue
  55     try:
  56         fields = string.split(line, ';')
  57         range = string.strip(fields[0])
  58         (start, end) = string.split(range, "..")
  59         name = string.strip(fields[1])
  60         name = string.replace(name, ' ', '')
  61     except:
  62         print "Failed to process line: %s" % (line)
  63         continue
  64     start = "0x" + start
  65     end = "0x" + end
  66     try:
  67         BlockNames[name].append((start, end))
  68     except:
  69         BlockNames[name] = [(start, end)]
  70 blocks.close()
  71 print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
  72
  73 for block in blockAliases:
  74     alias = string.split(block,':')
  75     alist = string.split(alias[1],',')
  76     for comp in alist:
  77         if BlockNames.has_key(comp):
  78             if alias[0] not in BlockNames:
  79                 BlockNames[alias[0]] = []
  80             for r in BlockNames[comp]:
  81                 BlockNames[alias[0]].append(r)
  82         else:
  83             print "Alias %s: %s not in Blocks" % (alias[0], comp)
  84             continue
  85
  86 #
  87 # Next process the Categories file. This is more complex, since
  88 # the file is in code sequence, and we need to invert it.  We use
  89 # a dictionary with index category-name, with each entry containing
  90 # all the ranges (codepoints) of that category.  Note that category
  91 # names comprise two parts - the general category, and the "subclass"
  92 # within that category.  Therefore, both "general category" (which is
  93 # the first character of the 2-character category-name) and the full
  94 # (2-character) name are entered into this dictionary.
  95 #
  96 try:
  97     data = open(catfile, "r")
  98 except:
  99     print "Missing %s, aborting ..." % catfile
 100     sys.exit(1)
 101
 102 nbchar = 0;
 103 Categories = {}
 104 for line in data.readlines():
 105     if line[0] == '#':
 106         continue
 107     line = string.strip(line)
 108     if line == '':
 109         continue
 110     try:
 111         fields = string.split(line, ';')
 112         point = string.strip(fields[0])
 113         value = 0
 114         while point != '':
 115             value = value * 16
 116             if point[0] >= '0' and point[0] <= '9':
 117                 value = value + ord(point[0]) - ord('0')
 118             elif point[0] >= 'A' and point[0] <= 'F':
 119                 value = value + 10 + ord(point[0]) - ord('A')
 120             elif point[0] >= 'a' and point[0] <= 'f':
 121                 value = value + 10 + ord(point[0]) - ord('a')
 122             point = point[1:]
 123         name = fields[2]
 124     except:
 125         print "Failed to process line: %s" % (line)
 126         continue
 127
 128     nbchar = nbchar + 1
 129     # update entry for "full name"
 130     try:
 131         Categories[name].append(value)
 132     except:
 133         try:
 134             Categories[name] = [value]
 135         except:
 136             print "Failed to process line: %s" % (line)
 137     # update "general category" name
 138     try:
 139         Categories[name[0]].append(value)
 140     except:
 141         try:
 142             Categories[name[0]] = [value]
 143         except:
 144             print "Failed to process line: %s" % (line)
 145
 146 blocks.close()
 147 print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
 148
 149 #
 150 # The data is now all read.  Time to process it into a more useful form.
 151 #
 152 # reduce the number list into ranges
 153 for cat in Categories.keys():
 154     list = Categories[cat]
 155     start = -1
 156     prev = -1
 157     end = -1
 158     ranges = []
 159     for val in list:
 160         if start == -1:
 161             start = val
 162             prev = val
 163             continue
 164         elif val == prev + 1:
 165             prev = val
 166             continue
 167         elif prev == start:
 168             ranges.append((prev, prev))
 169             start = val
 170             prev = val
 171             continue
 172         else:
 173             ranges.append((start, prev))
 174             start = val
 175             prev = val
 176             continue
 177     if prev == start:
 178         ranges.append((prev, prev))
 179     else:
 180         ranges.append((start, prev))
 181     Categories[cat] = ranges
 182
 183 #
 184 # Assure all data is in alphabetic order, since we will be doing binary
 185 # searches on the tables.
 186 #
 187 bkeys = BlockNames.keys()
 188 bkeys.sort()
 189
 190 ckeys = Categories.keys()
 191 ckeys.sort()
 192
 193 #
 194 # Generate the resulting files
 195 #
 196 try:
 197     header = open("include/libxml/xmlunicode.h", "w")
 198 except:
 199     print "Failed to open include/libxml/xmlunicode.h"
 200     sys.exit(1)
 201
 202 try:
 203     output = open("xmlunicode.c", "w")
 204 except:
 205     print "Failed to open xmlunicode.c"
 206     sys.exit(1)
 207
 208 date = time.asctime(time.localtime(time.time()))
 209
 210 header.write(
 211 """/*
 212  * Summary: Unicode character APIs
 213  * Description: API for the Unicode character APIs
 214  *
 215  * This file is automatically generated from the
 216  * UCS description files of the Unicode Character Database
 217  * %s
 218  * using the genUnicode.py Python script.
 219  *
 220  * Generation date: %s
 221  * Sources: %s
 222  * Author: Daniel Veillard
 223  */
 224
 225 #ifndef __XML_UNICODE_H__
 226 #define __XML_UNICODE_H__
 227
 228 #include <libxml/xmlversion.h>
 229
 230 #ifdef LIBXML_UNICODE_ENABLED
 231
 232 #ifdef __cplusplus
 233 extern "C" {
 234 #endif
 235
 236 """ % (webpage, date, sources));
 237
 238 output.write(
 239 """/*
 240  * xmlunicode.c: this module implements the Unicode character APIs
 241  *
 242  * This file is automatically generated from the
 243  * UCS description files of the Unicode Character Database
 244  * %s
 245  * using the genUnicode.py Python script.
 246  *
 247  * Generation date: %s
 248  * Sources: %s
 249  * Daniel Veillard <veillard@redhat.com>
 250  */
 251
 252 #define IN_LIBXML
 253 #include "libxml.h"
 254
 255 #ifdef LIBXML_UNICODE_ENABLED
 256
 257 #include <string.h>
 258 #include <libxml/xmlversion.h>
 259 #include <libxml/xmlunicode.h>
 260 #include <libxml/chvalid.h>
 261
 262 typedef int (xmlIntFunc)(int);  /* just to keep one's mind untwisted */
 263
 264 typedef struct {
 265     const char *rangename;
 266     xmlIntFunc *func;
 267 } xmlUnicodeRange;
 268
 269 typedef struct {
 270     const xmlUnicodeRange *table;
 271     int             numentries;
 272 } xmlUnicodeNameTable;
 273
 274
 275 static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
 276
 277 static const xmlUnicodeRange xmlUnicodeBlocks[] = {
 278 """ % (webpage, date, sources));
 279
 280 flag = 0
 281 for block in bkeys:
 282     name = string.replace(block, '-', '')
 283     if flag:
 284         output.write(',\n')
 285     else:
 286         flag = 1
 287     output.write('  {"%s", xmlUCSIs%s}' % (block, name))
 288 output.write('};\n\n')
 289
 290 output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
 291 flag = 0;
 292 for name in ckeys:
 293     if flag:
 294         output.write(',\n')
 295     else:
 296         flag = 1
 297     output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
 298 output.write('};\n\n')
 299
 300 #
 301 # For any categories with more than minTableSize ranges we generate
 302 # a range table suitable for xmlCharInRange
 303 #
 304 for name in ckeys:
 305   if len(Categories[name]) > minTableSize:
 306     numshort = 0
 307     numlong = 0
 308     ranges = Categories[name]
 309     sptr = "NULL"
 310     lptr = "NULL"
 311     for range in ranges:
 312       (low, high) = range
 313       if high < 0x10000:
 314         if numshort == 0:
 315           pline = "static const xmlChSRange xml%sS[] = {" % name
 316           sptr = "xml%sS" % name
 317         else:
 318           pline += ", "
 319         numshort += 1
 320       else:
 321         if numlong == 0:
 322           if numshort > 0:
 323             output.write(pline + " };\n")
 324           pline = "static const xmlChLRange xml%sL[] = {" % name
 325           lptr = "xml%sL" % name
 326         else:
 327           pline += ", "
 328         numlong += 1
 329       if len(pline) > 60:
 330         output.write(pline + "\n")
 331         pline = "    "
 332       pline += "{%s, %s}" % (hex(low), hex(high))
 333     output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
 334          % (name, numshort, numlong, sptr, lptr))
 335
 336
 337 output.write(
 338 """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
 339 static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
 340
 341 /**
 342  * xmlUnicodeLookup:
 343  * @tptr: pointer to the name table
 344  * @name: name to be found
 345  *
 346  * binary table lookup for user-supplied name
 347  *
 348  * Returns pointer to range function if found, otherwise NULL
 349  */
 350 static xmlIntFunc
 351 *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
 352     int low, high, mid, cmp;
 353     xmlUnicodeRange *sptr;
 354
 355     if ((tptr == NULL) || (tname == NULL)) return(NULL);
 356
 357     low = 0;
 358     high = tptr->numentries - 1;
 359     sptr = tptr->table;
 360     while (low <= high) {
 361         mid = (low + high) / 2;
 362         if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
 363             return (sptr[mid].func);
 364         if (cmp < 0)
 365             high = mid - 1;
 366         else
 367             low = mid + 1;
 368     }
 369     return (NULL);
 370 }
 371
 372 """ % (len(BlockNames), len(Categories)) )
 373
 374 for block in bkeys:
 375     name = string.replace(block, '-', '')
 376     header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
 377     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
 378     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
 379                  (block))
 380     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
 381     output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
 382     flag = 0
 383     for (start, end) in BlockNames[block]:
 384         if flag:
 385             output.write(" ||\n           ")
 386         else:
 387             flag = 1
 388         output.write("((code >= %s) && (code <= %s))" % (start, end))
 389     output.write(");\n}\n\n")
 390
 391 header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
 392 output.write(
 393 """/**
 394  * xmlUCSIsBlock:
 395  * @code: UCS code point
 396  * @block: UCS block name
 397  *
 398  * Check whether the character is part of the UCS Block
 399  *
 400  * Returns 1 if true, 0 if false and -1 on unknown block
 401  */
 402 int
 403 xmlUCSIsBlock(int code, const char *block) {
 404     xmlIntFunc *func;
 405
 406     func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
 407     if (func == NULL)
 408         return (-1);
 409     return (func(code));
 410 }
 411
 412 """)
 413
 414 for name in ckeys:
 415     ranges = Categories[name]
 416     header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
 417     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
 418     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
 419                  (name))
 420     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
 421     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
 422     if len(Categories[name]) > minTableSize:
 423         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
 424             % name)
 425     else:
 426         start = 1
 427         for range in ranges:
 428             (begin, end) = range;
 429             if start:
 430                 output.write("    return(");
 431                 start = 0
 432             else:
 433                 output.write(" ||\n           ");
 434             if (begin == end):
 435                 output.write("(code == %s)" % (hex(begin)))
 436             else:
 437                 output.write("((code >= %s) && (code <= %s))" % (
 438                          hex(begin), hex(end)))
 439     output.write(");\n}\n\n")
 440
 441 header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
 442 output.write(
 443 """/**
 444  * xmlUCSIsCat:
 445  * @code: UCS code point
 446  * @cat: UCS Category name
 447  *
 448  * Check whether the character is part of the UCS Category
 449  *
 450  * Returns 1 if true, 0 if false and -1 on unknown category
 451  */
 452 int
 453 xmlUCSIsCat(int code, const char *cat) {
 454     xmlIntFunc *func;
 455
 456     func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
 457     if (func == NULL)
 458         return (-1);
 459     return (func(code));
 460 }
 461
 462 #define bottom_xmlunicode
 463 #include "elfgcchack.h"
 464 #endif /* LIBXML_UNICODE_ENABLED */
 465 """)
 466
 467 header.write("""
 468 #ifdef __cplusplus
 469 }
 470 #endif
 471
 472 #endif /* LIBXML_UNICODE_ENABLED */
 473
 474 #endif /* __XML_UNICODE_H__ */
 475 """);
 476
 477 header.close()
 478 output.close()