test/shaping/hb_test_tools.py

   1 #!/usr/bin/env python
   2
   3 from __future__ import print_function
   4 import sys, os, re, difflib, unicodedata, errno, cgi
   5 from itertools import *
   6
# Line-prefix characters for diff output: ZipDiffer marks a line taken from
# the i-th input with diff_symbols[i].
diff_symbols = "-+=*&^%$#@!~/"
# Highlight colors: DiffColorizer paints text marked with diff_symbols[i]
# using diff_colors[i].
diff_colors = ['red', 'green', 'blue']
   9
# Make a module-level unichr() available that covers the range 0..0x10FFFF.
try:
        # Rebind the builtin as a module global.  Where no builtin unichr
        # exists, this lookup raises NameError and the fallback at the bottom
        # of this block is used instead.
        unichr = unichr

        if sys.maxunicode < 0x10FFFF:
                # workarounds for Python 2 "narrow" builds with UCS2-only support.

                # Keep a handle on the original function; the wrapper defined
                # below takes over the name unichr.
                _narrow_unichr = unichr

                def unichr(i):
                        """
                        Return the unicode character whose Unicode code is the integer 'i'.
                        The valid range is 0 to 0x10FFFF inclusive.

                        >>> _narrow_unichr(0xFFFF + 1)
                        Traceback (most recent call last):
                          File "<stdin>", line 1, in ?
                        ValueError: unichr() arg not in range(0x10000) (narrow Python build)
                        >>> unichr(0xFFFF + 1) == u'\U00010000'
                        True
                        >>> unichr(1114111) == u'\U0010FFFF'
                        True
                        >>> unichr(0x10FFFF + 1)
                        Traceback (most recent call last):
                          File "<stdin>", line 1, in ?
                        ValueError: unichr() arg not in range(0x110000)
                        """
                        try:
                                return _narrow_unichr(i)
                        except ValueError:
                                try:
                                        # Spell the code point as a "\UXXXXXXXX" escape and
                                        # let the unicode-escape codec build the character.
                                        padded_hex_str = hex(i)[2:].zfill(8)
                                        escape_str = "\\U" + padded_hex_str
                                        return escape_str.decode("unicode-escape")
                                except UnicodeDecodeError:
                                        # Report the failure with the same exception type
                                        # the plain unichr() call raises.
                                        raise ValueError('unichr() arg not in range(0x110000)')

except NameError:
        # No builtin unichr in this interpreter: chr is used in its place.
        unichr = chr
  48
  49 class ColorFormatter:
  50
  51         class Null:
  52                 @staticmethod
  53                 def start_color (c): return ''
  54                 @staticmethod
  55                 def end_color (): return ''
  56                 @staticmethod
  57                 def escape (s): return s
  58                 @staticmethod
  59                 def newline (): return '\n'
  60
  61         class ANSI:
  62                 @staticmethod
  63                 def start_color (c):
  64                         return {
  65                                 'red': '\033[41;37;1m',
  66                                 'green': '\033[42;37;1m',
  67                                 'blue': '\033[44;37;1m',
  68                         }[c]
  69                 @staticmethod
  70                 def end_color ():
  71                         return '\033[m'
  72                 @staticmethod
  73                 def escape (s): return s
  74                 @staticmethod
  75                 def newline (): return '\n'
  76
  77         class HTML:
  78                 @staticmethod
  79                 def start_color (c):
  80                         return '<span style="background:%s">' % c
  81                 @staticmethod
  82                 def end_color ():
  83                         return '</span>'
  84                 @staticmethod
  85                 def escape (s): return cgi.escape (s)
  86                 @staticmethod
  87                 def newline (): return '<br/>\n'
  88
  89         @staticmethod
  90         def Auto (argv = [], out = sys.stdout):
  91                 format = ColorFormatter.ANSI
  92                 if "--format" in argv:
  93                         argv.remove ("--format")
  94                         format = ColorFormatter.ANSI
  95                 if "--format=ansi" in argv:
  96                         argv.remove ("--format=ansi")
  97                         format = ColorFormatter.ANSI
  98                 if "--format=html" in argv:
  99                         argv.remove ("--format=html")
 100                         format = ColorFormatter.HTML
 101                 if "--no-format" in argv:
 102                         argv.remove ("--no-format")
 103                         format = ColorFormatter.Null
 104                 return format
 105
 106
 107 class DiffColorizer:
 108
 109         diff_regex = re.compile ('([a-za-z0-9_]*)([^a-za-z0-9_]?)')
 110
 111         def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
 112                 self.formatter = formatter
 113                 self.colors = colors
 114                 self.symbols = symbols
 115
 116         def colorize_lines (self, lines):
 117                 lines = (l if l else '' for l in lines)
 118                 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
 119                 oo = ["",""]
 120                 st = [False, False]
 121                 for l in difflib.Differ().compare (*ss):
 122                         if l[0] == '?':
 123                                 continue
 124                         if l[0] == ' ':
 125                                 for i in range(2):
 126                                         if st[i]:
 127                                                 oo[i] += self.formatter.end_color ()
 128                                                 st[i] = False
 129                                 oo = [o + self.formatter.escape (l[2:]) for o in oo]
 130                                 continue
 131                         if l[0] in self.symbols:
 132                                 i = self.symbols.index (l[0])
 133                                 if not st[i]:
 134                                         oo[i] += self.formatter.start_color (self.colors[i])
 135                                         st[i] = True
 136                                 oo[i] += self.formatter.escape (l[2:])
 137                                 continue
 138                 for i in range(2):
 139                         if st[i]:
 140                                 oo[i] += self.formatter.end_color ()
 141                                 st[i] = False
 142                 oo = [o.replace ('\n', '') for o in oo]
 143                 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
 144
 145         def colorize_diff (self, f):
 146                 lines = [None, None]
 147                 for l in f:
 148                         if l[0] not in self.symbols:
 149                                 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
 150                                 continue
 151                         i = self.symbols.index (l[0])
 152                         if lines[i]:
 153                                 # Flush
 154                                 for line in self.colorize_lines (lines):
 155                                         yield line
 156                                 lines = [None, None]
 157                         lines[i] = l[1:]
 158                         if (all (lines)):
 159                                 # Flush
 160                                 for line in self.colorize_lines (lines):
 161                                         yield line
 162                                 lines = [None, None]
 163                 if (any (lines)):
 164                         # Flush
 165                         for line in self.colorize_lines (lines):
 166                                 yield line
 167
 168
 169 class ZipDiffer:
 170
 171         @staticmethod
 172         def diff_files (files, symbols=diff_symbols):
 173                 files = tuple (files) # in case it's a generator, copy it
 174                 try:
 175                         for lines in izip_longest (*files):
 176                                 if all (lines[0] == line for line in lines[1:]):
 177                                         sys.stdout.writelines ([" ", lines[0]])
 178                                         continue
 179
 180                                 for i, l in enumerate (lines):
 181                                         if l:
 182                                                 sys.stdout.writelines ([symbols[i], l])
 183                 except IOError as e:
 184                         if e.errno != errno.EPIPE:
 185                                 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
 186                                 sys.exit (1)
 187
 188
 189 class DiffFilters:
 190
 191         @staticmethod
 192         def filter_failures (f):
 193                 for key, lines in DiffHelpers.separate_test_cases (f):
 194                         lines = list (lines)
 195                         if not DiffHelpers.test_passed (lines):
 196                                 for l in lines: yield l
 197
 198 class Stat:
 199
 200         def __init__ (self):
 201                 self.count = 0
 202                 self.freq = 0
 203
 204         def add (self, test):
 205                 self.count += 1
 206                 self.freq += test.freq
 207
 208 class Stats:
 209
 210         def __init__ (self):
 211                 self.passed = Stat ()
 212                 self.failed = Stat ()
 213                 self.total  = Stat ()
 214
 215         def add (self, test):
 216                 self.total.add (test)
 217                 if test.passed:
 218                         self.passed.add (test)
 219                 else:
 220                         self.failed.add (test)
 221
 222         def mean (self):
 223                 return float (self.passed.count) / self.total.count
 224
 225         def variance (self):
 226                 return (float (self.passed.count) / self.total.count) * \
 227                        (float (self.failed.count) / self.total.count)
 228
 229         def stddev (self):
 230                 return self.variance () ** .5
 231
 232         def zscore (self, population):
 233                 """Calculate the standard score.
 234                    Population is the Stats for population.
 235                    Self is Stats for sample.
 236                    Returns larger absolute value if sample is highly unlikely to be random.
 237                    Anything outside of -3..+3 is very unlikely to be random.
 238                    See: http://en.wikipedia.org/wiki/Standard_score"""
 239
 240                 return (self.mean () - population.mean ()) / population.stddev ()
 241
 242
 243
 244
 245 class DiffSinks:
 246
 247         @staticmethod
 248         def print_stat (f):
 249                 passed = 0
 250                 failed = 0
 251                 # XXX port to Stats, but that would really slow us down here
 252                 for key, lines in DiffHelpers.separate_test_cases (f):
 253                         if DiffHelpers.test_passed (lines):
 254                                 passed += 1
 255                         else:
 256                                 failed += 1
 257                 total = passed + failed
 258                 print ("%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total))
 259
 260         @staticmethod
 261         def print_ngrams (f, ns=(1,2,3)):
 262                 gens = tuple (Ngram.generator (n) for n in ns)
 263                 allstats = Stats ()
 264                 allgrams = {}
 265                 for key, lines in DiffHelpers.separate_test_cases (f):
 266                         test = Test (lines)
 267                         allstats.add (test)
 268
 269                         for gen in gens:
 270                                 for ngram in gen (test.unicodes):
 271                                         if ngram not in allgrams:
 272                                                 allgrams[ngram] = Stats ()
 273                                         allgrams[ngram].add (test)
 274
 275                 importantgrams = {}
 276                 for ngram, stats in allgrams.iteritems ():
 277                         if stats.failed.count >= 30: # for statistical reasons
 278                                 importantgrams[ngram] = stats
 279                 allgrams = importantgrams
 280                 del importantgrams
 281
 282                 for ngram, stats in allgrams.iteritems ():
 283                         print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
 284
 285
 286
 287 class Test:
 288
 289         def __init__ (self, lines):
 290                 self.freq = 1
 291                 self.passed = True
 292                 self.identifier = None
 293                 self.text = None
 294                 self.unicodes = None
 295                 self.glyphs = None
 296                 for l in lines:
 297                         symbol = l[0]
 298                         if symbol != ' ':
 299                                 self.passed = False
 300                         i = 1
 301                         if ':' in l:
 302                                 i = l.index (':')
 303                                 if not self.identifier:
 304                                         self.identifier = l[1:i]
 305                                 i = i + 2 # Skip colon and space
 306                         j = -1
 307                         if l[j] == '\n':
 308                                 j -= 1
 309                         brackets = l[i] + l[j]
 310                         l = l[i+1:-2]
 311                         if brackets == '()':
 312                                 self.text = l
 313                         elif brackets == '<>':
 314                                 self.unicodes = Unicode.parse (l)
 315                         elif brackets == '[]':
 316                                 # XXX we don't handle failed tests here
 317                                 self.glyphs = l
 318
 319
 320 class DiffHelpers:
 321
 322         @staticmethod
 323         def separate_test_cases (f):
 324                 '''Reads lines from f, and if the lines have identifiers, ie.
 325                    have a colon character, groups them by identifier,
 326                    yielding lists of all lines with the same identifier.'''
 327
 328                 def identifier (l):
 329                         if ':' in l[1:]:
 330                                 return l[1:l.index (':')]
 331                         return l
 332                 return groupby (f, key=identifier)
 333
 334         @staticmethod
 335         def test_passed (lines):
 336                 lines = list (lines)
 337                 # XXX This is a hack, but does the job for now.
 338                 if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
 339                 if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
 340                 if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
 341                 if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
 342                 if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
 343                 if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
 344                 return all (l[0] == ' ' for l in lines)
 345
 346
 347 class FilterHelpers:
 348
 349         @staticmethod
 350         def filter_printer_function (filter_callback):
 351                 def printer (f):
 352                         for line in filter_callback (f):
 353                                 print (line)
 354                 return printer
 355
 356         @staticmethod
 357         def filter_printer_function_no_newline (filter_callback):
 358                 def printer (f):
 359                         for line in filter_callback (f):
 360                                 sys.stdout.writelines ([line])
 361                 return printer
 362
 363
 364 class Ngram:
 365
 366         @staticmethod
 367         def generator (n):
 368
 369                 def gen (f):
 370                         l = []
 371                         for x in f:
 372                                 l.append (x)
 373                                 if len (l) == n:
 374                                         yield tuple (l)
 375                                         l[:1] = []
 376
 377                 gen.n = n
 378                 return gen
 379
 380
 381 class UtilMains:
 382
 383         @staticmethod
 384         def process_multiple_files (callback, mnemonic = "FILE"):
 385
 386                 if "--help" in sys.argv:
 387                         print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
 388                         sys.exit (1)
 389
 390                 try:
 391                         files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
 392                         for s in files:
 393                                 callback (FileHelpers.open_file_or_stdin (s))
 394                 except IOError as e:
 395                         if e.errno != errno.EPIPE:
 396                                 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
 397                                 sys.exit (1)
 398
 399         @staticmethod
 400         def process_multiple_args (callback, mnemonic):
 401
 402                 if len (sys.argv) == 1 or "--help" in sys.argv:
 403                         print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
 404                         sys.exit (1)
 405
 406                 try:
 407                         for s in sys.argv[1:]:
 408                                 callback (s)
 409                 except IOError as e:
 410                         if e.errno != errno.EPIPE:
 411                                 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
 412                                 sys.exit (1)
 413
 414         @staticmethod
 415         def filter_multiple_strings_or_stdin (callback, mnemonic, \
 416                                               separator = " ", \
 417                                               concat_separator = False):
 418
 419                 if "--help" in sys.argv:
 420                         print ("Usage:\n  %s %s...\nor:\n  %s\n\nWhen called with no arguments, input is read from standard input." \
 421                               % (sys.argv[0], mnemonic, sys.argv[0]))
 422                         sys.exit (1)
 423
 424                 try:
 425                         if len (sys.argv) == 1:
 426                                 while (1):
 427                                         line = sys.stdin.readline ()
 428                                         if not len (line):
 429                                                 break
 430                                         if line[-1] == '\n':
 431                                                 line = line[:-1]
 432                                         print (callback (line))
 433                         else:
 434                                 args = sys.argv[1:]
 435                                 if concat_separator != False:
 436                                         args = [concat_separator.join (args)]
 437                                 print (separator.join (callback (x) for x in (args)))
 438                 except IOError as e:
 439                         if e.errno != errno.EPIPE:
 440                                 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
 441                                 sys.exit (1)
 442
 443
 444 class Unicode:
 445
 446         @staticmethod
 447         def decode (s):
 448                 return u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8')
 449
 450         @staticmethod
 451         def parse (s):
 452                 s = re.sub (r"0[xX]", " ", s)
 453                 s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n    ]", " ", s)
 454                 return [int (x, 16) for x in s.split ()]
 455
 456         @staticmethod
 457         def encode (s):
 458                 s = u''.join (unichr (x) for x in Unicode.parse (s))
 459                 if sys.version_info[0] == 2: s = s.encode ('utf-8')
 460                 return s
 461
 462         shorthands = {
 463                 "ZERO WIDTH NON-JOINER": "ZWNJ",
 464                 "ZERO WIDTH JOINER": "ZWJ",
 465                 "NARROW NO-BREAK SPACE": "NNBSP",
 466                 "COMBINING GRAPHEME JOINER": "CGJ",
 467                 "LEFT-TO-RIGHT MARK": "LRM",
 468                 "RIGHT-TO-LEFT MARK": "RLM",
 469                 "LEFT-TO-RIGHT EMBEDDING": "LRE",
 470                 "RIGHT-TO-LEFT EMBEDDING": "RLE",
 471                 "POP DIRECTIONAL FORMATTING": "PDF",
 472                 "LEFT-TO-RIGHT OVERRIDE": "LRO",
 473                 "RIGHT-TO-LEFT OVERRIDE": "RLO",
 474         }
 475
 476         @staticmethod
 477         def pretty_name (u):
 478                 try:
 479                         s = unicodedata.name (u)
 480                 except ValueError:
 481                         return "XXX"
 482                 s = re.sub (".* LETTER ", "", s)
 483                 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
 484                 s = re.sub (".* SIGN ", "", s)
 485                 s = re.sub (".* COMBINING ", "", s)
 486                 if re.match (".* VIRAMA", s):
 487                         s = "HALANT"
 488                 if s in Unicode.shorthands:
 489                         s = Unicode.shorthands[s]
 490                 return s
 491
 492         @staticmethod
 493         def pretty_names (s):
 494                 s = re.sub (r"[<+>\\uU]", " ", s)
 495                 s = re.sub (r"0[xX]", " ", s)
 496                 s = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
 497                 return u' + '.join (Unicode.pretty_name (x) for x in s).encode ('utf-8')
 498
 499
 500 class FileHelpers:
 501
 502         @staticmethod
 503         def open_file_or_stdin (f):
 504                 if f == '-':
 505                         return sys.stdin
 506                 return file (f)
 507
 508
 509 class Manifest:
 510
 511         @staticmethod
 512         def read (s, strict = True):
 513
 514                 if not os.path.exists (s):
 515                         if strict:
 516                                 print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
 517                                 sys.exit (1)
 518                         return
 519
 520                 s = os.path.normpath (s)
 521
 522                 if os.path.isdir (s):
 523
 524                         try:
 525                                 m = file (os.path.join (s, "MANIFEST"))
 526                                 items = [x.strip () for x in m.readlines ()]
 527                                 for f in items:
 528                                         for p in Manifest.read (os.path.join (s, f)):
 529                                                 yield p
 530                         except IOError:
 531                                 if strict:
 532                                         print ("%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST")), file=sys.stderr)
 533                                         sys.exit (1)
 534                                 return
 535                 else:
 536                         yield s
 537
 538         @staticmethod
 539         def update_recursive (s):
 540
 541                 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
 542
 543                         for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
 544                                 if f in dirnames:
 545                                         dirnames.remove (f)
 546                                 if f in filenames:
 547                                         filenames.remove (f)
 548                         dirnames.sort ()
 549                         filenames.sort ()
 550                         ms = os.path.join (dirpath, "MANIFEST")
 551                         print ("  GEN    %s" % ms)
 552                         m = open (ms, "w")
 553                         for f in filenames:
 554                                 print (f, file=m)
 555                         for f in dirnames:
 556                                 print (f, file=m)
 557                         for f in dirnames:
 558                                 Manifest.update_recursive (os.path.join (dirpath, f))
 559
# This module only provides helpers for other scripts; running it directly
# does nothing.
if __name__ == '__main__':
        pass