3 from __future__ import print_function
4 import sys, os, re, difflib, unicodedata, errno, cgi
5 from itertools import *
7 diff_symbols = "-+=*&^%$#@!~/"
8 diff_colors = ['red', 'green', 'blue']
13 if sys.maxunicode < 0x10FFFF:
14 # workarounds for Python 2 "narrow" builds with UCS2-only support.
16 _narrow_unichr = unichr
20 Return the unicode character whose Unicode code is the integer 'i'.
21 The valid range is 0 to 0x10FFFF inclusive.
23 >>> _narrow_unichr(0xFFFF + 1)
24 Traceback (most recent call last):
25 File "<stdin>", line 1, in ?
26 ValueError: unichr() arg not in range(0x10000) (narrow Python build)
27 >>> unichr(0xFFFF + 1) == u'\U00010000'
29 >>> unichr(1114111) == u'\U0010FFFF'
31 >>> unichr(0x10FFFF + 1)
32 Traceback (most recent call last):
33 File "<stdin>", line 1, in ?
34 ValueError: unichr() arg not in range(0x110000)
37 return _narrow_unichr(i)
40 padded_hex_str = hex(i)[2:].zfill(8)
41 escape_str = "\\U" + padded_hex_str
42 return escape_str.decode("unicode-escape")
43 except UnicodeDecodeError:
44 raise ValueError('unichr() arg not in range(0x110000)')
def start_color (c):
    """Return the markup that opens color `c`.

    This formatter emits no markup, so the result is always the empty
    string and `c` is ignored.
    """
    return ''
def end_color ():
    """Return the markup that closes a colored span: always the empty string."""
    return ''
def escape (s):
    """Return `s` unchanged; this formatter needs no escaping."""
    return s
def newline ():
    """Return the line terminator used by this formatter: a bare newline."""
    return '\n'
65 'red': '\033[41;37;1m',
66 'green': '\033[42;37;1m',
67 'blue': '\033[44;37;1m',
def escape (s):
    """Pass `s` through untouched; no characters are escaped."""
    return s
def newline ():
    """Return the end-of-line sequence for this formatter: a single newline."""
    return '\n'
80 return '<span style="background:%s">' % c
def escape (s):
    """Return `s` with `&`, `<` and `>` replaced by HTML entities.

    Quote characters are left untouched, which matches the default
    behaviour of the `cgi.escape` call this replaces.
    """
    # cgi.escape was deprecated in Python 3.2 and removed in 3.8 (the whole
    # cgi module is gone in 3.13).  Prefer html.escape and only fall back to
    # cgi on interpreters that have no html.escape.
    try:
        from html import escape as html_escape
    except ImportError:
        from cgi import escape as html_escape
    return html_escape (s, quote=False)
def newline ():
    """Return the line break for HTML output: a `<br/>` tag plus a newline."""
    return '<br/>\n'
90 def Auto (argv = [], out = sys.stdout):
91 format = ColorFormatter.ANSI
92 if "--format" in argv:
93 argv.remove ("--format")
94 format = ColorFormatter.ANSI
95 if "--format=ansi" in argv:
96 argv.remove ("--format=ansi")
97 format = ColorFormatter.ANSI
98 if "--format=html" in argv:
99 argv.remove ("--format=html")
100 format = ColorFormatter.HTML
101 if "--no-format" in argv:
102 argv.remove ("--no-format")
103 format = ColorFormatter.Null
# Matches a (possibly empty) run of identifier characters followed by at most
# one non-identifier character.  colorize_lines substitutes each match with
# r'\1\n\2\n' so that every token lands on its own line before diffing.
# The character classes must cover both letter cases: the previous pattern
# repeated `a-z` and therefore treated uppercase letters as separators.
diff_regex = re.compile (r'([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')
def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
    """Set up a colorizer.

    formatter -- object providing start_color/end_color/escape/newline,
                 used to render the output.
    colors    -- sequence of color names; entry i is used for lines whose
                 leading symbol is symbols[i].
    symbols   -- string of leading diff symbols, one per compared input.
    """
    self.formatter = formatter
    # colorize_lines reads self.colors, so the argument has to be stored;
    # without this assignment a caller-supplied palette would be lost.
    self.colors = colors
    self.symbols = symbols
116 def colorize_lines (self, lines):
117 lines = (l if l else '' for l in lines)
118 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
121 for l in difflib.Differ().compare (*ss):
127 oo[i] += self.formatter.end_color ()
129 oo = [o + self.formatter.escape (l[2:]) for o in oo]
131 if l[0] in self.symbols:
132 i = self.symbols.index (l[0])
134 oo[i] += self.formatter.start_color (self.colors[i])
136 oo[i] += self.formatter.escape (l[2:])
140 oo[i] += self.formatter.end_color ()
142 oo = [o.replace ('\n', '') for o in oo]
143 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
145 def colorize_diff (self, f):
148 if l[0] not in self.symbols:
149 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
151 i = self.symbols.index (l[0])
154 for line in self.colorize_lines (lines):
160 for line in self.colorize_lines (lines):
165 for line in self.colorize_lines (lines):
172 def diff_files (files, symbols=diff_symbols):
173 files = tuple (files) # in case it's a generator, copy it
175 for lines in izip_longest (*files):
176 if all (lines[0] == line for line in lines[1:]):
177 sys.stdout.writelines ([" ", lines[0]])
180 for i, l in enumerate (lines):
182 sys.stdout.writelines ([symbols[i], l])
184 if e.errno != errno.EPIPE:
185 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
def filter_failures (f):
    """Yield only the lines of `f` that belong to failing test cases.

    Lines are grouped with DiffHelpers.separate_test_cases; a group is
    emitted in full when DiffHelpers.test_passed rejects it and dropped
    otherwise.
    """
    for key, lines in DiffHelpers.separate_test_cases (f):
        # separate_test_cases hands back itertools.groupby groups, which can
        # be iterated only once.  Copy the group so it can be both inspected
        # by test_passed and replayed to the caller.
        lines = list (lines)
        if not DiffHelpers.test_passed (lines):
            for l in lines:
                yield l
204 def add (self, test):
206 self.freq += test.freq
211 self.passed = Stat ()
212 self.failed = Stat ()
215 def add (self, test):
216 self.total.add (test)
218 self.passed.add (test)
220 self.failed.add (test)
223 return float (self.passed.count) / self.total.count
226 return (float (self.passed.count) / self.total.count) * \
227 (float (self.failed.count) / self.total.count)
230 return self.variance () ** .5
def zscore (self, population):
    """Return the standard score of this sample against `population`.

    `self` is the Stats of the sample and `population` is the Stats of the
    whole population.  The result is the distance of the sample mean from
    the population mean, measured in population standard deviations.
    The larger its absolute value, the less likely the sample is random;
    anything outside of -3..+3 is very unlikely to be random.
    See: http://en.wikipedia.org/wiki/Standard_score"""

    deviation = self.mean () - population.mean ()
    return deviation / population.stddev ()
251 # XXX port to Stats, but that would really slow us down here
252 for key, lines in DiffHelpers.separate_test_cases (f):
253 if DiffHelpers.test_passed (lines):
257 total = passed + failed
258 print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total))
261 def print_ngrams (f, ns=(1,2,3)):
262 gens = tuple (Ngram.generator (n) for n in ns)
265 for key, lines in DiffHelpers.separate_test_cases (f):
270 for ngram in gen (test.unicodes):
271 if ngram not in allgrams:
272 allgrams[ngram] = Stats ()
273 allgrams[ngram].add (test)
276 for ngram, stats in allgrams.iteritems ():
277 if stats.failed.count >= 30: # for statistical reasons
278 importantgrams[ngram] = stats
279 allgrams = importantgrams
282 for ngram, stats in allgrams.iteritems ():
283 print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
289 def __init__ (self, lines):
292 self.identifier = None
303 if not self.identifier:
304 self.identifier = l[1:i]
305 i = i + 2 # Skip colon and space
309 brackets = l[i] + l[j]
313 elif brackets == '<>':
314 self.unicodes = Unicode.parse (l)
315 elif brackets == '[]':
316 # XXX we don't handle failed tests here
323 def separate_test_cases (f):
324 '''Reads lines from f, and if the lines have identifiers, ie.
325 have a colon character, groups them by identifier,
326 yielding lists of all lines with the same identifier.'''
330 return l[1:l.index (':')]
332 return groupby (f, key=identifier)
335 def test_passed (lines):
337 # XXX This is a hack, but does the job for now.
338 if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
339 if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
340 if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
341 if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
342 if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
343 if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
344 return all (l[0] == ' ' for l in lines)
350 def filter_printer_function (filter_callback):
352 for line in filter_callback (f):
357 def filter_printer_function_no_newline (filter_callback):
359 for line in filter_callback (f):
360 sys.stdout.writelines ([line])
384 def process_multiple_files (callback, mnemonic = "FILE"):
386 if "--help" in sys.argv:
387 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
391 files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
393 callback (FileHelpers.open_file_or_stdin (s))
395 if e.errno != errno.EPIPE:
396 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
400 def process_multiple_args (callback, mnemonic):
402 if len (sys.argv) == 1 or "--help" in sys.argv:
403 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
407 for s in sys.argv[1:]:
410 if e.errno != errno.EPIPE:
411 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
415 def filter_multiple_strings_or_stdin (callback, mnemonic, \
417 concat_separator = False):
419 if "--help" in sys.argv:
420 print ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
421 % (sys.argv[0], mnemonic, sys.argv[0]))
425 if len (sys.argv) == 1:
427 line = sys.stdin.readline ()
432 print (callback (line))
435 if concat_separator != False:
436 args = [concat_separator.join (args)]
437 print (separator.join (callback (x) for x in (args)))
439 if e.errno != errno.EPIPE:
440 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
448 return u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8')
452 s = re.sub (r"0[xX]", " ", s)
453 s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
454 return [int (x, 16) for x in s.split ()]
458 s = u''.join (unichr (x) for x in Unicode.parse (s))
459 if sys.version_info[0] == 2: s = s.encode ('utf-8')
463 "ZERO WIDTH NON-JOINER": "ZWNJ",
464 "ZERO WIDTH JOINER": "ZWJ",
465 "NARROW NO-BREAK SPACE": "NNBSP",
466 "COMBINING GRAPHEME JOINER": "CGJ",
467 "LEFT-TO-RIGHT MARK": "LRM",
468 "RIGHT-TO-LEFT MARK": "RLM",
469 "LEFT-TO-RIGHT EMBEDDING": "LRE",
470 "RIGHT-TO-LEFT EMBEDDING": "RLE",
471 "POP DIRECTIONAL FORMATTING": "PDF",
472 "LEFT-TO-RIGHT OVERRIDE": "LRO",
473 "RIGHT-TO-LEFT OVERRIDE": "RLO",
479 s = unicodedata.name (u)
482 s = re.sub (".* LETTER ", "", s)
483 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
484 s = re.sub (".* SIGN ", "", s)
485 s = re.sub (".* COMBINING ", "", s)
486 if re.match (".* VIRAMA", s):
488 if s in Unicode.shorthands:
489 s = Unicode.shorthands[s]
def pretty_names (s):
    """Turn a string of hex code points into ' + '-joined pretty names.

    The characters `<`, `+`, `>`, backslash, `u` and `U` and any `0x`/`0X`
    prefixes are blanked out, the remainder is split on commas, spaces and
    newlines, and every non-empty piece is parsed as a hexadecimal code
    point.  Each resulting character is passed to Unicode.pretty_name and
    the names are joined with ' + ' and returned UTF-8 encoded.
    """
    cleaned = re.sub (r"[<+>\\uU]", " ", s)
    cleaned = re.sub (r"0[xX]", " ", cleaned)
    # Convert every token before naming any, so a malformed token fails
    # before Unicode.pretty_name is called at all.
    chars = []
    for token in re.split ('[, \n]', cleaned):
        if len (token):
            chars.append (unichr (int (token, 16)))
    names = [Unicode.pretty_name (ch) for ch in chars]
    return u' + '.join (names).encode ('utf-8')
503 def open_file_or_stdin (f):
512 def read (s, strict = True):
514 if not os.path.exists (s):
516 print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
520 s = os.path.normpath (s)
522 if os.path.isdir (s):
525 m = file (os.path.join (s, "MANIFEST"))
526 items = [x.strip () for x in m.readlines ()]
528 for p in Manifest.read (os.path.join (s, f)):
532 print ("%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST")), file=sys.stderr)
539 def update_recursive (s):
541 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
543 for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
550 ms = os.path.join (dirpath, "MANIFEST")
551 print (" GEN %s" % ms)
558 Manifest.update_recursive (os.path.join (dirpath, f))
560 if __name__ == '__main__':