3 from __future__ import print_function
4 import sys, os, re, difflib, unicodedata, errno, cgi
5 from itertools import *
# One-character line prefixes.  diff_files writes symbols[i] in front of the
# line taken from the i-th input, and DiffColorizer looks prefixes up here.
diff_symbols = "-+=*&^%$#@!~/"

# Highlight colors.  DiffColorizer hands colors[i] to formatter.start_color
# for text that belongs to the line prefixed with diff_symbols[i].
diff_colors = [
    'red',
    'green',
    'blue',
]
13 if sys.maxunicode < 0x10FFFF:
14 # workarounds for Python 2 "narrow" builds with UCS2-only support.
16 _narrow_unichr = unichr
20 Return the unicode character whose Unicode code is the integer 'i'.
21 The valid range is 0 to 0x10FFFF inclusive.
23 >>> _narrow_unichr(0xFFFF + 1)
24 Traceback (most recent call last):
25 File "<stdin>", line 1, in ?
26 ValueError: unichr() arg not in range(0x10000) (narrow Python build)
27 >>> unichr(0xFFFF + 1) == u'\U00010000'
29 >>> unichr(1114111) == u'\U0010FFFF'
31 >>> unichr(0x10FFFF + 1)
32 Traceback (most recent call last):
33 File "<stdin>", line 1, in ?
34 ValueError: unichr() arg not in range(0x110000)
37 return _narrow_unichr(i)
40 padded_hex_str = hex(i)[2:].zfill(8)
41 escape_str = "\\U" + padded_hex_str
42 return escape_str.decode("unicode-escape")
43 except UnicodeDecodeError:
44 raise ValueError('unichr() arg not in range(0x110000)')
def tounicode(s, encoding='ascii', errors='strict'):
    """Return `s` as a text (unicode) string.

    s        -- byte string to decode, or a value that is already text.
    encoding -- codec used to decode byte strings (default 'ascii').
    errors   -- error handling scheme passed to decode (default 'strict').

    Byte strings are decoded; any other value is returned unchanged.
    Raises UnicodeDecodeError when the bytes are invalid for `encoding`
    and `errors` is 'strict'.
    """
    # `bytes` is spelled the same on Python 2 (where it is `str`) and on
    # Python 3, so this check needs no compatibility alias.
    if isinstance(s, (bytes, bytearray)):
        return s.decode(encoding, errors)
    # Already text: hand it back rather than falling off the end with None.
    return s
def start_color (c):
    """Return an empty string for any color `c` (no opening markup)."""
    return ''
def end_color ():
    """Return an empty string (no closing markup)."""
    return ''
def escape (s):
    """Return `s` exactly as given; no escaping is applied."""
    return s
def newline ():
    """Return the line terminator: a single line feed character."""
    return '\n'
76 'red': '\033[41;37;1m',
77 'green': '\033[42;37;1m',
78 'blue': '\033[44;37;1m',
def escape (s):
    """Return `s` exactly as given; no escaping is applied."""
    return s
def newline ():
    """Return the line terminator: a single line feed character."""
    return '\n'
91 return '<span style="background:%s">' % c
def escape (s):
    """Return `s` with '&', '<' and '>' replaced by HTML entities.

    Quote characters are left untouched, which is what cgi.escape (s) did
    by default.  cgi.escape was deprecated in Python 3.2 and removed in
    Python 3.8, so html.escape is preferred and cgi.escape is only the
    fallback for interpreters that have no html.escape (Python 2).
    """
    try:
        from html import escape as html_escape
    except ImportError:
        from cgi import escape as html_escape
    # Second argument is `quote`; False keeps '"' and "'" as they are.
    return html_escape (s, False)
def newline ():
    """Return the line terminator: a <br/> tag followed by a line feed."""
    return '<br/>\n'
def Auto (argv = [], out = sys.stdout):
    """Pick a ColorFormatter class according to command-line style flags.

    argv -- list of argument strings.  Every recognized flag found in it
            is removed in place (first occurrence only, as list.remove
            does).  The shared default list is empty, so no flag ever
            matches it and it is never modified.
    out  -- accepted but not used by this function.

    Returns ColorFormatter.ANSI when no flag is present; otherwise the
    class selected by the last matching flag in the order listed below.
    """
    # Checked in this order so that later entries override earlier ones.
    choices = (
        ("--format", "ANSI"),
        ("--format=ansi", "ANSI"),
        ("--format=html", "HTML"),
        ("--no-format", "Null"),
    )
    selected = ColorFormatter.ANSI
    for flag, attr in choices:
        if flag in argv:
            argv.remove (flag)
            # Resolve the class lazily, only for flags actually present.
            selected = getattr (ColorFormatter, attr)
    # Hand the choice back; without this the caller would receive None.
    return selected
# Splits text into (word, separator) pairs: a run of word characters followed
# by at most one non-word character.  Both letter cases count as word
# characters, so a mixed-case identifier (e.g. "uni25CC") stays in one token
# instead of being broken at every upper-case letter.
diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')
def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
    """Remember how differences should be rendered.

    formatter -- object providing start_color, end_color, escape and
                 newline, which the colorize methods call.
    colors    -- color names, indexed in parallel with `symbols`.
    symbols   -- one-character line prefixes; a line's first character is
                 looked up in this sequence.
    """
    self.formatter = formatter
    # colorize_lines reads self.colors[i], so the argument must be stored.
    self.colors = colors
    self.symbols = symbols
127 def colorize_lines (self, lines):
128 lines = (l if l else '' for l in lines)
129 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
132 for l in difflib.Differ().compare (*ss):
138 oo[i] += self.formatter.end_color ()
140 oo = [o + self.formatter.escape (l[2:]) for o in oo]
142 if l[0] in self.symbols:
143 i = self.symbols.index (l[0])
145 oo[i] += self.formatter.start_color (self.colors[i])
147 oo[i] += self.formatter.escape (l[2:])
151 oo[i] += self.formatter.end_color ()
153 oo = [o.replace ('\n', '') for o in oo]
154 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
156 def colorize_diff (self, f):
159 if l[0] not in self.symbols:
160 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
162 i = self.symbols.index (l[0])
165 for line in self.colorize_lines (lines):
171 for line in self.colorize_lines (lines):
176 for line in self.colorize_lines (lines):
183 def diff_files (files, symbols=diff_symbols):
184 files = tuple (files) # in case it's a generator, copy it
186 for lines in izip_longest (*files):
187 if all (lines[0] == line for line in lines[1:]):
188 sys.stdout.writelines ([" ", lines[0]])
191 for i, l in enumerate (lines):
193 sys.stdout.writelines ([symbols[i], l])
195 if e.errno != errno.EPIPE:
196 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
def filter_failures (f):
    """Yield the lines of every test case in `f` that did not pass.

    f -- iterable of lines; it is split into test cases by
         DiffHelpers.separate_test_cases and each case is judged by
         DiffHelpers.test_passed.
    """
    for key, lines in DiffHelpers.separate_test_cases (f):
        # Each group is a one-shot itertools.groupby iterator: keep a copy
        # so the lines are still available after test_passed has read them.
        lines = list (lines)
        if not DiffHelpers.test_passed (lines):
            for l in lines:
                yield l
215 def add (self, test):
217 self.freq += test.freq
222 self.passed = Stat ()
223 self.failed = Stat ()
226 def add (self, test):
227 self.total.add (test)
229 self.passed.add (test)
231 self.failed.add (test)
234 return float (self.passed.count) / self.total.count
237 return (float (self.passed.count) / self.total.count) * \
238 (float (self.failed.count) / self.total.count)
241 return self.variance () ** .5
def zscore (self, population):
    """Return the standard score of this sample against `population`.

    `self` is the Stats of the sample and `population` the Stats of the
    whole population.  The result is the difference between the two means
    divided by the population's standard deviation.  A larger absolute
    value means the sample is less likely to be random; anything outside
    of -3..+3 is very unlikely to be random.
    A population whose stddev () is zero makes the division raise
    ZeroDivisionError.
    See: http://en.wikipedia.org/wiki/Standard_score
    """
    sample_mean = self.mean ()
    population_mean = population.mean ()
    deviation = sample_mean - population_mean
    return deviation / population.stddev ()
262 # XXX port to Stats, but that would really slow us down here
263 for key, lines in DiffHelpers.separate_test_cases (f):
264 if DiffHelpers.test_passed (lines):
268 total = passed + failed
269 print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total))
272 def print_ngrams (f, ns=(1,2,3)):
273 gens = tuple (Ngram.generator (n) for n in ns)
276 for key, lines in DiffHelpers.separate_test_cases (f):
281 for ngram in gen (test.unicodes):
282 if ngram not in allgrams:
283 allgrams[ngram] = Stats ()
284 allgrams[ngram].add (test)
287 for ngram, stats in allgrams.iteritems ():
288 if stats.failed.count >= 30: # for statistical reasons
289 importantgrams[ngram] = stats
290 allgrams = importantgrams
293 for ngram, stats in allgrams.iteritems ():
294 print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
300 def __init__ (self, lines):
303 self.identifier = None
314 if not self.identifier:
315 self.identifier = l[1:i]
316 i = i + 2 # Skip colon and space
320 brackets = l[i] + l[j]
324 elif brackets == '<>':
325 self.unicodes = Unicode.parse (l)
326 elif brackets == '[]':
327 # XXX we don't handle failed tests here
334 def separate_test_cases (f):
'''Reads lines from f, and if the lines have identifiers, i.e.
have a colon character, groups consecutive lines by identifier,
yielding (identifier, lines) pairs where lines iterates over one such group.'''
341 return l[1:l.index (':')]
343 return groupby (f, key=identifier)
346 def test_passed (lines):
348 # XXX This is a hack, but does the job for now.
349 if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
350 if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
351 if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
352 if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
353 if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
354 if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
355 return all (l[0] == ' ' for l in lines)
361 def filter_printer_function (filter_callback):
363 for line in filter_callback (f):
368 def filter_printer_function_no_newline (filter_callback):
370 for line in filter_callback (f):
371 sys.stdout.writelines ([line])
395 def process_multiple_files (callback, mnemonic = "FILE"):
397 if "--help" in sys.argv:
398 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
402 files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
404 callback (FileHelpers.open_file_or_stdin (s))
406 if e.errno != errno.EPIPE:
407 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
411 def process_multiple_args (callback, mnemonic):
413 if len (sys.argv) == 1 or "--help" in sys.argv:
414 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
418 for s in sys.argv[1:]:
421 if e.errno != errno.EPIPE:
422 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
426 def filter_multiple_strings_or_stdin (callback, mnemonic, \
428 concat_separator = False):
430 if "--help" in sys.argv:
431 print ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
432 % (sys.argv[0], mnemonic, sys.argv[0]))
436 if len (sys.argv) == 1:
438 line = sys.stdin.readline ()
443 print (callback (line))
446 if concat_separator != False:
447 args = [concat_separator.join (args)]
448 print (separator.join (callback (x) for x in (args)))
450 if e.errno != errno.EPIPE:
451 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
459 return u','.join ("U+%04X" % ord (u) for u in tounicode (s, 'utf-8'))
463 s = re.sub (r"0[xX]", " ", s)
464 s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
465 return [int (x, 16) for x in s.split ()]
469 s = u''.join (unichr (x) for x in Unicode.parse (s))
470 if sys.version_info[0] == 2: s = s.encode ('utf-8')
474 "ZERO WIDTH NON-JOINER": "ZWNJ",
475 "ZERO WIDTH JOINER": "ZWJ",
476 "NARROW NO-BREAK SPACE": "NNBSP",
477 "COMBINING GRAPHEME JOINER": "CGJ",
478 "LEFT-TO-RIGHT MARK": "LRM",
479 "RIGHT-TO-LEFT MARK": "RLM",
480 "LEFT-TO-RIGHT EMBEDDING": "LRE",
481 "RIGHT-TO-LEFT EMBEDDING": "RLE",
482 "POP DIRECTIONAL FORMATTING": "PDF",
483 "LEFT-TO-RIGHT OVERRIDE": "LRO",
484 "RIGHT-TO-LEFT OVERRIDE": "RLO",
490 s = unicodedata.name (u)
493 s = re.sub (".* LETTER ", "", s)
494 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
495 s = re.sub (".* SIGN ", "", s)
496 s = re.sub (".* COMBINING ", "", s)
497 if re.match (".* VIRAMA", s):
499 if s in Unicode.shorthands:
500 s = Unicode.shorthands[s]
def pretty_names (s):
    """Return a ' + '-joined string of short names for the code points in `s`.

    `s` is text holding hexadecimal code point values.  The characters
    '<', '+', '>', backslash, 'u' and 'U', as well as '0x'/'0X' prefixes,
    are replaced by spaces first; the values are then split on commas,
    spaces and newlines and each one is named with Unicode.pretty_name.

    On Python 2 the result is UTF-8 encoded; on Python 3 it is returned as
    str, so printing it does not show a bytes literal.
    Raises ValueError when a value is not valid hexadecimal.
    """
    s = re.sub (r"[<+>\\uU]", " ", s)
    s = re.sub (r"0[xX]", " ", s)
    chars = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if x]
    names = u' + '.join (Unicode.pretty_name (c) for c in chars)
    # Encode only where str means bytes (Python 2) -- the same version check
    # this file applies before encoding other text results.
    if sys.version_info[0] == 2:
        names = names.encode ('utf-8')
    return names
514 def open_file_or_stdin (f):
523 def read (s, strict = True):
525 if not os.path.exists (s):
527 print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
531 s = os.path.normpath (s)
533 if os.path.isdir (s):
536 m = file (os.path.join (s, "MANIFEST"))
537 items = [x.strip () for x in m.readlines ()]
539 for p in Manifest.read (os.path.join (s, f)):
543 print ("%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST")), file=sys.stderr)
550 def update_recursive (s):
552 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
554 for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
561 ms = os.path.join (dirpath, "MANIFEST")
562 print (" GEN %s" % ms)
569 Manifest.update_recursive (os.path.join (dirpath, f))
571 if __name__ == '__main__':