3 from __future__ import print_function
4 import sys, os, re, difflib, unicodedata, errno, cgi
5 from itertools import *
# Default marker characters for diff lines.  diff_files prefixes a differing
# line taken from the i-th input with diff_symbols[i], and the colorizer maps
# a line's first character back to its index via symbols.index ().
diff_symbols = "-+=*&^%$#@!~/"
# Default color names handed to DiffColorizer; a color is selected with the
# same index as the matching symbol (NOTE(review): the assignment that stores
# this list on the instance is not visible here -- confirm).
diff_colors = ['red', 'green', 'blue']
10 if sys.version_info[0] >= 3:
def start_color (c):
    """Return the markup that starts color `c`: always the empty string."""
    return ''
def end_color ():
    """Return the markup that ends a colored span: always the empty string."""
    return ''
def escape (s):
    """Return `s` unchanged; this variant applies no escaping."""
    return s
def newline ():
    """Return the line terminator for this variant: a plain newline."""
    return '\n'
29 'red': '\033[41;37;1m',
30 'green': '\033[42;37;1m',
31 'blue': '\033[44;37;1m',
def escape (s):
    """Return `s` as-is; no characters are escaped by this variant."""
    return s
def newline ():
    """Return the line terminator for this variant: a single newline."""
    return '\n'
44 return '<span style="background:%s">' % c
def escape (s):
    """Return `s` with the HTML-special characters ``&``, ``<`` and ``>`` escaped.

    ``cgi.escape`` was deprecated in Python 3.2 and removed in Python 3.8,
    so ``html.escape`` is used when it is available.  ``quote=False`` keeps
    the output identical to the historical ``cgi.escape (s)`` call, which
    left quote characters untouched.
    """
    try:
        from html import escape as html_escape
    except ImportError:
        # Python 2 has no ``html`` module; fall back to the legacy helper.
        from cgi import escape as html_escape
    return html_escape (s, quote=False)
def newline ():
    """Return the line terminator for this variant: an HTML break plus a newline."""
    return '<br/>\n'
54 def Auto (argv = [], out = sys.stdout):
55 format = ColorFormatter.ANSI
56 if "--format" in argv:
57 argv.remove ("--format")
58 format = ColorFormatter.ANSI
59 if "--format=ansi" in argv:
60 argv.remove ("--format=ansi")
61 format = ColorFormatter.ANSI
62 if "--format=html" in argv:
63 argv.remove ("--format=html")
64 format = ColorFormatter.HTML
65 if "--no-format" in argv:
66 argv.remove ("--no-format")
67 format = ColorFormatter.Null
# Tokenizer used by colorize_lines: group 1 is a run of ASCII letters, digits
# and underscores, group 2 is the single character (if any) that follows it.
# The ranges must cover A-Z as well: the earlier `a-za-z` spelling repeated the
# lowercase range, so every uppercase letter was treated as a separator and
# identifiers such as "Foo" were split into one token per capital.
diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')
75 def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
76 self.formatter = formatter
78 self.symbols = symbols
80 def colorize_lines (self, lines):
81 lines = (l if l else '' for l in lines)
82 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
85 for l in difflib.Differ().compare (*ss):
91 oo[i] += self.formatter.end_color ()
93 oo = [o + self.formatter.escape (l[2:]) for o in oo]
95 if l[0] in self.symbols:
96 i = self.symbols.index (l[0])
98 oo[i] += self.formatter.start_color (self.colors[i])
100 oo[i] += self.formatter.escape (l[2:])
104 oo[i] += self.formatter.end_color ()
106 oo = [o.replace ('\n', '') for o in oo]
107 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
109 def colorize_diff (self, f):
112 if l[0] not in self.symbols:
113 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
115 i = self.symbols.index (l[0])
118 for line in self.colorize_lines (lines):
124 for line in self.colorize_lines (lines):
129 for line in self.colorize_lines (lines):
136 def diff_files (files, symbols=diff_symbols):
137 files = tuple (files) # in case it's a generator, copy it
139 for lines in izip_longest (*files):
140 if all (lines[0] == line for line in lines[1:]):
141 sys.stdout.writelines ([" ", lines[0]])
144 for i, l in enumerate (lines):
146 sys.stdout.writelines ([symbols[i], l])
148 if e.errno != errno.EPIPE:
149 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
156 def filter_failures (f):
157 for key, lines in DiffHelpers.separate_test_cases (f):
159 if not DiffHelpers.test_passed (lines):
160 for l in lines: yield l
168 def add (self, test):
170 self.freq += test.freq
175 self.passed = Stat ()
176 self.failed = Stat ()
179 def add (self, test):
180 self.total.add (test)
182 self.passed.add (test)
184 self.failed.add (test)
187 return float (self.passed.count) / self.total.count
190 return (float (self.passed.count) / self.total.count) * \
191 (float (self.failed.count) / self.total.count)
194 return self.variance () ** .5
def zscore (self, population):
    """Return the standard score of this sample relative to `population`.

    `population` is the Stats for the population and `self` is the Stats
    for the sample.  The larger the absolute value, the less likely the
    sample is to be random; anything outside of -3..+3 is very unlikely
    to be random.
    See: http://en.wikipedia.org/wiki/Standard_score
    """
    sample_mean = self.mean ()
    population_mean = population.mean ()
    # Distance between the two means, measured in population deviations.
    return (sample_mean - population_mean) / population.stddev ()
215 # XXX port to Stats, but that would really slow us down here
216 for key, lines in DiffHelpers.separate_test_cases (f):
217 if DiffHelpers.test_passed (lines):
221 total = passed + failed
222 print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total))
225 def print_ngrams (f, ns=(1,2,3)):
226 gens = tuple (Ngram.generator (n) for n in ns)
229 for key, lines in DiffHelpers.separate_test_cases (f):
234 for ngram in gen (test.unicodes):
235 if ngram not in allgrams:
236 allgrams[ngram] = Stats ()
237 allgrams[ngram].add (test)
240 for ngram, stats in allgrams.iteritems ():
241 if stats.failed.count >= 30: # for statistical reasons
242 importantgrams[ngram] = stats
243 allgrams = importantgrams
246 for ngram, stats in allgrams.iteritems ():
247 print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
253 def __init__ (self, lines):
256 self.identifier = None
267 if not self.identifier:
268 self.identifier = l[1:i]
269 i = i + 2 # Skip colon and space
273 brackets = l[i] + l[j]
277 elif brackets == '<>':
278 self.unicodes = Unicode.parse (l)
279 elif brackets == '[]':
280 # XXX we don't handle failed tests here
287 def separate_test_cases (f):
288 '''Reads lines from f, and if the lines have identifiers, ie.
289 have a colon character, groups them by identifier,
290 yielding lists of all lines with the same identifier.'''
294 return l[1:l.index (':')]
296 return groupby (f, key=identifier)
299 def test_passed (lines):
301 # XXX This is a hack, but does the job for now.
302 if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
303 if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
304 if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
305 if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
306 if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
307 if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
308 return all (l[0] == ' ' for l in lines)
314 def filter_printer_function (filter_callback):
316 for line in filter_callback (f):
321 def filter_printer_function_no_newline (filter_callback):
323 for line in filter_callback (f):
324 sys.stdout.writelines ([line])
348 def process_multiple_files (callback, mnemonic = "FILE"):
350 if "--help" in sys.argv:
351 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
355 files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
357 callback (FileHelpers.open_file_or_stdin (s))
359 if e.errno != errno.EPIPE:
360 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
364 def process_multiple_args (callback, mnemonic):
366 if len (sys.argv) == 1 or "--help" in sys.argv:
367 print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
371 for s in sys.argv[1:]:
374 if e.errno != errno.EPIPE:
375 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
379 def filter_multiple_strings_or_stdin (callback, mnemonic, \
381 concat_separator = False):
383 if "--help" in sys.argv:
384 print ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
385 % (sys.argv[0], mnemonic, sys.argv[0]))
389 if len (sys.argv) == 1:
391 line = sys.stdin.readline ()
396 print (callback (line))
399 if concat_separator != False:
400 args = [concat_separator.join (args)]
401 print (separator.join (callback (x) for x in (args)))
403 if e.errno != errno.EPIPE:
404 print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
412 return u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8')
416 s = re.sub (r"0[xX]", " ", s)
417 s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
418 return [int (x, 16) for x in s.split ()]
422 s = u''.join (unichr (x) for x in Unicode.parse (s))
423 if sys.version_info[0] == 2: s = s.encode ('utf-8')
427 "ZERO WIDTH NON-JOINER": "ZWNJ",
428 "ZERO WIDTH JOINER": "ZWJ",
429 "NARROW NO-BREAK SPACE": "NNBSP",
430 "COMBINING GRAPHEME JOINER": "CGJ",
431 "LEFT-TO-RIGHT MARK": "LRM",
432 "RIGHT-TO-LEFT MARK": "RLM",
433 "LEFT-TO-RIGHT EMBEDDING": "LRE",
434 "RIGHT-TO-LEFT EMBEDDING": "RLE",
435 "POP DIRECTIONAL FORMATTING": "PDF",
436 "LEFT-TO-RIGHT OVERRIDE": "LRO",
437 "RIGHT-TO-LEFT OVERRIDE": "RLO",
443 s = unicodedata.name (u)
446 s = re.sub (".* LETTER ", "", s)
447 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
448 s = re.sub (".* SIGN ", "", s)
449 s = re.sub (".* COMBINING ", "", s)
450 if re.match (".* VIRAMA", s):
452 if s in Unicode.shorthands:
453 s = Unicode.shorthands[s]
def pretty_names (s):
    """Describe a sequence of hexadecimal code points by their pretty names.

    `<`, `+`, `>`, backslash, `u`, `U` and any `0x`/`0X` prefix in `s` are
    replaced with spaces; the remaining tokens, split on commas, spaces and
    newlines, are parsed as base-16 code points.  Each resulting character
    is passed to Unicode.pretty_name and the names are joined with ' + '.
    The joined text is returned encoded as UTF-8.
    """
    cleaned = re.sub (r"[<+>\\uU]", " ", s)
    cleaned = re.sub (r"0[xX]", " ", cleaned)
    chars = []
    for token in re.split ('[, \n]', cleaned):
        # Consecutive separators produce empty tokens; drop them.
        if len (token):
            chars.append (unichr (int (token, 16)))
    names = (Unicode.pretty_name (c) for c in chars)
    return u' + '.join (names).encode ('utf-8')
467 def open_file_or_stdin (f):
476 def read (s, strict = True):
478 if not os.path.exists (s):
480 print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
484 s = os.path.normpath (s)
486 if os.path.isdir (s):
489 m = file (os.path.join (s, "MANIFEST"))
490 items = [x.strip () for x in m.readlines ()]
492 for p in Manifest.read (os.path.join (s, f)):
496 print ("%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST")), file=sys.stderr)
503 def update_recursive (s):
505 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
507 for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
514 ms = os.path.join (dirpath, "MANIFEST")
515 print (" GEN %s" % ms)
522 Manifest.update_recursive (os.path.join (dirpath, f))
524 if __name__ == '__main__':