3 import sys, os, re, difflib, unicodedata, errno, cgi
4 from itertools import *
# Single-character line markers.  Code in this file uses a marker's position
# in the string as an index (symbols[i], symbols.index (l[0])), so the order
# of the characters is significant.
diff_symbols = "-+=*&^%$#@!~/"
# Colour names used as DiffColorizer's default `colors`; an entry is picked
# by index and handed to the formatter's start_color ().
# NOTE(review): there are fewer colours than markers -- presumably only the
# first few markers are ever colorized; confirm against callers.
diff_colors = [
    'red',
    'green',
    'blue',
]
def start_color (c):
    """Return the text that starts highlighting in colour `c`.

    This variant produces no markup at all, so the colour is ignored and
    the empty string is returned."""
    return ''
def end_color ():
    """Return the text that stops highlighting; this variant emits nothing."""
    return ''
def escape (s):
    """Return `s` unchanged: this output variant needs no escaping."""
    return s
def newline ():
    """Return the line terminator for this output variant: a bare '\\n'."""
    return '\n'
25 'red': '\033[41;37;1m',
26 'green': '\033[42;37;1m',
27 'blue': '\033[44;37;1m',
def escape (s):
    """Pass `s` through untouched; no characters are escaped here."""
    return s
def newline ():
    """Return the end-of-line text, which is a plain '\\n'."""
    return '\n'
40 return '<span style="background:%s">' % c
def escape (s):
    """Return `s` with '&', '<' and '>' replaced by HTML entities.

    Quote characters are left alone, matching what cgi.escape (s) does
    with its default arguments."""
    try:
        # Python 2 and Python 3 before 3.8 still provide cgi.escape.
        from cgi import escape as escape_markup
    except ImportError:
        # cgi.escape was removed in Python 3.8 (and the cgi module itself
        # in 3.13); html.escape with quote=False gives the same output.
        from html import escape as escape_markup
        return escape_markup (s, quote=False)
    return escape_markup (s)
def newline ():
    """Return the end-of-line text for HTML output: a <br/> tag plus '\\n'."""
    return '<br/>\n'
50 def Auto (argv = [], out = sys.stdout):
51 format = ColorFormatter.ANSI
52 if "--format" in argv:
53 argv.remove ("--format")
54 format = ColorFormatter.ANSI
55 if "--format=ansi" in argv:
56 argv.remove ("--format=ansi")
57 format = ColorFormatter.ANSI
58 if "--format=html" in argv:
59 argv.remove ("--format=html")
60 format = ColorFormatter.HTML
61 if "--no-format" in argv:
62 argv.remove ("--no-format")
63 format = ColorFormatter.Null
# Tokenizer used when colorizing lines: each match is a (possibly empty) run
# of word characters followed by at most one non-word character.  Both letter
# cases are listed in the character classes so that a mixed-case word stays
# in a single token instead of being split at every uppercase letter.
diff_regex = re.compile (r'([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')
71 def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
72 self.formatter = formatter
74 self.symbols = symbols
76 def colorize_lines (self, lines):
77 lines = (l if l else '' for l in lines)
78 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
81 for l in difflib.Differ().compare (*ss):
87 oo[i] += self.formatter.end_color ()
89 oo = [o + self.formatter.escape (l[2:]) for o in oo]
91 if l[0] in self.symbols:
92 i = self.symbols.index (l[0])
94 oo[i] += self.formatter.start_color (self.colors[i])
96 oo[i] += self.formatter.escape (l[2:])
100 oo[i] += self.formatter.end_color ()
102 oo = [o.replace ('\n', '') for o in oo]
103 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
105 def colorize_diff (self, f):
108 if l[0] not in self.symbols:
109 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
111 i = self.symbols.index (l[0])
114 for line in self.colorize_lines (lines):
120 for line in self.colorize_lines (lines):
125 for line in self.colorize_lines (lines):
132 def diff_files (files, symbols=diff_symbols):
133 files = tuple (files) # in case it's a generator, copy it
135 for lines in izip_longest (*files):
136 if all (lines[0] == line for line in lines[1:]):
137 sys.stdout.writelines ([" ", lines[0]])
140 for i, l in enumerate (lines):
142 sys.stdout.writelines ([symbols[i], l])
144 if e.errno != errno.EPIPE:
145 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
152 def filter_failures (f):
153 for key, lines in DiffHelpers.separate_test_cases (f):
155 if not DiffHelpers.test_passed (lines):
156 for l in lines: yield l
164 def add (self, test):
166 self.freq += test.freq
171 self.passed = Stat ()
172 self.failed = Stat ()
175 def add (self, test):
176 self.total.add (test)
178 self.passed.add (test)
180 self.failed.add (test)
183 return float (self.passed.count) / self.total.count
186 return (float (self.passed.count) / self.total.count) * \
187 (float (self.failed.count) / self.total.count)
190 return self.variance () ** .5
def zscore (self, population):
    """Return the standard score of this sample against a population.

    `population` is the Stats for the population; `self` is the Stats
    for the sample.  The larger the absolute value of the result, the
    less likely the sample is to be random; anything outside -3..+3 is
    very unlikely to be random.
    See: http://en.wikipedia.org/wiki/Standard_score"""
    sample_mean = self.mean ()
    # Distance of the sample mean from the population mean, expressed in
    # units of the population's standard deviation.
    offset = sample_mean - population.mean ()
    return offset / population.stddev ()
211 # XXX port to Stats, but that would really slow us down here
212 for key, lines in DiffHelpers.separate_test_cases (f):
213 if DiffHelpers.test_passed (lines):
217 total = passed + failed
218 print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
221 def print_ngrams (f, ns=(1,2,3)):
222 gens = tuple (Ngram.generator (n) for n in ns)
225 for key, lines in DiffHelpers.separate_test_cases (f):
230 for ngram in gen (test.unicodes):
231 if ngram not in allgrams:
232 allgrams[ngram] = Stats ()
233 allgrams[ngram].add (test)
236 for ngram, stats in allgrams.iteritems ():
237 if stats.failed.count >= 30: # for statistical reasons
238 importantgrams[ngram] = stats
239 allgrams = importantgrams
242 for ngram, stats in allgrams.iteritems ():
243 print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
249 def __init__ (self, lines):
252 self.identifier = None
263 if not self.identifier:
264 self.identifier = l[1:i]
265 i = i + 2 # Skip colon and space
269 brackets = l[i] + l[j]
273 elif brackets == '<>':
274 self.unicodes = Unicode.parse (l)
275 elif brackets == '[]':
276 # XXX we don't handle failed tests here
283 def separate_test_cases (f):
284 '''Reads lines from f, and if the lines have identifiers, ie.
285 have a colon character, groups them by identifier,
286 yielding lists of all lines with the same identifier.'''
290 return l[1:l.index (':')]
292 return groupby (f, key=identifier)
295 def test_passed (lines):
297 # XXX This is a hack, but does the job for now.
298 if any (l.find("space|space") >= 0 for l in lines): return True
299 if any (l.find("uni25CC") >= 0 for l in lines): return True
300 if any (l.find("dottedcircle") >= 0 for l in lines): return True
301 return all (l[0] == ' ' for l in lines)
307 def filter_printer_function (filter_callback):
309 for line in filter_callback (f):
314 def filter_printer_function_no_newline (filter_callback):
316 for line in filter_callback (f):
317 sys.stdout.writelines ([line])
341 def process_multiple_files (callback, mnemonic = "FILE"):
343 if "--help" in sys.argv:
344 print "Usage: %s %s..." % (sys.argv[0], mnemonic)
348 files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
350 callback (FileHelpers.open_file_or_stdin (s))
352 if e.errno != errno.EPIPE:
353 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
357 def process_multiple_args (callback, mnemonic):
359 if len (sys.argv) == 1 or "--help" in sys.argv:
360 print "Usage: %s %s..." % (sys.argv[0], mnemonic)
364 for s in sys.argv[1:]:
367 if e.errno != errno.EPIPE:
368 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
372 def filter_multiple_strings_or_stdin (callback, mnemonic, \
374 concat_separator = False):
376 if "--help" in sys.argv:
377 print "Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
378 % (sys.argv[0], mnemonic, sys.argv[0])
382 if len (sys.argv) == 1:
384 line = sys.stdin.readline ()
389 print callback (line)
392 if concat_separator != False:
393 args = [concat_separator.join (args)]
394 print separator.join (callback (x) for x in (args))
396 if e.errno != errno.EPIPE:
397 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
405 return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
409 s = re.sub (r"[<+>,\\uU\n ]", " ", s)
410 s = re.sub (r"0[xX]", " ", s)
411 return [int (x, 16) for x in s.split (' ') if len (x)]
415 return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
418 "ZERO WIDTH NON-JOINER": "ZWNJ",
419 "ZERO WIDTH JOINER": "ZWJ",
420 "NARROW NO-BREAK SPACE": "NNBSP",
421 "COMBINING GRAPHEME JOINER": "CGJ",
422 "LEFT-TO-RIGHT MARK": "LRM",
423 "RIGHT-TO-LEFT MARK": "RLM",
424 "LEFT-TO-RIGHT EMBEDDING": "LRE",
425 "RIGHT-TO-LEFT EMBEDDING": "RLE",
426 "POP DIRECTIONAL FORMATTING": "PDF",
427 "LEFT-TO-RIGHT OVERRIDE": "LRO",
428 "RIGHT-TO-LEFT OVERRIDE": "RLO",
434 s = unicodedata.name (u)
437 s = re.sub (".* LETTER ", "", s)
438 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
439 s = re.sub (".* SIGN ", "", s)
440 s = re.sub (".* COMBINING ", "", s)
441 if re.match (".* VIRAMA", s):
443 if s in Unicode.shorthands:
444 s = Unicode.shorthands[s]
def pretty_names (s):
    """Describe the code points listed in `s` by name.

    `s` is text containing hexadecimal code point values, optionally
    decorated with '<', '>', '+', '\\', 'u', 'U' or a '0x'/'0X' prefix,
    and separated by commas, spaces or newlines.  Returns the result of
    Unicode.pretty_name () for each code point, joined with ' + ' and
    encoded as UTF-8."""
    # Blank out the decoration so that only the hex digits remain.
    cleaned = re.sub (r"0[xX]", " ", re.sub (r"[<+>\\uU]", " ", s))
    characters = []
    for token in re.split ('[, \n]', cleaned):
        # Consecutive separators yield empty tokens; drop them.
        if len (token):
            characters.append (unichr (int (token, 16)))
    joined = u' + '.join (Unicode.pretty_name (ch) for ch in characters)
    return joined.encode ('utf-8')
458 def open_file_or_stdin (f):
467 def read (s, strict = True):
469 if not os.path.exists (s):
471 print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], s)
475 s = os.path.normpath (s)
477 if os.path.isdir (s):
480 m = file (os.path.join (s, "MANIFEST"))
481 items = [x.strip () for x in m.readlines ()]
483 for p in Manifest.read (os.path.join (s, f)):
487 print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST"))
494 def update_recursive (s):
496 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
498 for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
505 ms = os.path.join (dirpath, "MANIFEST")
513 Manifest.update_recursive (os.path.join (dirpath, f))
515 if __name__ == '__main__':