3 import sys, os, re, difflib, unicodedata, errno, cgi
4 from itertools import *
# Marker characters used as line prefixes in multi-file diffs: when several
# inputs are compared, a line coming from the i-th input is written with the
# i-th symbol in front of it.
diff_symbols = "-+=*&^%$#@!~/"
# Color names handed to the formatter's start_color(); indexed in parallel
# with diff_symbols (the position of a line's symbol selects its color).
diff_colors = ['red', 'green', 'blue']
def start_color (c):
    """Return the text emitted before a span colored `c`.

    This formatter produces no markup, so the result is always the empty
    string regardless of `c`.
    """
    return ''
def end_color ():
    """Return the text emitted after a colored span; always empty here."""
    return ''
def escape (s):
    """Return `s` unchanged: this output format needs no escaping."""
    return s
def newline ():
    """Return the line terminator for this output format: a bare newline."""
    return '\n'
25 'red': '\033[41;37;1m',
26 'green': '\033[42;37;1m',
27 'blue': '\033[44;37;1m',
def escape (s):
    """Return `s` as-is; text is passed through without any escaping."""
    return s
def newline ():
    """Return the string that ends an output line: a single newline."""
    return '\n'
40 return '<span style="background:%s">' % c
45 def escape (s): return cgi.escape (s)
def newline ():
    """Return the line terminator for HTML output.

    An explicit <br/> element is followed by a newline character so the
    generated markup itself stays line-oriented.
    """
    return '<br/>\n'
50 def Auto (argv = [], out = sys.stdout):
51 format = ColorFormatter.ANSI
52 if "--format" in argv:
53 argv.remove ("--format")
54 format = ColorFormatter.ANSI
55 if "--format=ansi" in argv:
56 argv.remove ("--format=ansi")
57 format = ColorFormatter.ANSI
58 if "--format=html" in argv:
59 argv.remove ("--format=html")
60 format = ColorFormatter.HTML
61 if "--no-format" in argv:
62 argv.remove ("--no-format")
63 format = ColorFormatter.Null
# Tokenizer used to split a line into pieces before diffing: group 1 is a
# (possibly empty) run of word characters, group 2 is at most one following
# non-word character.
# Both classes must list A-Z explicitly.  Without it capitals count as
# separators, so an identifier such as "uni25CC" is cut at every uppercase
# letter instead of being compared as one token.
diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')
71 def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
72 self.formatter = formatter
74 self.symbols = symbols
76 def colorize_lines (self, lines):
77 lines = (l if l else '' for l in lines)
78 ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
81 for l in difflib.Differ().compare (*ss):
87 oo[i] += self.formatter.end_color ()
89 oo = [o + self.formatter.escape (l[2:]) for o in oo]
91 if l[0] in self.symbols:
92 i = self.symbols.index (l[0])
94 oo[i] += self.formatter.start_color (self.colors[i])
96 oo[i] += self.formatter.escape (l[2:])
100 oo[i] += self.formatter.end_color ()
102 oo = [o.replace ('\n', '') for o in oo]
103 return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
105 def colorize_diff (self, f):
108 if l[0] not in self.symbols:
109 yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
111 i = self.symbols.index (l[0])
114 for line in self.colorize_lines (lines):
120 for line in self.colorize_lines (lines):
125 for line in self.colorize_lines (lines):
132 def diff_files (files, symbols=diff_symbols):
133 files = tuple (files) # in case it's a generator, copy it
135 for lines in izip_longest (*files):
136 if all (lines[0] == line for line in lines[1:]):
137 sys.stdout.writelines ([" ", lines[0]])
140 for i, l in enumerate (lines):
142 sys.stdout.writelines ([symbols[i], l])
144 if e.errno != errno.EPIPE:
145 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
152 def filter_failures (f):
153 for key, lines in DiffHelpers.separate_test_cases (f):
155 if not DiffHelpers.test_passed (lines):
156 for l in lines: yield l
164 def add (self, test):
166 self.freq += test.freq
171 self.passed = Stat ()
172 self.failed = Stat ()
175 def add (self, test):
176 self.total.add (test)
178 self.passed.add (test)
180 self.failed.add (test)
183 return float (self.passed.count) / self.total.count
186 return (float (self.passed.count) / self.total.count) * \
187 (float (self.failed.count) / self.total.count)
190 return self.variance () ** .5
def zscore (self, population):
    """Calculate the standard score of this sample against a population.

    population -- the Stats for the population; self is the Stats for
    the sample.  Both must provide mean (); population must also
    provide stddev ().

    Returns a larger absolute value if the sample is highly unlikely to
    be random.  Anything outside of -3..+3 is very unlikely to be random.

    When the population's standard deviation is zero the ratio is
    undefined; rather than raising ZeroDivisionError this returns 0.0 if
    the two means agree and +/-infinity (signed like the difference of
    the means) otherwise.

    See: http://en.wikipedia.org/wiki/Standard_score
    """
    deviation = self.mean () - population.mean ()
    spread = population.stddev ()
    if spread == 0:
        # Degenerate population (every outcome identical): avoid dividing
        # by zero and report the limiting value instead.
        if deviation == 0:
            return 0.0
        return float ('inf') if deviation > 0 else float ('-inf')
    return deviation / spread
211 # XXX port to Stats, but that would really slow us down here
212 for key, lines in DiffHelpers.separate_test_cases (f):
213 if DiffHelpers.test_passed (lines):
217 total = passed + failed
218 print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
221 def print_ngrams (f, ns=(1,2,3)):
222 gens = tuple (Ngram.generator (n) for n in ns)
225 for key, lines in DiffHelpers.separate_test_cases (f):
230 for ngram in gen (test.unicodes):
231 if ngram not in allgrams:
232 allgrams[ngram] = Stats ()
233 allgrams[ngram].add (test)
236 for ngram, stats in allgrams.iteritems ():
237 if stats.failed.count >= 30: # for statistical reasons
238 importantgrams[ngram] = stats
239 allgrams = importantgrams
242 for ngram, stats in allgrams.iteritems ():
243 print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
249 def __init__ (self, lines):
252 self.identifier = None
263 if not self.identifier:
264 self.identifier = l[1:i]
265 i = i + 2 # Skip colon and space
269 brackets = l[i] + l[j]
273 elif brackets == '<>':
274 self.unicodes = Unicode.parse (l)
275 elif brackets == '[]':
276 # XXX we don't handle failed tests here
283 def separate_test_cases (f):
284 '''Reads lines from f, and if the lines have identifiers, ie.
285 have a colon character, groups them by identifier,
286 yielding lists of all lines with the same identifier.'''
290 return l[1:l.index (':')]
292 return groupby (f, key=identifier)
295 def test_passed (lines):
297 # XXX This is a hack, but does the job for now.
298 if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
299 if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
300 if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
301 if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
302 if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
303 if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
304 return all (l[0] == ' ' for l in lines)
310 def filter_printer_function (filter_callback):
312 for line in filter_callback (f):
317 def filter_printer_function_no_newline (filter_callback):
319 for line in filter_callback (f):
320 sys.stdout.writelines ([line])
344 def process_multiple_files (callback, mnemonic = "FILE"):
346 if "--help" in sys.argv:
347 print "Usage: %s %s..." % (sys.argv[0], mnemonic)
351 files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
353 callback (FileHelpers.open_file_or_stdin (s))
355 if e.errno != errno.EPIPE:
356 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
360 def process_multiple_args (callback, mnemonic):
362 if len (sys.argv) == 1 or "--help" in sys.argv:
363 print "Usage: %s %s..." % (sys.argv[0], mnemonic)
367 for s in sys.argv[1:]:
370 if e.errno != errno.EPIPE:
371 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
375 def filter_multiple_strings_or_stdin (callback, mnemonic, \
377 concat_separator = False):
379 if "--help" in sys.argv:
380 print "Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
381 % (sys.argv[0], mnemonic, sys.argv[0])
385 if len (sys.argv) == 1:
387 line = sys.stdin.readline ()
392 print callback (line)
395 if concat_separator != False:
396 args = [concat_separator.join (args)]
397 print separator.join (callback (x) for x in (args))
399 if e.errno != errno.EPIPE:
400 print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
408 return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
412 s = re.sub (r"0[xX]", " ", s)
413 s = re.sub (r"[<+>,;&#\\xXuU\n ]", " ", s)
414 return [int (x, 16) for x in s.split (' ') if len (x)]
418 return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
421 "ZERO WIDTH NON-JOINER": "ZWNJ",
422 "ZERO WIDTH JOINER": "ZWJ",
423 "NARROW NO-BREAK SPACE": "NNBSP",
424 "COMBINING GRAPHEME JOINER": "CGJ",
425 "LEFT-TO-RIGHT MARK": "LRM",
426 "RIGHT-TO-LEFT MARK": "RLM",
427 "LEFT-TO-RIGHT EMBEDDING": "LRE",
428 "RIGHT-TO-LEFT EMBEDDING": "RLE",
429 "POP DIRECTIONAL FORMATTING": "PDF",
430 "LEFT-TO-RIGHT OVERRIDE": "LRO",
431 "RIGHT-TO-LEFT OVERRIDE": "RLO",
437 s = unicodedata.name (u)
440 s = re.sub (".* LETTER ", "", s)
441 s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
442 s = re.sub (".* SIGN ", "", s)
443 s = re.sub (".* COMBINING ", "", s)
444 if re.match (".* VIRAMA", s):
446 if s in Unicode.shorthands:
447 s = Unicode.shorthands[s]
def pretty_names (s):
    """Turn a textual list of hex codepoints into ' + '-joined pretty names.

    The characters '<', '+', '>', backslash, 'u' and 'U', and any "0x"/"0X"
    prefix, are blanked out; what remains is split on commas, spaces and
    newlines, and every non-empty piece is parsed as a hexadecimal
    codepoint.  Each resulting character is passed to Unicode.pretty_name
    and the names are joined with ' + '.  The result is UTF-8 encoded.
    """
    cleaned = re.sub (r"[<+>\\uU]", " ", s)
    cleaned = re.sub (r"0[xX]", " ", cleaned)
    chars = []
    for token in re.split ('[, \n]', cleaned):
        # Consecutive separators yield empty pieces; skip them.
        if len (token):
            chars.append (unichr (int (token, 16)))
    names = (Unicode.pretty_name (ch) for ch in chars)
    return u' + '.join (names).encode ('utf-8')
461 def open_file_or_stdin (f):
470 def read (s, strict = True):
472 if not os.path.exists (s):
474 print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], s)
478 s = os.path.normpath (s)
480 if os.path.isdir (s):
483 m = file (os.path.join (s, "MANIFEST"))
484 items = [x.strip () for x in m.readlines ()]
486 for p in Manifest.read (os.path.join (s, f)):
490 print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST"))
497 def update_recursive (s):
499 for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
501 for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
508 ms = os.path.join (dirpath, "MANIFEST")
516 Manifest.update_recursive (os.path.join (dirpath, f))
518 if __name__ == '__main__':