diff_symbols = "-+=*&^%$#@!~/"
diff_colors = ['red', 'green', 'blue']
-if sys.version_info[0] >= 3:
+def codepoints(s):
+ return (ord (u) for u in s)
+
+try:
+ unichr = unichr
+
+ if sys.maxunicode < 0x10FFFF:
+ # workarounds for Python 2 "narrow" builds with UCS2-only support.
+
+ _narrow_unichr = unichr
+
+ def unichr(i):
+ """
+ Return the unicode character whose Unicode code is the integer 'i'.
+ The valid range is 0 to 0x10FFFF inclusive.
+
+ >>> _narrow_unichr(0xFFFF + 1)
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in ?
+ ValueError: unichr() arg not in range(0x10000) (narrow Python build)
+ >>> unichr(0xFFFF + 1) == u'\U00010000'
+ True
+ >>> unichr(1114111) == u'\U0010FFFF'
+ True
+ >>> unichr(0x10FFFF + 1)
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in ?
+ ValueError: unichr() arg not in range(0x110000)
+ """
+ try:
+ return _narrow_unichr(i)
+ except ValueError:
+ try:
+ padded_hex_str = hex(i)[2:].zfill(8)
+ escape_str = "\\U" + padded_hex_str
+ return escape_str.decode("unicode-escape")
+ except UnicodeDecodeError:
+ raise ValueError('unichr() arg not in range(0x110000)')
+
+ def codepoints(s):
+ high_surrogate = None
+ for u in s:
+ cp = ord (u)
+ if 0xDC00 <= cp <= 0xDFFF:
+ if high_surrogate:
+ yield 0x10000 + (high_surrogate - 0xD800) * 0x400 + (cp - 0xDC00)
+ high_surrogate = None
+ else:
+ yield 0xFFFC
+ else:
+ if high_surrogate:
+ yield 0xFFFC
+ high_surrogate = None
+ if 0xD800 <= cp <= 0xDBFF:
+ high_surrogate = cp
+ else:
+ yield cp
+ high_surrogate = None
+ if high_surrogate:
+ yield 0xFFFC
+
+except NameError:
unichr = chr
+try:
+ unicode = unicode
+except NameError:
+ unicode = str
+
+def tounicode(s, encoding='ascii', errors='strict'):
+ if not isinstance(s, unicode):
+ return s.decode(encoding, errors)
+ else:
+ return s
+
class ColorFormatter:
class Null:
total = passed + failed
print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total))
- @staticmethod
- def print_ngrams (f, ns=(1,2,3)):
- gens = tuple (Ngram.generator (n) for n in ns)
- allstats = Stats ()
- allgrams = {}
- for key, lines in DiffHelpers.separate_test_cases (f):
- test = Test (lines)
- allstats.add (test)
-
- for gen in gens:
- for ngram in gen (test.unicodes):
- if ngram not in allgrams:
- allgrams[ngram] = Stats ()
- allgrams[ngram].add (test)
-
- importantgrams = {}
- for ngram, stats in allgrams.iteritems ():
- if stats.failed.count >= 30: # for statistical reasons
- importantgrams[ngram] = stats
- allgrams = importantgrams
- del importantgrams
-
- for ngram, stats in allgrams.iteritems ():
- print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
-
-
class Test:
@staticmethod
def decode (s):
- return u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8')
+ return u','.join ("U+%04X" % cp for cp in codepoints (tounicode (s, 'utf-8')))
@staticmethod
def parse (s):
s = re.sub (r"0[xX]", " ", s)
- s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
+ s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n\t]", " ", s)
return [int (x, 16) for x in s.split ()]
@staticmethod