2 # Copyright 2008 The RE2 Authors. All Rights Reserved.
3 # Use of this source code is governed by a BSD-style
4 # license that can be found in the LICENSE file.
6 """Generate C++ tables for Unicode Script and Category groups."""
12 // GENERATED BY make_unicode_groups.py; DO NOT EDIT.
13 // make_unicode_groups.py >unicode_groups.cc
15 #include "re2/unicode_groups.h"
30 def MakeRanges(codes):
31 """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
42 def PrintRanges(type, name, ranges):
43 """Print the ranges as an array of type named name."""
44 print "static %s %s[] = {" % (type, name,)
46 print "\t{ %d, %d }," % (lo, hi)
49 # def PrintCodes(type, name, codes):
50 # """Print the codes as an array of type named name."""
51 # print "static %s %s[] = {" % (type, name,)
53 # print "\t%d," % (c,)
56 def PrintGroup(name, codes):
57 """Print the data structures for the group of codes.
58 Return a UGroup literal for the group."""
60 # See unicode_groups.h for a description of the data structure.
62 # Split codes into 16-bit ranges and 32-bit ranges.
63 range16 = MakeRanges([c for c in codes if c < 65536])
64 range32 = MakeRanges([c for c in codes if c >= 65536])
66 # Pull singleton ranges out of range16.
67 # code16 = [lo for lo, hi in range16 if lo == hi]
68 # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
75 ugroup = "{ \"%s\", +1" % (name,)
77 # PrintCodes("uint16", name+"_code16", code16)
78 # ugroup += ", %s_code16, %d" % (name, len(code16))
82 PrintRanges("URange16", name+"_range16", range16)
83 ugroup += ", %s_range16, %d" % (name, len(range16))
87 PrintRanges("URange32", name+"_range32", range32)
88 ugroup += ", %s_range32, %d" % (name, len(range32))
97 for name, codes in unicode.Categories().iteritems():
98 ugroups.append(PrintGroup(name, codes))
99 for name, codes in unicode.Scripts().iteritems():
100 ugroups.append(PrintGroup(name, codes))
101 print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
102 print "UGroup unicode_groups[] = {";
105 print "\t%s," % (ug,)
107 print "int num_unicode_groups = %d;" % (len(ugroups),)
110 if __name__ == '__main__':