1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 // Trie table generator.
8 // Used by make*tables tools to generate a go file with trie data structures
9 // for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
10 // sequence are used to lookup offsets in the index table to be used for the
11 // next byte. The last byte is used to index into a table with 16-bit values.
23 const maxSparseEntries = 16
25 // Intermediate trie structure
26 type trieNode struct {
33 func newNode() *trieNode {
37 func (n trieNode) String() string {
38 s := fmt.Sprint("trieNode{table: { non-nil at index: ")
39 for i, v := range n.table {
41 s += fmt.Sprintf("%d, ", i)
44 s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf)
48 func (n trieNode) isInternal() bool {
50 for i := 0; i < 256; i++ {
51 if nn := n.table[i]; nn != nil {
52 if !internal && !nn.leaf {
53 log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n)
55 internal = internal && !nn.leaf
61 func (n trieNode) mostFrequentStride() int {
62 counts := make(map[int]int)
64 for _, t := range n.table[0x80 : 0x80+blockSize] {
66 if stride := t.value - v; v != 0 && stride >= 0 {
75 for stride, cnt := range counts {
76 if cnt > maxc || (cnt == maxc && stride < maxs) {
77 maxs, maxc = stride, cnt
83 func (n trieNode) countSparseEntries() int {
84 stride := n.mostFrequentStride()
86 for _, t := range n.table[0x80 : 0x80+blockSize] {
101 func (n *trieNode) insert(r rune, value uint16) {
102 var p [utf8.UTFMax]byte
103 sz := utf8.EncodeRune(p[:], r)
105 for i := 0; i < sz; i++ {
107 log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n)
121 type nodeIndex struct {
122 lookupBlocks []*trieNode
123 valueBlocks []*trieNode
124 sparseBlocks []*trieNode
125 sparseOffset []uint16
128 lookupBlockIdx map[uint32]int
129 valueBlockIdx map[uint32]int
132 func newIndex() *nodeIndex {
133 index := &nodeIndex{}
134 index.lookupBlocks = make([]*trieNode, 0)
135 index.valueBlocks = make([]*trieNode, 0)
136 index.sparseBlocks = make([]*trieNode, 0)
137 index.sparseOffset = make([]uint16, 1)
138 index.lookupBlockIdx = make(map[uint32]int)
139 index.valueBlockIdx = make(map[uint32]int)
143 func computeOffsets(index *nodeIndex, n *trieNode) int {
147 hasher := crc32.New(crc32.MakeTable(crc32.IEEE))
148 // We only index continuation bytes.
149 for i := 0; i < blockSize; i++ {
151 if nn := n.table[0x80+i]; nn != nil {
152 v = computeOffsets(index, nn)
154 hasher.Write([]byte{uint8(v >> 8), uint8(v)})
158 v, ok := index.lookupBlockIdx[h]
160 v = len(index.lookupBlocks)
161 index.lookupBlocks = append(index.lookupBlocks, n)
162 index.lookupBlockIdx[h] = v
166 v, ok := index.valueBlockIdx[h]
168 if c := n.countSparseEntries(); c > maxSparseEntries {
169 v = len(index.valueBlocks)
170 index.valueBlocks = append(index.valueBlocks, n)
171 index.valueBlockIdx[h] = v
173 v = -len(index.sparseOffset)
174 index.sparseBlocks = append(index.sparseBlocks, n)
175 index.sparseOffset = append(index.sparseOffset, uint16(index.sparseCount))
176 index.sparseCount += c + 1
177 index.valueBlockIdx[h] = v
185 func printValueBlock(nr int, n *trieNode, offset int) {
186 boff := nr * blockSize
187 fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
188 var printnewline bool
189 for i := 0; i < blockSize; i++ {
194 if nn := n.table[i+offset]; nn != nil {
202 fmt.Printf("%#04x:%#04x, ", boff+i, v)
207 func printSparseBlock(nr int, n *trieNode) {
209 fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
212 stride := n.mostFrequentStride()
213 c := n.countSparseEntries()
214 fmt.Printf("\n{value:%#04x,lo:%#02x},", stride, uint8(c))
215 for i, nn := range n.table[0x80 : 0x80+blockSize] {
222 fmt.Printf(",hi:%#02x},", 0x80+i-1)
225 fmt.Printf("\n{value:%#04x,lo:%#02x", nv, nn.b)
231 fmt.Printf(",hi:%#02x},", 0x80+blockSize-1)
235 func printLookupBlock(nr int, n *trieNode, offset, cutoff int) {
236 boff := nr * blockSize
237 fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
238 var printnewline bool
239 for i := 0; i < blockSize; i++ {
244 if nn := n.table[i+offset]; nn != nil {
255 fmt.Printf("%#03x:%#02x, ", boff+i, v)
260 // printTables returns the size in bytes of the generated tables.
261 func (t *trieNode) printTables(name string) int {
263 // Values for 7-bit ASCII are stored in first two block, followed by nil block.
264 index.valueBlocks = append(index.valueBlocks, nil, nil, nil)
265 // First byte of multi-byte UTF-8 codepoints are indexed in 4th block.
266 index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil)
267 // Index starter bytes of multi-byte UTF-8.
268 for i := 0xC0; i < 0x100; i++ {
269 if t.table[i] != nil {
270 computeOffsets(index, t.table[i])
274 nv := len(index.valueBlocks) * blockSize
275 fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2)
276 fmt.Printf("// Block 2 is the null block.\n")
277 fmt.Printf("var %sValues = [%d]uint16 {", name, nv)
278 printValueBlock(0, t, 0)
279 printValueBlock(1, t, 64)
280 printValueBlock(2, newNode(), 0)
281 for i := 3; i < len(index.valueBlocks); i++ {
282 printValueBlock(i, index.valueBlocks[i], 0x80)
286 ls := len(index.sparseBlocks)
287 fmt.Printf("// %sSparseOffset: %d entries, %d bytes\n", name, ls, ls*2)
288 fmt.Printf("var %sSparseOffset = %#v\n\n", name, index.sparseOffset[1:])
290 ns := index.sparseCount
291 fmt.Printf("// %sSparseValues: %d entries, %d bytes\n", name, ns, ns*4)
292 fmt.Printf("var %sSparseValues = [%d]valueRange {", name, ns)
293 for i, n := range index.sparseBlocks {
294 printSparseBlock(i, n)
298 cutoff := len(index.valueBlocks)
299 ni := len(index.lookupBlocks) * blockSize
300 fmt.Printf("// %sLookup: %d bytes\n", name, ni)
301 fmt.Printf("// Block 0 is the null block.\n")
302 fmt.Printf("var %sLookup = [%d]uint8 {", name, ni)
303 printLookupBlock(0, newNode(), 0, cutoff)
304 printLookupBlock(1, newNode(), 0, cutoff)
305 printLookupBlock(2, newNode(), 0, cutoff)
306 printLookupBlock(3, t, 0xC0, cutoff)
307 for i := 4; i < len(index.lookupBlocks); i++ {
308 printLookupBlock(i, index.lookupBlocks[i], 0x80, cutoff)
311 fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:], %sSparseValues[:], %sSparseOffset[:], %d}\n\n",
312 name, name, name, name, name, cutoff)
313 return nv*2 + ns*4 + ni + ls*2