// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build ignore // Trie table generator. // Used by make*tables tools to generate a go file with trie data structures // for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte // sequence are used to lookup offsets in the index table to be used for the // next byte. The last byte is used to index into a table with 16-bit values. package main import ( "fmt" "hash/crc32" "log" "unicode/utf8" ) const ( blockSize = 64 blockOffset = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes. maxSparseEntries = 16 ) // Intermediate trie structure type trieNode struct { table [256]*trieNode value int b byte leaf bool } func newNode() *trieNode { return new(trieNode) } func (n trieNode) String() string { s := fmt.Sprint("trieNode{table: { non-nil at index: ") for i, v := range n.table { if v != nil { s += fmt.Sprintf("%d, ", i) } } s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf) return s } func (n trieNode) isInternal() bool { internal := true for i := 0; i < 256; i++ { if nn := n.table[i]; nn != nil { if !internal && !nn.leaf { log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n) } internal = internal && !nn.leaf } } return internal } func (n trieNode) mostFrequentStride() int { counts := make(map[int]int) v := 0 for _, t := range n.table[0x80 : 0x80+blockSize] { if t != nil { if stride := t.value - v; v != 0 && stride >= 0 { counts[stride]++ } v = t.value } else { v = 0 } } var maxs, maxc int for stride, cnt := range counts { if cnt > maxc || (cnt == maxc && stride < maxs) { maxs, maxc = stride, cnt } } return maxs } func (n trieNode) countSparseEntries() int { stride := n.mostFrequentStride() var count, v int for _, t := range n.table[0x80 : 0x80+blockSize] { tv := 0 if t != nil { tv = t.value } if tv-v != stride { if tv != 0 { count++ } } v = tv } return count } func (n *trieNode) insert(r rune, value uint16) { var p [utf8.UTFMax]byte sz := utf8.EncodeRune(p[:], r) for i := 0; i < sz; i++ { if n.leaf { log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n) } nn := n.table[p[i]] if nn == nil { nn = newNode() nn.b = p[i] n.table[p[i]] = nn } n = nn } n.value = int(value) n.leaf = true } type nodeIndex struct { lookupBlocks []*trieNode valueBlocks []*trieNode sparseBlocks []*trieNode sparseOffset []uint16 sparseCount int lookupBlockIdx map[uint32]int valueBlockIdx map[uint32]int } func newIndex() *nodeIndex { index := &nodeIndex{} index.lookupBlocks = make([]*trieNode, 0) index.valueBlocks = make([]*trieNode, 0) index.sparseBlocks = make([]*trieNode, 0) index.sparseOffset = make([]uint16, 1) index.lookupBlockIdx = make(map[uint32]int) index.valueBlockIdx = make(map[uint32]int) return index } func computeOffsets(index *nodeIndex, n *trieNode) int { if n.leaf { return n.value } hasher := crc32.New(crc32.MakeTable(crc32.IEEE)) // We only index continuation bytes. for i := 0; i < blockSize; i++ { v := 0 if nn := n.table[0x80+i]; nn != nil { v = computeOffsets(index, nn) } hasher.Write([]byte{uint8(v >> 8), uint8(v)}) } h := hasher.Sum32() if n.isInternal() { v, ok := index.lookupBlockIdx[h] if !ok { v = len(index.lookupBlocks) - blockOffset index.lookupBlocks = append(index.lookupBlocks, n) index.lookupBlockIdx[h] = v } n.value = v } else { v, ok := index.valueBlockIdx[h] if !ok { if c := n.countSparseEntries(); c > maxSparseEntries { v = len(index.valueBlocks) - blockOffset index.valueBlocks = append(index.valueBlocks, n) index.valueBlockIdx[h] = v } else { v = -len(index.sparseOffset) index.sparseBlocks = append(index.sparseBlocks, n) index.sparseOffset = append(index.sparseOffset, uint16(index.sparseCount)) index.sparseCount += c + 1 index.valueBlockIdx[h] = v } } n.value = v } return n.value } func printValueBlock(nr int, n *trieNode, offset int) { boff := nr * blockSize fmt.Printf("\n// Block %#x, offset %#x", nr, boff) var printnewline bool for i := 0; i < blockSize; i++ { if i%6 == 0 { printnewline = true } v := 0 if nn := n.table[i+offset]; nn != nil { v = nn.value } if v != 0 { if printnewline { fmt.Printf("\n") printnewline = false } fmt.Printf("%#04x:%#04x, ", boff+i, v) } } } func printSparseBlock(nr int, n *trieNode) { boff := -n.value fmt.Printf("\n// Block %#x, offset %#x", nr, boff) v := 0 //stride := f(n) stride := n.mostFrequentStride() c := n.countSparseEntries() fmt.Printf("\n{value:%#04x,lo:%#02x},", stride, uint8(c)) for i, nn := range n.table[0x80 : 0x80+blockSize] { nv := 0 if nn != nil { nv = nn.value } if nv-v != stride { if v != 0 { fmt.Printf(",hi:%#02x},", 0x80+i-1) } if nv != 0 { fmt.Printf("\n{value:%#04x,lo:%#02x", nv, nn.b) } } v = nv } if v != 0 { fmt.Printf(",hi:%#02x},", 0x80+blockSize-1) } } func printLookupBlock(nr int, n *trieNode, offset, cutoff int) { boff := nr * blockSize fmt.Printf("\n// Block %#x, offset %#x", nr, boff) var printnewline bool for i := 0; i < blockSize; i++ { if i%8 == 0 { printnewline = true } v := 0 if nn := n.table[i+offset]; nn != nil { v = nn.value } if v != 0 { if v < 0 { v = -v - 1 + cutoff } if printnewline { fmt.Printf("\n") printnewline = false } fmt.Printf("%#03x:%#02x, ", boff+i, v) } } } // printTables returns the size in bytes of the generated tables. func (t *trieNode) printTables(name string) int { index := newIndex() // Values for 7-bit ASCII are stored in first two block, followed by nil block. index.valueBlocks = append(index.valueBlocks, nil, nil, nil) // First byte of multi-byte UTF-8 codepoints are indexed in 4th block. index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil) // Index starter bytes of multi-byte UTF-8. for i := 0xC0; i < 0x100; i++ { if t.table[i] != nil { computeOffsets(index, t.table[i]) } } nv := len(index.valueBlocks) * blockSize fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2) fmt.Printf("// Block 2 is the null block.\n") fmt.Printf("var %sValues = [%d]uint16 {", name, nv) printValueBlock(0, t, 0) printValueBlock(1, t, 64) printValueBlock(2, newNode(), 0) for i := 3; i < len(index.valueBlocks); i++ { printValueBlock(i, index.valueBlocks[i], 0x80) } fmt.Print("\n}\n\n") ls := len(index.sparseBlocks) fmt.Printf("// %sSparseOffset: %d entries, %d bytes\n", name, ls, ls*2) fmt.Printf("var %sSparseOffset = %#v\n\n", name, index.sparseOffset[1:]) ns := index.sparseCount fmt.Printf("// %sSparseValues: %d entries, %d bytes\n", name, ns, ns*4) fmt.Printf("var %sSparseValues = [%d]valueRange {", name, ns) for i, n := range index.sparseBlocks { printSparseBlock(i, n) } fmt.Print("\n}\n\n") cutoff := len(index.valueBlocks) - blockOffset ni := len(index.lookupBlocks) * blockSize fmt.Printf("// %sLookup: %d bytes\n", name, ni) fmt.Printf("// Block 0 is the null block.\n") fmt.Printf("var %sLookup = [%d]uint8 {", name, ni) printLookupBlock(0, newNode(), 0, cutoff) printLookupBlock(1, newNode(), 0, cutoff) printLookupBlock(2, newNode(), 0, cutoff) printLookupBlock(3, t, 0xC0, cutoff) for i := 4; i < len(index.lookupBlocks); i++ { printLookupBlock(i, index.lookupBlocks[i], 0x80, cutoff) } fmt.Print("\n}\n\n") fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:], %sSparseValues[:], %sSparseOffset[:], %d}\n\n", name, name, name, name, name, cutoff) return nv*2 + ns*4 + ni + ls*2 }