Imported Upstream version 4.8.1
[platform/upstream/gcc48.git] / libgo / go / exp / norm / iter.go
index 761ba90..a954624 100644 (file)
@@ -4,53 +4,96 @@
 
 package norm
 
+import (
+       "fmt"
+       "unicode/utf8"
+)
+
 const MaxSegmentSize = maxByteBufferSize
 
 // An Iter iterates over a string or byte slice, while normalizing it
 // to a given Form.
 type Iter struct {
-       rb   reorderBuffer
-       info runeInfo // first character saved from previous iteration
-       next iterFunc // implementation of next depends on form
-
-       p        int // current position in input source
-       outStart int // start of current segment in output buffer
-       inStart  int // start of current segment in input source
-       maxp     int // position in output buffer after which not to start a new segment
-       maxseg   int // for tracking an excess of combining characters
+       rb     reorderBuffer
+       buf    [maxByteBufferSize]byte
+       info   Properties // first character saved from previous iteration
+       next   iterFunc   // implementation of next depends on form
+       asciiF iterFunc
 
-       tccc uint8
-       done bool
+       p        int    // current position in input source
+       multiSeg []byte // remainder of multi-segment decomposition
 }
 
-type iterFunc func(*Iter, []byte) int
+type iterFunc func(*Iter) []byte
 
-// SetInput initializes i to iterate over src after normalizing it to Form f.
-func (i *Iter) SetInput(f Form, src []byte) {
-       i.rb.init(f, src)
-       if i.rb.f.composing {
-               i.next = nextComposed
-       } else {
-               i.next = nextDecomposed
-       }
+// Init initializes i to iterate over src after normalizing it to Form f.
+func (i *Iter) Init(f Form, src []byte) {
        i.p = 0
-       if i.done = len(src) == 0; !i.done {
-               i.info = i.rb.f.info(i.rb.src, i.p)
+       if len(src) == 0 {
+               i.setDone()
+               i.rb.nsrc = 0
+               return
        }
+       i.multiSeg = nil
+       i.rb.init(f, src)
+       i.next = i.rb.f.nextMain
+       i.asciiF = nextASCIIBytes
+       i.info = i.rb.f.info(i.rb.src, i.p)
 }
 
-// SetInputString initializes i to iterate over src after normalizing it to Form f.
-func (i *Iter) SetInputString(f Form, src string) {
+// InitString initializes i to iterate over src after normalizing it to Form f.
+func (i *Iter) InitString(f Form, src string) {
+       i.p = 0
+       if len(src) == 0 {
+               i.setDone()
+               i.rb.nsrc = 0
+               return
+       }
+       i.multiSeg = nil
        i.rb.initString(f, src)
-       if i.rb.f.composing {
-               i.next = nextComposed
-       } else {
-               i.next = nextDecomposed
+       i.next = i.rb.f.nextMain
+       i.asciiF = nextASCIIString
+       i.info = i.rb.f.info(i.rb.src, i.p)
+}
+
+// Seek sets the segment to be returned by the next call to Next to start
+// at position p.  It is the responsibility of the caller to set p to the
+// start of a UTF8 rune.
+func (i *Iter) Seek(offset int64, whence int) (int64, error) {
+       var abs int64
+       switch whence {
+       case 0:
+               abs = offset
+       case 1:
+               abs = int64(i.p) + offset
+       case 2:
+               abs = int64(i.rb.nsrc) + offset
+       default:
+               return 0, fmt.Errorf("norm: invalid whence")
        }
-       i.p = 0
-       if i.done = len(src) == 0; !i.done {
-               i.info = i.rb.f.info(i.rb.src, i.p)
+       if abs < 0 {
+               return 0, fmt.Errorf("norm: negative position")
+       }
+       if int(abs) >= i.rb.nsrc {
+               i.setDone()
+               return int64(i.p), nil
+       }
+       i.p = int(abs)
+       i.multiSeg = nil
+       i.next = i.rb.f.nextMain
+       i.info = i.rb.f.info(i.rb.src, i.p)
+       return abs, nil
+}
+
+// returnSlice returns a slice of the underlying input type as a byte slice.
+// If the underlying is of type []byte, it will simply return a slice.
+// If the underlying is of type string, it will copy the slice to the buffer
+// and return that.
+func (i *Iter) returnSlice(a, b int) []byte {
+       if i.rb.src.bytes == nil {
+               return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
        }
+       return i.rb.src.bytes[a:b]
 }
 
 // Pos returns the byte position at which the next call to Next will commence processing.
@@ -58,140 +101,232 @@ func (i *Iter) Pos() int {
        return i.p
 }
 
+func (i *Iter) setDone() {
+       i.next = nextDone
+       i.p = i.rb.nsrc
+}
+
 // Done returns true if there is no more input to process.
 func (i *Iter) Done() bool {
-       return i.done
+       return i.p >= i.rb.nsrc
+}
+
+// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
+// For any input a and b for which f(a) == f(b), subsequent calls
+// to Next will return the same segments.
+// Modifying runes are grouped together with the preceding starter, if such a starter exists.
+// Although not guaranteed, n will typically be the smallest possible n.
+func (i *Iter) Next() []byte {
+       return i.next(i)
+}
+
+func nextASCIIBytes(i *Iter) []byte {
+       p := i.p + 1
+       if p >= i.rb.nsrc {
+               i.setDone()
+               return i.rb.src.bytes[i.p:p]
+       }
+       if i.rb.src.bytes[p] < utf8.RuneSelf {
+               p0 := i.p
+               i.p = p
+               return i.rb.src.bytes[p0:p]
+       }
+       i.info = i.rb.f.info(i.rb.src, i.p)
+       i.next = i.rb.f.nextMain
+       return i.next(i)
 }
 
-// Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the
-// largest boundary of i.input such that the result fits in buf.  
-// It returns the number of bytes written to buf.
-// len(buf) should be at least MaxSegmentSize. 
-// Done must be false before calling Next.
-func (i *Iter) Next(buf []byte) int {
-       return i.next(i, buf)
+func nextASCIIString(i *Iter) []byte {
+       p := i.p + 1
+       if p >= i.rb.nsrc {
+               i.buf[0] = i.rb.src.str[i.p]
+               i.setDone()
+               return i.buf[:1]
+       }
+       if i.rb.src.str[p] < utf8.RuneSelf {
+               i.buf[0] = i.rb.src.str[i.p]
+               i.p = p
+               return i.buf[:1]
+       }
+       i.info = i.rb.f.info(i.rb.src, i.p)
+       i.next = i.rb.f.nextMain
+       return i.next(i)
+}
+
+func nextHangul(i *Iter) []byte {
+       if r := i.rb.src.hangul(i.p); r != 0 {
+               i.p += hangulUTF8Size
+               if i.p >= i.rb.nsrc {
+                       i.setDone()
+               }
+               return i.buf[:decomposeHangul(i.buf[:], r)]
+       }
+       i.info = i.rb.f.info(i.rb.src, i.p)
+       i.next = i.rb.f.nextMain
+       return i.next(i)
 }
 
-func (i *Iter) initNext(outn, inStart int) {
-       i.outStart = 0
-       i.inStart = inStart
-       i.maxp = outn - MaxSegmentSize
-       i.maxseg = MaxSegmentSize
+func nextDone(i *Iter) []byte {
+       return nil
 }
 
-// setStart resets the start of the new segment to the given position.
-// It returns true if there is not enough room for the new segment.
-func (i *Iter) setStart(outp, inp int) bool {
-       if outp > i.maxp {
-               return true
+// nextMulti is used for iterating over multi-segment decompositions
+// for decomposing normal forms.
+func nextMulti(i *Iter) []byte {
+       j := 0
+       d := i.multiSeg
+       // skip first rune
+       for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
+       }
+       for j < len(d) {
+               info := i.rb.f.info(input{bytes: d}, j)
+               if info.ccc == 0 {
+                       i.multiSeg = d[j:]
+                       return d[:j]
+               }
+               j += int(info.size)
        }
-       i.outStart = outp
-       i.inStart = inp
-       i.maxseg = outp + MaxSegmentSize
-       return false
+       // treat last segment as normal decomposition
+       i.next = i.rb.f.nextMain
+       return i.next(i)
 }
 
-func min(a, b int) int {
-       if a < b {
-               return a
+// nextMultiNorm is used for iterating over multi-segment decompositions
+// for composing normal forms.
+func nextMultiNorm(i *Iter) []byte {
+       j := 0
+       d := i.multiSeg
+       // skip first rune
+       for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
+       }
+       for j < len(d) {
+               info := i.rb.f.info(input{bytes: d}, j)
+               if info.ccc == 0 {
+                       i.multiSeg = d[j:]
+                       return d[:j]
+               }
+               j += int(info.size)
        }
-       return b
+       i.multiSeg = nil
+       i.next = nextComposed
+       i.p++ // restore old valud of i.p. See nextComposed.
+       if i.p >= i.rb.nsrc {
+               i.setDone()
+       }
+       return d
 }
 
 // nextDecomposed is the implementation of Next for forms NFD and NFKD.
-func nextDecomposed(i *Iter, out []byte) int {
-       var outp int
-       i.initNext(len(out), i.p)
-doFast:
-       inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
+func nextDecomposed(i *Iter) (next []byte) {
+       startp, outp := i.p, 0
+       inCopyStart, outCopyStart := i.p, 0
        for {
                if sz := int(i.info.size); sz <= 1 {
-                       // ASCII or illegal byte.  Either way, advance by 1.
-                       i.p++
+                       p := i.p
+                       i.p++ // ASCII or illegal byte.  Either way, advance by 1.
+                       if i.p >= i.rb.nsrc {
+                               i.setDone()
+                               return i.returnSlice(p, i.p)
+                       } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
+                               i.next = i.asciiF
+                               return i.returnSlice(p, i.p)
+                       }
                        outp++
-                       max := min(i.rb.nsrc, len(out)-outp+i.p)
-                       if np := i.rb.src.skipASCII(i.p, max); np > i.p {
-                               outp += np - i.p
-                               i.p = np
-                               if i.p >= i.rb.nsrc {
-                                       break
+               } else if d := i.info.Decomposition(); d != nil {
+                       // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
+                       // Case 1: there is a leftover to copy.  In this case the decomposition
+                       // must begin with a modifier and should always be appended.
+                       // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
+                       p := outp + len(d)
+                       if outp > 0 {
+                               i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
+                               if p > len(i.buf) {
+                                       return i.buf[:outp]
                                }
-                               // ASCII may combine with consecutive runes.
-                               if i.setStart(outp-1, i.p-1) {
-                                       i.p--
-                                       outp--
-                                       i.info.size = 1
-                                       break
+                       } else if i.info.multiSegment() {
+                               // outp must be 0 as multi-segment decompositions always
+                               // start a new segment.
+                               if i.multiSeg == nil {
+                                       i.multiSeg = d
+                                       i.next = nextMulti
+                                       return nextMulti(i)
                                }
+                               // We are in the last segment.  Treat as normal decomposition.
+                               d = i.multiSeg
+                               i.multiSeg = nil
+                               p = len(d)
                        }
-               } else if d := i.info.decomposition(); d != nil {
-                       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
-                       p := outp + len(d)
-                       if p > i.maxseg && i.setStart(outp, i.p) {
-                               return outp
+                       prevCC := i.info.tccc
+                       if i.p += sz; i.p >= i.rb.nsrc {
+                               i.setDone()
+                               i.info = Properties{} // Force BoundaryBefore to succeed.
+                       } else {
+                               i.info = i.rb.f.info(i.rb.src, i.p)
+                       }
+                       if i.info.BoundaryBefore() {
+                               if outp > 0 {
+                                       copy(i.buf[outp:], d)
+                                       return i.buf[:p]
+                               }
+                               return d
                        }
-                       copy(out[outp:], d)
+                       copy(i.buf[outp:], d)
                        outp = p
-                       i.p += sz
                        inCopyStart, outCopyStart = i.p, outp
+                       if i.info.ccc < prevCC {
+                               goto doNorm
+                       }
+                       continue
                } else if r := i.rb.src.hangul(i.p); r != 0 {
-                       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
-                       for {
-                               outp += decomposeHangul(out[outp:], r)
-                               i.p += hangulUTF8Size
-                               if r = i.rb.src.hangul(i.p); r == 0 {
-                                       break
-                               }
-                               if i.setStart(outp, i.p) {
-                                       return outp
-                               }
+                       i.next = nextHangul
+                       i.p += hangulUTF8Size
+                       if i.p >= i.rb.nsrc {
+                               i.setDone()
                        }
-                       inCopyStart, outCopyStart = i.p, outp
+                       return i.buf[:decomposeHangul(i.buf[:], r)]
                } else {
                        p := outp + sz
-                       if p > i.maxseg && i.setStart(outp, i.p) {
+                       if p > len(i.buf) {
                                break
                        }
                        outp = p
                        i.p += sz
                }
                if i.p >= i.rb.nsrc {
+                       i.setDone()
                        break
                }
                prevCC := i.info.tccc
                i.info = i.rb.f.info(i.rb.src, i.p)
-               if cc := i.info.ccc; cc == 0 {
-                       if i.setStart(outp, i.p) {
-                               break
-                       }
-               } else if cc < prevCC {
+               if i.info.BoundaryBefore() {
+                       break
+               } else if i.info.ccc < prevCC {
                        goto doNorm
                }
        }
-       if inCopyStart != i.p {
-               i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+       if outCopyStart == 0 {
+               return i.returnSlice(inCopyStart, i.p)
+       } else if inCopyStart < i.p {
+               i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
        }
-       i.done = i.p >= i.rb.nsrc
-       return outp
+       return i.buf[:outp]
 doNorm:
        // Insert what we have decomposed so far in the reorderBuffer.
        // As we will only reorder, there will always be enough room.
-       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
-       if !i.rb.insertDecomposed(out[i.outStart:outp]) {
+       i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
+       if !i.rb.insertDecomposed(i.buf[0:outp]) {
                // Start over to prevent decompositions from crossing segment boundaries.
-               // This is a rare occurance.
-               i.p = i.inStart
+               // This is a rare occurrence.
+               i.p = startp
                i.info = i.rb.f.info(i.rb.src, i.p)
        }
-       outp = i.outStart
        for {
                if !i.rb.insert(i.rb.src, i.p, i.info) {
                        break
                }
                if i.p += int(i.info.size); i.p >= i.rb.nsrc {
-                       outp += i.rb.flushCopy(out[outp:])
-                       i.done = true
-                       return outp
+                       i.setDone()
+                       break
                }
                i.info = i.rb.f.info(i.rb.src, i.p)
                if i.info.ccc == 0 {
@@ -199,27 +334,19 @@ doNorm:
                }
        }
        // new segment or too many combining characters: exit normalization
-       if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
-               return outp
-       }
-       goto doFast
+       return i.buf[:i.rb.flushCopy(i.buf[:])]
 }
 
 // nextComposed is the implementation of Next for forms NFC and NFKC.
-func nextComposed(i *Iter, out []byte) int {
-       var outp int
-       i.initNext(len(out), i.p)
-doFast:
-       inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
+func nextComposed(i *Iter) []byte {
+       outp, startp := 0, i.p
        var prevCC uint8
        for {
                if !i.info.isYesC() {
                        goto doNorm
                }
-               if cc := i.info.ccc; cc == 0 {
-                       if i.setStart(outp, i.p) {
-                               break
-                       }
+               if cc := i.info.ccc; cc == 0 && outp > 0 {
+                       break
                } else if cc < prevCC {
                        goto doNorm
                }
@@ -229,58 +356,46 @@ doFast:
                        sz = 1 // illegal rune: copy byte-by-byte
                }
                p := outp + sz
-               if p > i.maxseg && i.setStart(outp, i.p) {
+               if p > len(i.buf) {
                        break
                }
                outp = p
                i.p += sz
-               max := min(i.rb.nsrc, len(out)-outp+i.p)
-               if np := i.rb.src.skipASCII(i.p, max); np > i.p {
-                       outp += np - i.p
-                       i.p = np
-                       if i.p >= i.rb.nsrc {
-                               break
-                       }
-                       // ASCII may combine with consecutive runes.
-                       if i.setStart(outp-1, i.p-1) {
-                               i.p--
-                               outp--
-                               i.info = runeInfo{size: 1}
-                               break
-                       }
-               }
                if i.p >= i.rb.nsrc {
+                       i.setDone()
+                       break
+               } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
+                       i.next = i.asciiF
                        break
                }
                i.info = i.rb.f.info(i.rb.src, i.p)
        }
-       if inCopyStart != i.p {
-               i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
-       }
-       i.done = i.p >= i.rb.nsrc
-       return outp
+       return i.returnSlice(startp, i.p)
 doNorm:
-       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart)
-       outp, i.p = i.outStart, i.inStart
+       multi := false
+       i.p = startp
        i.info = i.rb.f.info(i.rb.src, i.p)
        for {
                if !i.rb.insert(i.rb.src, i.p, i.info) {
                        break
                }
+               multi = multi || i.info.multiSegment()
                if i.p += int(i.info.size); i.p >= i.rb.nsrc {
-                       i.rb.compose()
-                       outp += i.rb.flushCopy(out[outp:])
-                       i.done = true
-                       return outp
+                       i.setDone()
+                       break
                }
                i.info = i.rb.f.info(i.rb.src, i.p)
-               if i.info.boundaryBefore() {
+               if i.info.BoundaryBefore() {
                        break
                }
        }
        i.rb.compose()
-       if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
-               return outp
+       seg := i.buf[:i.rb.flushCopy(i.buf[:])]
+       if multi {
+               i.p-- // fake not being done yet
+               i.multiSeg = seg
+               i.next = nextMultiNorm
+               return nextMultiNorm(i)
        }
-       goto doFast
+       return seg
 }