1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 // The worker functions in this file were optimized for performance. If you make changes
6 // you should use care to consider all of the interesting cases.
8 // The code of all worker functions in this file is written twice: once as a slow loop, and the
9 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
10 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
11 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
13 // This define can be used to turn off the fast loops. Useful for finding whether
14 // the problem is fastloop-specific.
18 using System.Runtime.Serialization;
19 using System.Diagnostics;
20 using System.Diagnostics.Contracts;
21 using System.Globalization;
25 // Encodes text into and out of UTF-8. UTF-8 is a way of writing
26 // Unicode characters with variable numbers of bytes per character,
27 // optimized for the lower 127 ASCII characters. It's an efficient way
28 // of encoding US English in an internationalizable way.
30 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
32 // The UTF-8 byte order mark is simply the Unicode byte order mark
33 // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
34 // used mostly to distinguish UTF-8 text from other encodings, and doesn't
35 // switch the byte orderings.
37 public class UTF8Encoding : Encoding
40 bytes bits UTF-8 representation
41 ----- ---- -----------------------------------
43 2 11 110vvvvv 10vvvvvv
44 3 16 1110vvvv 10vvvvvv 10vvvvvv
45 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
46 ----- ---- -----------------------------------
49 Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
52 private const int UTF8_CODEPAGE = 65001;
// Sealed subclass of UTF8Encoding so that calls made through this type can be
// devirtualized (see https://github.com/dotnet/coreclr/pull/9230).
internal sealed class UTF8EncodingSealed : UTF8Encoding
{
    // Passes the identifier flag straight through to the base constructor.
    public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier)
        : base(encoderShouldEmitUTF8Identifier)
    {
    }
}
60 // Shared default instance, used by Encoding.UTF8 for lazy initialization.
61 // The initialization code will not be run until a static member of the class is referenced.
// The instance is created with encoderShouldEmitUTF8Identifier set to true.
62 internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
64 // Flag for emitting U+FEFF as a UTF-8 identifier; the constructor assigns it from
// encoderShouldEmitUTF8Identifier.
66 private bool _emitUTF8Identifier = false;
// When true, SetDefaultFallbacks installs the exception fallbacks rather than the U+FFFD
// replacement fallbacks; the constructor assigns it from throwOnInvalidBytes.
68 private bool _isThrowException = false;
// Default constructor: forwards to the single-argument constructor with
// encoderShouldEmitUTF8Identifier = false.
71 public UTF8Encoding() : this(false)
// Forwards to the two-argument constructor with throwOnInvalidBytes = false.
76 public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) :
77 this(encoderShouldEmitUTF8Identifier, false)
// Stores both flags on the instance. When throwOnInvalidBytes is true the fallbacks are
// set up again so that the exception fallbacks are the ones in effect.
82 public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) :
85 _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
86 _isThrowException = throwOnInvalidBytes;
88 // Encoding's constructor already did this, but it'll be wrong if we're throwing exceptions
// (the flag was not yet assigned at that point), so run it again now.
89 if (_isThrowException)
90 SetDefaultFallbacks();
// Chooses the encoder and decoder fallbacks for this instance based on _isThrowException.
93 internal override void SetDefaultFallbacks()
95 // For UTF-X encodings the default is a replacement fallback that substitutes U+FFFD;
// when _isThrowException is set, the exception fallbacks are installed instead.
96 if (_isThrowException)
98 this.encoderFallback = EncoderFallback.ExceptionFallback;
99 this.decoderFallback = DecoderFallback.ExceptionFallback;
103 this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
104 this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
109 // WARNING: GetByteCount(string chars)
110 // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
111 // WARNING: otherwise it'll break VB's way of declaring these.
113 // The following methods are copied from EncodingNLS.cs.
114 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
115 // These should be kept in sync for the following classes:
116 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
118 // Returns the number of bytes required to encode a range of characters in
119 // a character array.
121 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
122 // So if you fix this, fix the others. Currently those include:
123 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
124 // parent method is safe
// Counts the bytes needed to encode `count` characters of `chars`, starting at `index`.
// After validation the array is pinned and the work is forwarded to the pointer-based
// overload with a null encoder.
// Throws ArgumentNullException naming `chars`, and ArgumentOutOfRangeException when
// `index` or `count` is negative or when fewer than `count` characters remain after `index`.
126 public override unsafe int GetByteCount(char[] chars, int index, int count)
128 // Validate input parameters
130 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
132 if (index < 0 || count < 0)
133 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
// Written as a subtraction; index is already known to be non-negative here.
135 if (chars.Length - index < count)
136 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
137 Contract.EndContractBlock();
139 // If no input, return 0, avoid fixed empty array problem
143 // Just call the pointer version
144 fixed (char* pChars = chars)
145 return GetByteCount(pChars + index, count, null);
148 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
149 // So if you fix this, fix the others. Currently those include:
150 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
151 // parent method is safe
// Counts the bytes needed to encode every character of the string `chars`.
// Throws ArgumentNullException (parameter name "chars") when `chars` is null.
public override unsafe int GetByteCount(String chars)
{
    // Validate input; the reported parameter name must match this method's parameter
    if (chars == null)
        throw new ArgumentNullException("chars");
    Contract.EndContractBlock();

    // Pin the string and forward to the pointer-based overload with a null encoder
    fixed (char* pChars = chars)
        return GetByteCount(pChars, chars.Length, null);
}
164 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
165 // So if you fix this, fix the others. Currently those include:
166 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer-based count: returns the bytes needed to encode `count` characters starting at
// `chars`. Throws ArgumentNullException naming `chars` and ArgumentOutOfRangeException
// naming `count`; otherwise forwards to the internal overload with a null encoder.
168 [CLSCompliant(false)]
169 public override unsafe int GetByteCount(char* chars, int count)
171 // Validate Parameters
173 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
176 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
177 Contract.EndContractBlock();
179 // Call it with empty encoder
180 return GetByteCount(chars, count, null);
183 // Parent method is safe.
184 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
185 // So if you fix this, fix the others. Currently those include:
186 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes `charCount` characters of `s`, starting at `charIndex`, into `bytes` starting at
// `byteIndex`, by forwarding to the pointer-based overload with a null encoder.
// Throws ArgumentNullException when `s` or `bytes` is null, and ArgumentOutOfRangeException
// when `charIndex` or `charCount` is negative, when fewer than `charCount` characters
// remain after `charIndex`, or when `byteIndex` is outside [0, bytes.Length].
188 public override unsafe int GetBytes(String s, int charIndex, int charCount,
189 byte[] bytes, int byteIndex)
191 if (s == null || bytes == null)
192 throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array);
194 if (charIndex < 0 || charCount < 0)
195 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
197 if (s.Length - charIndex < charCount)
198 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
200 if (byteIndex < 0 || byteIndex > bytes.Length)
201 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
202 Contract.EndContractBlock();
// Output space available: everything from byteIndex to the end of the array
204 int byteCount = bytes.Length - byteIndex;
206 // Fixed doesn't like 0 length arrays.
207 if (bytes.Length == 0)
210 fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
211 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
214 // Encodes a range of characters in a character array into a range of bytes
215 // in a byte array. An exception occurs if the byte array is not large
216 // enough to hold the complete encoding of the characters. The
217 // GetByteCount method can be used to determine the exact number of
218 // bytes that will be produced for a given range of characters.
219 // Alternatively, the GetMaxByteCount method can be used to
220 // determine the maximum number of bytes that will be produced for a given
221 // number of characters, regardless of the actual character values.
223 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
224 // So if you fix this, fix the others. Currently those include:
225 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
226 // parent method is safe
// Encodes `charCount` characters of `chars`, starting at `charIndex`, into `bytes` starting
// at `byteIndex`, by forwarding to the pointer-based overload with a null encoder.
// Throws ArgumentNullException when `chars` or `bytes` is null, and
// ArgumentOutOfRangeException when `charIndex` or `charCount` is negative, when fewer than
// `charCount` characters remain after `charIndex`, or when `byteIndex` is outside
// [0, bytes.Length].
228 public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
229 byte[] bytes, int byteIndex)
231 // Validate parameters
232 if (chars == null || bytes == null)
233 throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array);
235 if (charIndex < 0 || charCount < 0)
236 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
238 if (chars.Length - charIndex < charCount)
239 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
241 if (byteIndex < 0 || byteIndex > bytes.Length)
242 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
243 Contract.EndContractBlock();
245 // If nothing to encode return 0, avoid fixed problem
249 // Just call pointer version
250 int byteCount = bytes.Length - byteIndex;
252 // Fixed doesn't like 0 length arrays.
253 if (bytes.Length == 0)
256 fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
257 // Remember that byteCount is the space left after byteIndex, not the size of the array.
258 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
261 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
262 // So if you fix this, fix the others. Currently those include:
263 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer-based encode entry point: validates the arguments and forwards to the
// internal overload with a null encoder.
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    // Null checks; `bytes` is reported first when both pointers are null
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    // Range checks; `charCount` is reported first when both counts are negative
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    return GetBytes(chars, charCount, bytes, byteCount, null);
}
279 // Returns the number of characters produced by decoding a range of bytes
282 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
283 // So if you fix this, fix the others. Currently those include:
284 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
285 // parent method is safe
// Counts the characters produced by decoding `count` bytes of `bytes`, starting at `index`.
// After validation the array is pinned and the work is forwarded to the pointer-based
// overload with a null decoder.
// Throws ArgumentNullException naming `bytes`, and ArgumentOutOfRangeException when
// `index` or `count` is negative or when fewer than `count` bytes remain after `index`.
287 public override unsafe int GetCharCount(byte[] bytes, int index, int count)
289 // Validate Parameters
291 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
293 if (index < 0 || count < 0)
294 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
296 if (bytes.Length - index < count)
297 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
298 Contract.EndContractBlock();
300 // If no input just return 0, fixed doesn't like 0 length arrays.
304 // Just call pointer version
305 fixed (byte* pBytes = bytes)
306 return GetCharCount(pBytes + index, count, null);
309 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
310 // So if you fix this, fix the others. Currently those include:
311 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer-based count: returns the characters produced by decoding `count` bytes starting
// at `bytes`. Throws ArgumentNullException naming `bytes` and ArgumentOutOfRangeException
// naming `count`; otherwise forwards to the internal overload with a null decoder.
313 [CLSCompliant(false)]
314 public override unsafe int GetCharCount(byte* bytes, int count)
316 // Validate Parameters
318 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
321 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
322 Contract.EndContractBlock();
324 return GetCharCount(bytes, count, null);
327 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
328 // So if you fix this, fix the others. Currently those include:
329 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
330 // parent method is safe
// Decodes `byteCount` bytes of `bytes`, starting at `byteIndex`, into `chars` starting at
// `charIndex`, by forwarding to the pointer-based overload with a null decoder.
// Throws ArgumentNullException when `bytes` or `chars` is null, and
// ArgumentOutOfRangeException when `byteIndex` or `byteCount` is negative, when fewer than
// `byteCount` bytes remain after `byteIndex`, or when `charIndex` is outside
// [0, chars.Length].
332 public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
333 char[] chars, int charIndex)
335 // Validate Parameters
336 if (bytes == null || chars == null)
337 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
339 if (byteIndex < 0 || byteCount < 0)
340 throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
342 if ( bytes.Length - byteIndex < byteCount)
343 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
345 if (charIndex < 0 || charIndex > chars.Length)
346 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
347 Contract.EndContractBlock();
349 // If no input, return 0 & avoid fixed problem
353 // Just call pointer version
354 int charCount = chars.Length - charIndex;
356 // Fixed doesn't like 0 length arrays.
357 if (chars.Length == 0)
360 fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
361 // Remember that charCount is the space left after charIndex, not the size of the array
362 return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
365 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
366 // So if you fix this, fix the others. Currently those include:
367 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer-based decode entry point: validates the arguments and forwards to the
// internal overload with a null decoder.
[CLSCompliant(false)]
public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    // Null checks; `bytes` is reported first when both pointers are null
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    // Range checks; `charCount` is reported first when both counts are negative
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    return GetChars(bytes, byteCount, chars, charCount, null);
}
383 // Returns a string containing the decoded representation of a range of
384 // bytes in a byte array.
386 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
387 // So if you fix this, fix the others. Currently those include:
388 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
389 // parent method is safe
// Decodes `count` bytes of `bytes`, starting at `index`, into a new string.
// Returns String.Empty when `count` is 0; otherwise pins the array and hands the range
// and this encoding to String.CreateStringFromEncoding.
// Throws ArgumentNullException naming `bytes`, and ArgumentOutOfRangeException when
// `index` or `count` is negative or when fewer than `count` bytes remain after `index`.
391 public override unsafe String GetString(byte[] bytes, int index, int count)
393 // Validate Parameters
395 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
397 if (index < 0 || count < 0)
398 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
400 if (bytes.Length - index < count)
401 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
402 Contract.EndContractBlock();
404 // Avoid problems with empty input buffer
405 if (count == 0) return String.Empty;
407 fixed (byte* pBytes = bytes)
408 return String.CreateStringFromEncoding(
409 pBytes + index, count, this);
413 // End of standard methods copied from EncodingNLS.cs
416 // To simplify maintenance, the structure of GetByteCount and GetBytes should be
417 // kept the same as much as possible
// Worker behind the public GetByteCount overloads: counts the bytes needed to encode
// `count` chars starting at `chars`. `baseEncoder` may be null (the public overloads in
// this file pass null). When it is not null it is cast to UTF8Encoder, its pending
// surrogateChar is picked up, and an ArgumentException is thrown if its fallback buffer
// still has data remaining.
// The running total starts at one byte per char and is adjusted as non-ASCII chars and
// surrogates are seen; an ArgumentException (SR.Argument_ConversionOverflow) is thrown
// on overflow.
418 internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
420 // For fallback we may need a fallback buffer.
421 // We wait to initialize it though in case we don't have any broken input unicode
422 EncoderFallbackBuffer fallbackBuffer = null;
423 char* pSrcForFallback;
426 char* pEnd = pSrc + count;
428 // Start by assuming we have as many as count
429 int byteCount = count;
433 if (baseEncoder != null)
435 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
436 ch = encoder.surrogateChar;
438 // We mustn't have left over fallback data when counting
439 if (encoder.InternalHasFallbackBuffer)
441 fallbackBuffer = encoder.FallbackBuffer;
442 if (fallbackBuffer.Remaining > 0)
443 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
445 // Set our internal fallback interesting things.
446 fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
452 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
457 // Unroll any fallback that happens at the end
458 ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
467 // Case of surrogates in the fallback.
468 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
// NOTE(review): the assert message below names GetBytes although this is GetByteCount.
470 Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
471 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
473 ch = fallbackBuffer.InternalGetNextChar();
476 if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
488 byteCount--; // ignore last one.
498 if (baseEncoder != null && !baseEncoder.MustFlush)
503 // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
// NOTE(review): the assert message below names GetBytes although this is GetByteCount.
510 Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
511 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
513 // use separate helper variables for local contexts so that the jit optimizations
514 // won't get confused about the variable lifetimes
517 // count the pending surrogate
520 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
521 // if (IsLowSurrogate(cha)) {
522 if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
524 // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
526 // ch = cha + (ch << 10) +
528 // - CharUnicodeInfo.LOW_SURROGATE_START
529 // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
531 // Use this next char
534 // else ch is still high surrogate and encoding will fail (so don't add count)
536 // attempt to encode the surrogate or partial surrogate
540 // If we've used a fallback, then we have to check for it
541 if (fallbackBuffer != null)
543 ch = fallbackBuffer.InternalGetNextChar();
546 // We have an extra byte we weren't expecting.
552 // read next char. The JIT optimization seems to be getting confused when
553 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
558 // if (IsHighSurrogate(ch)) {
559 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
561 // we will count this surrogate next time around
565 // either good char or partial surrogate
568 // throw exception on partial surrogate if necessary
569 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
570 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
572 // Lone surrogates aren't allowed
573 // Have to make a fallback buffer if we don't have one
574 if (fallbackBuffer == null)
576 // wait on fallbacks if we can
577 // For fallback we may need a fallback buffer
578 if (baseEncoder == null)
579 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
581 fallbackBuffer = baseEncoder.FallbackBuffer;
583 // Set our internal fallback interesting things.
584 fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
587 // Do our fallback. Actually we already know it's a mixed up surrogate,
588 // so the ref pSrc isn't gonna do anything.
589 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
590 fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
591 pSrc = pSrcForFallback;
593 // Ignore it if we don't throw (we had preallocated this ch)
604 // the extra surrogate byte was compensated by the second surrogate character
605 // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
612 // check for overflow
620 // If still have fallback don't do fast loop
621 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
623 // We're reserving 1 byte for each char by default
628 int availableChars = PtrDiff(pEnd, pSrc);
630 // don't fall into the fast decoding loop if we don't have enough characters
631 if (availableChars <= 13)
633 // try to get over the remainder of the ascii characters fast though
634 char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
635 while (pSrc < pLocalEnd)
648 // make sure that we won't get a silent overflow inside the fast loop
649 // (Fall out to slow loop if we have this many characters)
650 availableChars &= 0x0FFFFFFF;
653 // To compute the upper bound, assume that all characters are ASCII characters at this point,
654 // the boundary will be decreased for every non-ASCII character we encounter
655 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
656 char* pStop = pSrc + availableChars - (3 + 4);
663 if (ch > 0x7F) // Not ASCII
665 if (ch > 0x7FF) // Not 2 Byte
667 if ((ch & 0xF800) == 0xD800) // See if it's a Surrogate
// Tests bit 1 of the source address before the loop that reads chars through int*
675 if ((unchecked((int)pSrc) & 0x2) != 0)
679 if (ch > 0x7F) // Not ASCII
681 if (ch > 0x7FF) // Not 2 Byte
683 if ((ch & 0xF800) == 0xD800) // See if it's a Surrogate
691 // Run 2 * 4 characters at a time!
695 int chc = *(int*)(pSrc + 2);
696 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
698 if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
700 goto LongCodeWithMask;
704 if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
706 if ((ch & unchecked((int)0xFF80)) != 0)
708 if ((chc & unchecked((int)0xFF800000)) != 0)
710 if ((chc & unchecked((int)0xFF80)) != 0)
716 chc = *(int*)(pSrc + 2);
717 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
719 if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
721 goto LongCodeWithMask;
724 if ((ch & unchecked((int)0xFF800000)) != 0)
726 if ((ch & unchecked((int)0xFF80)) != 0)
728 if ((chc & unchecked((int)0xFF800000)) != 0)
730 if ((chc & unchecked((int)0xFF80)) != 0)
739 // be careful about the sign extension
740 ch = (int)(((uint)ch) >> 16);
752 // use separate helper variables for slow and fast loop so that the jit optimizations
753 // won't get confused about the variable lifetimes
756 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
757 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
759 // 4 byte encoding - high surrogate + low surrogate
763 // !IsHighSurrogate(ch) // low without high -> bad
764 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
765 // !IsLowSurrogate(chd) // high not followed by low -> bad
766 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
768 // Back up and drop out to slow loop to figure out error
774 // byteCount - this byte is compensated by the second surrogate character
780 // byteCount - the last byte is already included
784 // no pending char at this point
789 // check for overflow
792 throw new ArgumentException(
793 SR.Argument_ConversionOverflow);
797 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
798 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
// Returns the distance between two char pointers, measured in chars. The subtraction is
// done on the byte addresses with unsigned arithmetic, which is good enough here and
// tends to generate better code than the default signed pointer difference.
private static unsafe int PtrDiff(char* a, char* b)
{
    uint byteDistance = (uint)((byte*)a - (byte*)b);
    return (int)(byteDistance >> 1);
}
811 // byte* flavor of PtrDiff, provided just for parity with the char* overload
812 unsafe private static int PtrDiff(byte* a, byte* b)
// Returns true when start <= ch <= end. Both sides are rebased on `start` and compared
// as unsigned values, so a single comparison covers the lower and the upper bound.
private static bool InRange(int ch, int start, int end)
{
    uint offset = (uint)(ch - start);
    uint width = (uint)(end - start);
    return offset <= width;
}
823 // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
824 internal override unsafe int GetBytes(char* chars, int charCount,
825 byte* bytes, int byteCount, EncoderNLS baseEncoder)
827 Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
828 Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
829 Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
830 Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
832 UTF8Encoder encoder = null;
834 // For fallback we may need a fallback buffer.
835 // We wait to initialize it though in case we don't have any broken input unicode
836 EncoderFallbackBuffer fallbackBuffer = null;
837 char* pSrcForFallback;
840 byte* pTarget = bytes;
842 char* pEnd = pSrc + charCount;
843 byte* pAllocatedBufferEnd = pTarget + byteCount;
847 // assume that JIT will enregister pSrc, pTarget and ch
849 if (baseEncoder != null)
851 encoder = (UTF8Encoder)baseEncoder;
852 ch = encoder.surrogateChar;
854 // We mustn't have left over fallback data when counting
855 if (encoder.InternalHasFallbackBuffer)
857 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
858 fallbackBuffer = encoder.FallbackBuffer;
859 if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
860 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
862 // Set our internal fallback interesting things.
863 fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
869 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
875 // Check if there's anything left to get out of the fallback buffer
876 ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
884 // Case of leftover surrogates in the fallback buffer
885 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
887 Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
888 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
892 ch = fallbackBuffer.InternalGetNextChar();
894 if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
896 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
910 // attempt to encode the partial surrogate (will fail or ignore)
911 if (ch > 0 && (encoder == null || encoder.MustFlush))
920 // We have a high surrogate left over from a previous loop.
921 Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
922 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
924 // use separate helper variables for local contexts so that the jit optimizations
925 // won't get confused about the variable lifetimes
928 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
929 // if (IsLowSurrogate(cha)) {
930 if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
932 ch = cha + (ch << 10) +
934 - CharUnicodeInfo.LOW_SURROGATE_START
935 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
939 // else ch is still high surrogate and encoding will fail
941 // attempt to encode the surrogate or partial surrogate
945 // If we've used a fallback, then we have to check for it
946 if (fallbackBuffer != null)
948 ch = fallbackBuffer.InternalGetNextChar();
949 if (ch > 0) goto ProcessChar;
952 // read next char. The JIT optimization seems to be getting confused when
953 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
958 // if (IsHighSurrogate(ch)) {
959 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
963 // either good char or partial surrogate
966 // throw exception on partial surrogate if necessary
967 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
968 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
970 // Lone surrogates aren't allowed, we have to do fallback for them
971 // Have to make a fallback buffer if we don't have one
972 if (fallbackBuffer == null)
974 // wait on fallbacks if we can
975 // For fallback we may need a fallback buffer
976 if (baseEncoder == null)
977 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
979 fallbackBuffer = baseEncoder.FallbackBuffer;
981 // Set our internal fallback interesting things.
982 fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
985 // Do our fallback. Actually we already know its a mixed up surrogate,
986 // so the ref pSrc isn't gonna do anything.
987 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
988 fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
989 pSrc = pSrcForFallback;
991 // Ignore it if we don't throw
996 // Count bytes needed
1004 bytesNeeded++; // 4 bytes (surrogate pair)
1006 bytesNeeded++; // 3 bytes (800-FFFF)
1008 bytesNeeded++; // 2 bytes (80-7FF)
1011 if (pTarget > pAllocatedBufferEnd - bytesNeeded)
1013 // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1014 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1016 fallbackBuffer.MovePrevious(); // Didn't use this fallback char
1018 fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
1022 pSrc--; // Didn't use this char
1024 pSrc--; // Was surrogate, didn't use 2nd part either
1026 Debug.Assert(pSrc >= chars || pTarget == bytes,
1027 "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1028 ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
1029 ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
1035 *pTarget = (byte)ch;
1039 // use separate helper variables for local contexts so that the jit optimizations
1040 // won't get confused about the variable lifetimes
1045 chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1051 chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1055 *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1058 chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1060 *pTarget = (byte)chb;
1063 chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1065 *pTarget = (byte)chb;
1068 *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1074 // If still have fallback don't do fast loop
1075 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1078 int availableChars = PtrDiff(pEnd, pSrc);
1079 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1081 // don't fall into the fast decoding loop if we don't have enough characters
1082 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1083 if (availableChars <= 13)
1085 // we are hoping for 1 byte per char
1086 if (availableBytes < availableChars)
1088 // not enough output room. no pending bits at this point
1093 // try to get over the remainder of the ascii characters fast though
1094 char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1095 while (pSrc < pLocalEnd)
1100 // Not ASCII, need more than 1 byte per char
1104 *pTarget = (byte)ch;
1107 // we are done, let ch be 0 to clear encoder
1112 // we need at least 1 byte per character, but Convert might allow us to convert
1113 // only part of the input, so try as much as we can. Reduce charCount if necessary
1114 if (availableBytes < availableChars)
1116 availableChars = availableBytes;
1120 // - optimistic range checks
1121 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1123 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1124 // the boundary will be decreased for every non-ASCII character we encounter
1125 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
1126 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1127 char* pStop = pSrc + availableChars - 5;
1129 while (pSrc < pStop)
1138 *pTarget = (byte)ch;
1142 if ((unchecked((int)pSrc) & 0x2) != 0)
1150 *pTarget = (byte)ch;
1154 // Run 4 characters at a time!
1155 while (pSrc < pStop)
1158 int chc = *(int*)(pSrc + 2);
1159 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
1161 goto LongCodeWithMask;
1164 // Unfortunately, this is endianness sensitive
1166 *pTarget = (byte)(ch>>16);
1167 *(pTarget+1) = (byte)ch;
1169 *(pTarget+2) = (byte)(chc>>16);
1170 *(pTarget+3) = (byte)chc;
1173 *pTarget = (byte)ch;
1174 *(pTarget + 1) = (byte)(ch >> 16);
1176 *(pTarget + 2) = (byte)chc;
1177 *(pTarget + 3) = (byte)(chc >> 16);
1185 // be careful about the sign extension
1186 ch = (int)(((uint)ch) >> 16);
1196 *pTarget = (byte)ch;
1201 // use separate helper variables for slow and fast loop so that the jit optimizations
1202 // won't get confused about the variable lifetimes
1207 chd = unchecked((sbyte)0xC0) | (ch >> 6);
1211 // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1212 if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1215 chd = unchecked((sbyte)0xE0) | (ch >> 12);
1219 // 4 byte encoding - high surrogate + low surrogate
1220 // if (!IsHighSurrogate(ch))
1221 if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
1223 // low without high -> bad, try again in slow loop
1231 // if (!IsLowSurrogate(chd)) {
1232 if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1234 // high not followed by low -> bad, try again in slow loop
1239 ch = chd + (ch << 10) +
1241 - CharUnicodeInfo.LOW_SURROGATE_START
1242 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
1244 *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1245 // pStop - this byte is compensated by the second surrogate character
1246 // 2 input chars require 4 output bytes. 2 have been anticipated already
1247 // and 2 more will be accounted for by the 2 pStop-- calls below.
1250 chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1252 *pTarget = (byte)chd;
1253 pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1256 chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1258 *pTarget = (byte)chd;
1259 pStop--; // 2 byte sequence for 1 char so need pStop--.
1262 *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1263 // pStop - this byte is already included
1267 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1271 // no pending char at this point
1275 // Do we have to set the encoder bytes?
1276 if (encoder != null)
1278 Debug.Assert(!encoder.MustFlush || ch == 0,
1279 "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1281 encoder.surrogateChar = ch;
1282 encoder.m_charsUsed = (int)(pSrc - chars);
1285 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1286 baseEncoder == null || !baseEncoder.m_throwOnOverflow,
1287 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1289 return (int)(pTarget - bytes);
1293 // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
1294 // while the actual character is being built in the lower bits. They are shifted together
1295 // with the actual bits of the character.
1297 // bits 30 & 31 are used for pending bits fixup
// (GetCharCount adds or subtracts "ch >> 30" to correct its running char count.)
// FinalByte: a lead byte seeds this flag pre-shifted right by 6 bits per expected trail byte;
// each "ch = (ch << 6) | ..." fold moves it up, and the decode loops test
// "(ch & FinalByte) == 0" to detect that the last byte has not been reached yet.
1298 private const int FinalByte = 1 << 29;
// SupplimentarySeq: set while a 4-byte (supplementary character) sequence is in progress.
1299 private const int SupplimentarySeq = 1 << 28;
// ThreeByteSeq: set while a 3-byte sequence is in progress.
1300 private const int ThreeByteSeq = 1 << 27;
1302 // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
1303 // If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1305 // To simplify maintenance, the structure of GetCharCount and GetChars should be
1306 // kept the same as much as possible
// Computes how many chars decoding "count" UTF-8 bytes at "bytes" would produce.
//   bytes       - start of the input (asserted non-null).
//   count       - number of input bytes (asserted non-negative).
//   baseDecoder - optional stateful decoder; when non-null it is cast to UTF8Decoder and the
//                 count adjustment held in bits 30/31 of the state word is applied up front.
// The running total starts at one char per byte and is corrected as multi-byte sequences
// are recognised. Invalid sequences are handed to the decoder fallback and the number of
// chars it reports is added to the total. At the end, if baseDecoder is null or
// baseDecoder.MustFlush is set, the same fallback path is taken for the leftover state.
// NOTE(review): the final return statement is not among the lines shown here; presumably
// it returns charCount - confirm.
1307 internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1309 Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1310 Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
1314 byte* pEnd = pSrc + count;
1316 // Start by assuming we have as many as count, charCount always includes the adjustment
1317 // for the character being decoded
1318 int charCount = count;
1320 DecoderFallbackBuffer fallback = null;
1322 if (baseDecoder != null)
1324 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1326 charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.
1328 // Shouldn't have anything in fallback buffer for GetCharCount
1329 // (don't have to check m_throwOnOverflow for count)
1330 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1331 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1336 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1349 // read next byte. The JIT optimization seems to be getting confused when
1350 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1354 // we are expecting to see trailing bytes like 10vvvvvv
1355 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1357 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1358 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
// Undo the count adjustment carried in bits 30/31 before reporting the sequence as invalid.
1360 charCount += (ch >> 30);
1361 goto InvalidByteSequence;
1364 // fold in the new byte
1365 ch = (ch << 6) | (cha & 0x3F);
// FinalByte still clear means more trail bytes are expected for this sequence.
1367 if ((ch & FinalByte) == 0)
1369 Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1370 "[UTF8Encoding.GetChars]Invariant volation");
1372 if ((ch & SupplimentarySeq) != 0)
1374 if ((ch & (FinalByte >> 6)) != 0)
1376 // this is 3rd byte (of 4 byte supplementary) - nothing to do
1380 // 2nd byte, check for non-shortest form of supplementary char and the valid
1381 // supplementary characters in range 0x010000 - 0x10FFFF at the same time
1382 if (!InRange(ch & 0x1F0, 0x10, 0x100))
1384 goto InvalidByteSequence;
1389 // Must be 2nd byte of a 3-byte sequence
1390 // check for non-shortest form of 3 byte seq
1391 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1392 (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
1394 goto InvalidByteSequence;
1402 // adjust for surrogates in non-shortest form
1403 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
1409 InvalidByteSequence:
1410 // this code fragment should be close to the gotos referencing it
1411 // Have to do fallback for invalid bytes
// The fallback buffer is obtained lazily: a fresh one from this.decoderFallback when there
// is no decoder, otherwise the decoder's own buffer.
1412 if (fallback == null)
1414 if (baseDecoder == null)
1415 fallback = this.decoderFallback.CreateFallbackBuffer();
1417 fallback = baseDecoder.FallbackBuffer;
1418 fallback.InternalInitialize(bytes, null);
1420 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1432 // If it's > 0x7F, it's the start of a new multi-byte sequence
1434 // Long sequence, so unreserve our char.
1437 // bit 6 has to be non-zero for start of multibyte chars.
1438 if ((ch & 0x40) == 0)
1440 // Unexpected trail byte
1441 goto InvalidByteSequence;
1444 // start a new long code
1445 if ((ch & 0x20) != 0)
1447 if ((ch & 0x10) != 0)
1449 // 4 byte encoding - supplementary character (2 surrogates)
1453 // check that bit 4 is zero and the valid supplementary character
1454 // range 0x000000 - 0x10FFFF at the same time
1458 goto InvalidByteSequence;
1461 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1462 // Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
1463 ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
1464 (1 << 30) | // If it dies on next byte we'll need an extra char
1465 (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
1466 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1467 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1469 // Our character count will be 2 characters for these 4 bytes, so subtract another char
1475 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1476 ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1477 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1479 // We'll expect 1 character for these 3 bytes, so subtract another char.
1489 // check for non-shortest form
1493 goto InvalidByteSequence;
1496 // Add bit flags so we'll be flagged correctly
1497 ch |= (FinalByte >> 6);
// Bytes still unread; used to decide whether the unrolled fast loop is worth entering.
1505 int availableBytes = PtrDiff(pEnd, pSrc);
1507 // don't fall into the fast decoding loop if we don't have enough bytes
1508 if (availableBytes <= 13)
1510 // try to get over the remainder of the ascii characters fast though
1511 byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1512 while (pSrc < pLocalEnd)
1525 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1526 // the boundary will be decreased for every non-ASCII character we encounter
1527 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1528 byte* pStop = pSrc + availableBytes - 7;
1530 while (pSrc < pStop)
1540 // get pSrc 2-byte aligned
1541 if ((unchecked((int)pSrc) & 0x1) != 0)
1551 // get pSrc 4-byte aligned
1552 if ((unchecked((int)pSrc) & 0x2) != 0)
1554 ch = *(ushort*)pSrc;
// 0x8080 tests the high bit of both bytes just read: any set bit means non-ASCII.
1555 if ((ch & 0x8080) != 0)
1557 goto LongCodeWithMask16;
1562 // Run 8 + 8 characters at a time!
1563 while (pSrc < pStop)
1566 int chb = *(int*)(pSrc + 4);
// 0x80808080 tests the high bit of all eight bytes held in ch and chb at once.
1567 if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1569 goto LongCodeWithMask32;
1573 // This is a really small loop - unroll it
1578 chb = *(int*)(pSrc + 4);
1579 if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1581 goto LongCodeWithMask32;
1589 // be careful about the sign extension
1590 ch = (int)(((uint)ch) >> 16);
1592 ch = (int)(((uint)ch) >> 8);
1609 // bit 6 has to be zero
1611 // we are expecting to see trailing bytes like 10vvvvvv
1612 (chc & unchecked((sbyte)0xC0)) != 0x80)
1619 // start a new long code
1620 if ((ch & 0x20) != 0)
1622 // fold the first two bytes together
1623 chc |= (ch & 0x0F) << 6;
1625 if ((ch & 0x10) != 0)
1627 // 4 byte encoding - surrogate
1630 // check that bit 4 is zero, the non-shortest form of surrogate
1631 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1632 !InRange(chc >> 4, 0x01, 0x10) ||
1633 // we are expecting to see trailing bytes like 10vvvvvv
1634 (ch & unchecked((sbyte)0xC0)) != 0x80)
1639 chc = (chc << 6) | (ch & 0x3F);
1642 // we are expecting to see trailing bytes like 10vvvvvv
1643 if ((ch & unchecked((sbyte)0xC0)) != 0x80)
1657 // check for non-shortest form of 3 byte seq
1658 (chc & (0x1F << 5)) == 0 ||
1659 // Can't have surrogates here.
1660 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1661 // we are expecting to see trailing bytes like 10vvvvvv
1662 (ch & unchecked((sbyte)0xC0)) != 0x80)
1676 // check for non-shortest form
1677 if ((ch & 0x1E) == 0)
1688 // no pending bits at this point
1698 // May have a problem if we have to flush
1701 // We were already adjusting for these, so need to unadjust
1702 charCount += (ch >> 30);
// With no decoder, or a decoder that must flush, the leftover bytes cannot wait for more
// input, so they are reported through the fallback.
1703 if (baseDecoder == null || baseDecoder.MustFlush)
1705 // Have to do fallback for invalid bytes
1706 if (fallback == null)
1708 if (baseDecoder == null)
1709 fallback = this.decoderFallback.CreateFallbackBuffer();
1711 fallback = baseDecoder.FallbackBuffer;
1712 fallback.InternalInitialize(bytes, null);
1714 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1718 // Shouldn't have anything in fallback buffer for GetCharCount
1719 // (don't have to check m_throwOnOverflow for count)
1720 Debug.Assert(fallback == null || fallback.Remaining == 0,
1721 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1726 // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
1727 // So if we're really broken, then that could also throw an error... recursively.
1728 // So try to make sure GetChars can at least process all uses by
1729 // System.Resources.ResourceReader!
1731 // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
1732 // If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1734 // To simplify maintenance, the structure of GetCharCount and GetChars should be
1735 // kept the same as much as possible
// Decodes "byteCount" UTF-8 bytes at "bytes" into UTF-16 chars written at "chars".
//   bytes / byteCount - input buffer and its length (asserted non-null / non-negative).
//   chars / charCount - output buffer and its capacity in chars (asserted non-null /
//                       non-negative); pAllocatedBufferEnd marks chars + charCount.
//   baseDecoder       - optional stateful decoder; when non-null it is cast to UTF8Decoder
//                       and its m_bytesUsed is set to (pSrc - bytes) before returning.
// Returns PtrDiff(pTarget, chars), the distance from the start of the output buffer to the
// write cursor.
// Invalid sequences are handed to the decoder fallback. When the fallback output or a
// decoded char does not fit, ThrowCharsOverflow is called with a flag saying whether
// nothing has been written yet (pTarget == chars).
1736 internal override unsafe int GetChars(byte* bytes, int byteCount,
1737 char* chars, int charCount, DecoderNLS baseDecoder)
1739 Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
1740 Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
1741 Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1742 Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
1745 char* pTarget = chars;
1747 byte* pEnd = pSrc + byteCount;
1748 char* pAllocatedBufferEnd = pTarget + charCount;
1752 DecoderFallbackBuffer fallback = null;
// Scratch copies handed to FallbackInvalidByteSequence by ref, so that pSrc and pTarget
// themselves are never passed by reference.
1753 byte* pSrcForFallback;
1754 char* pTargetForFallback;
1755 if (baseDecoder != null)
1757 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1760 // Shouldn't have anything in fallback buffer for GetChars
1761 // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
1762 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1763 "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1768 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1781 // read next byte. The JIT optimization seems to be getting confused when
1782 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1786 // we are expecting to see trailing bytes like 10vvvvvv
1787 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1789 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1790 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1792 goto InvalidByteSequence;
1795 // fold in the new byte
1796 ch = (ch << 6) | (cha & 0x3F);
1798 if ((ch & FinalByte) == 0)
1800 // Not at last byte yet
1801 Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1802 "[UTF8Encoding.GetChars]Invariant volation");
1804 if ((ch & SupplimentarySeq) != 0)
1806 // It's a 4-byte supplementary sequence
1807 if ((ch & (FinalByte >> 6)) != 0)
1809 // this is 3rd byte of 4 byte sequence - nothing to do
1813 // 2nd byte of 4 bytes
1814 // check for non-shortest form of surrogate and the valid surrogate
1815 // range 0x000000 - 0x10FFFF at the same time
1816 if (!InRange(ch & 0x1F0, 0x10, 0x100))
1818 goto InvalidByteSequence;
1823 // Must be 2nd byte of a 3-byte sequence
1824 // check for non-shortest form of 3 byte seq
1825 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1826 (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
1828 goto InvalidByteSequence;
1836 // surrogate in shortest form?
1837 // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1838 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
1840 // let the range check for the second char throw the exception
1841 if (pTarget < pAllocatedBufferEnd)
1843 *pTarget = (char)(((ch >> 10) & 0x7FF) +
1844 unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
1848 unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1854 InvalidByteSequence:
1855 // this code fragment should be close to the gotos referencing it
1856 // Have to do fallback for invalid bytes
// The fallback buffer is obtained lazily: a fresh one from this.decoderFallback when there
// is no decoder, otherwise the decoder's own buffer.
1857 if (fallback == null)
1859 if (baseDecoder == null)
1860 fallback = this.decoderFallback.CreateFallbackBuffer();
1862 fallback = baseDecoder.FallbackBuffer;
1863 fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1865 // This'll back us up the appropriate # of bytes if we didn't get anywhere
1866 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
1867 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
1868 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
1869 pSrc = pSrcForFallback;
1870 pTarget = pTargetForFallback;
1872 if (!fallbackResult)
1874 // Ran out of buffer space
1875 // Need to throw an exception?
1876 Debug.Assert(pSrc >= bytes || pTarget == chars,
1877 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1878 fallback.InternalReset();
1879 ThrowCharsOverflow(baseDecoder, pTarget == chars);
1883 Debug.Assert(pSrc >= bytes,
1884 "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1895 // If it's > 0x7F, it's the start of a new multi-byte sequence
1897 // bit 6 has to be non-zero
1898 if ((ch & 0x40) == 0)
1900 goto InvalidByteSequence;
1903 // start a new long code
1904 if ((ch & 0x20) != 0)
1906 if ((ch & 0x10) != 0)
1908 // 4 byte encoding - supplementary character (2 surrogates)
1912 // check that bit 4 is zero and the valid supplementary character
1913 // range 0x000000 - 0x10FFFF at the same time
1917 goto InvalidByteSequence;
// Seed the state flags for a 4-byte sequence: FinalByte pre-shifted for three more folds,
// the bit 30/31 fixup values, and SupplimentarySeq replicated for every fold position.
1920 ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1921 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1922 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
// Seed the state flags for a 3-byte sequence: keep the low 4 payload bits of the lead byte.
1927 ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1928 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1937 // check for non-shortest form
1941 goto InvalidByteSequence;
1944 ch |= (FinalByte >> 6);
1950 // write the pending character
1951 if (pTarget >= pAllocatedBufferEnd)
1953 // Fix chars so we make sure to throw if we didn't output anything
1959 if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1960 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1962 pSrc--; // It was 4 bytes
1963 pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
1965 else if (ch > 0xffff)
1967 pSrc--; // It was 4 bytes, nothing was stored
1969 pSrc--; // It was at least 3 bytes
1971 pSrc--; // It was at least 2 bytes
1975 // Throw that we don't have enough room (pSrc could be < bytes if we had started to process
1976 // a 4 byte sequence already)
1977 Debug.Assert(pSrc >= bytes || pTarget == chars,
1978 "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1979 ThrowCharsOverflow(baseDecoder, pTarget == chars);
1981 // Don't store ch in decoder, we already backed up to its start
1984 // Didn't throw, just use this buffer size.
1987 *pTarget = (char)ch;
1991 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1992 int availableBytes = PtrDiff(pEnd, pSrc);
1994 // don't fall into the fast decoding loop if we don't have enough bytes
1995 // Test for availableChars is done because pStop would be <= pTarget.
1996 if (availableBytes <= 13)
1998 // we may need as many as 1 character per byte
1999 if (availableChars < availableBytes)
2001 // not enough output room. no pending bits at this point
2006 // try to get over the remainder of the ascii characters fast though
2007 byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2008 while (pSrc < pLocalEnd)
2016 *pTarget = (char)ch;
2024 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
2025 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
2026 if (availableChars < availableBytes)
2028 availableBytes = availableChars;
2031 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2032 // the boundary will be decreased for every non-ASCII character we encounter
2033 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
2034 char* pStop = pTarget + availableBytes - 7;
2036 while (pTarget < pStop)
2045 *pTarget = (char)ch;
2048 // get pSrc to be 2-byte aligned
2049 if ((unchecked((int)pSrc) & 0x1) != 0)
2057 *pTarget = (char)ch;
2061 // get pSrc to be 4-byte aligned
2062 if ((unchecked((int)pSrc) & 0x2) != 0)
2064 ch = *(ushort*)pSrc;
// 0x8080 tests the high bit of both bytes just read: any set bit means non-ASCII.
2065 if ((ch & 0x8080) != 0)
2067 goto LongCodeWithMask16;
2070 // Unfortunately, this is endianness sensitive
2072 *pTarget = (char)((ch >> 8) & 0x7F);
2074 *(pTarget+1) = (char)(ch & 0x7F);
2077 *pTarget = (char)(ch & 0x7F);
2079 *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2084 // Run 8 characters at a time!
2085 while (pTarget < pStop)
2088 int chb = *(int*)(pSrc + 4);
// 0x80808080 tests the high bit of all eight bytes held in ch and chb at once.
2089 if (((ch | chb) & unchecked((int)0x80808080)) != 0)
2091 goto LongCodeWithMask32;
2094 // Unfortunately, this is endianness sensitive
2096 *pTarget = (char)((ch >> 24) & 0x7F);
2097 *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2098 *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2099 *(pTarget+3) = (char)(ch & 0x7F);
2101 *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2102 *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2103 *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2104 *(pTarget+7) = (char)(chb & 0x7F);
2107 *pTarget = (char)(ch & 0x7F);
2108 *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2109 *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
2110 *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
2112 *(pTarget + 4) = (char)(chb & 0x7F);
2113 *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
2114 *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
2115 *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
2123 // be careful about the sign extension
2124 ch = (int)(((uint)ch) >> 16);
2126 ch = (int)(((uint)ch) >> 8);
2135 *pTarget = (char)ch;
2145 // bit 6 has to be zero
2147 // we are expecting to see trailing bytes like 10vvvvvv
2148 (chc & unchecked((sbyte)0xC0)) != 0x80)
2155 // start a new long code
2156 if ((ch & 0x20) != 0)
2158 // fold the first two bytes together
2159 chc |= (ch & 0x0F) << 6;
2161 if ((ch & 0x10) != 0)
2163 // 4 byte encoding - surrogate
2166 // check that bit 4 is zero, the non-shortest form of surrogate
2167 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2168 !InRange(chc >> 4, 0x01, 0x10) ||
2169 // we are expecting to see trailing bytes like 10vvvvvv
2170 (ch & unchecked((sbyte)0xC0)) != 0x80)
2175 chc = (chc << 6) | (ch & 0x3F);
2178 // we are expecting to see trailing bytes like 10vvvvvv
2179 if ((ch & unchecked((sbyte)0xC0)) != 0x80)
2185 ch = (chc << 6) | (ch & 0x3F);
2187 *pTarget = (char)(((ch >> 10) & 0x7FF) +
2188 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
2192 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2194 // extra byte, we're already planning 2 chars for 2 of these bytes,
2195 // but the big loop is testing the target against pStop, so we need
2196 // to subtract 2 more or we risk overrunning the input. Subtract
2197 // one here and one below.
2205 // check for non-shortest form of 3 byte seq
2206 (chc & (0x1F << 5)) == 0 ||
2207 // Can't have surrogates here.
2208 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2209 // we are expecting to see trailing bytes like 10vvvvvv
2210 (ch & unchecked((sbyte)0xC0)) != 0x80)
2216 ch = (chc << 6) | (ch & 0x3F);
2218 // extra byte, we're only expecting 1 char for each of these 3 bytes,
2219 // but the loop is testing the target (not source) against pStop, so
2220 // we need to subtract 2 more or we risk overrunning the input.
2221 // Subtract 1 here and one more below
2231 // check for non-shortest form
2236 ch = (ch << 6) | chc;
2239 *pTarget = (char)ch;
2242 // extra byte, we're only expecting 1 char for each of these 2 bytes,
2243 // but the loop is testing the target (not source) against pStop.
2244 // subtract an extra count from pStop so that we don't overrun the input.
2249 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2251 // no pending bits at this point
// Leftover state with no decoder, or with a decoder that must flush, cannot wait for more
// input, so it is reported through the fallback.
2261 if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2263 // Have to do fallback for invalid bytes
2264 if (fallback == null)
2266 if (baseDecoder == null)
2267 fallback = this.decoderFallback.CreateFallbackBuffer();
2269 fallback = baseDecoder.FallbackBuffer;
2270 fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2273 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2274 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
2275 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
2276 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
2277 pSrc = pSrcForFallback;
2278 pTarget = pTargetForFallback;
2280 if (!fallbackResult)
2282 Debug.Assert(pSrc >= bytes || pTarget == chars,
2283 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2285 // Ran out of buffer space
2286 // Need to throw an exception?
2287 fallback.InternalReset();
2288 ThrowCharsOverflow(baseDecoder, pTarget == chars);
2290 Debug.Assert(pSrc >= bytes,
2291 "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2295 if (baseDecoder != null)
2297 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2299 // If we're storing flush data we expect all bits to be used or else
2300 // we're stuck in the middle of a conversion
2301 Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
2302 "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2304 // Remember our leftover bits.
// Report how many input bytes were consumed.
2307 baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
2310 // Shouldn't have anything in fallback buffer for GetChars
2311 // (don't have to check m_throwOnOverflow for chars)
2312 Debug.Assert(fallback == null || fallback.Remaining == 0,
2313 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2315 return PtrDiff(pTarget, chars);
2318 // During GetChars we had an invalid byte sequence
2319 // pSrc is backed up to the start of the bad sequence if we didn't have room to
2320 // fall it back. Otherwise pSrc remains where it is.
// GetChars overload: runs the decoder fallback for one invalid byte sequence.
//   pSrc     - current read position; a copy (pStart) is handed to GetBytesUnknown by ref.
//   ch       - decoder state word describing the partially decoded sequence.
//   fallback - fallback buffer that receives the rebuilt bytes.
//   pTarget  - output write cursor, passed by ref to fallback.InternalFallback.
// GetChars treats a false result as "ran out of output space".
// NOTE(review): the statements that follow a failed InternalFallback call (and the return
// statements) are not among the lines shown here - confirm against the full method.
2321 private unsafe bool FallbackInvalidByteSequence(
2322 ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2325 byte* pStart = pSrc;
2326 byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2328 // Do the actual fallback
2329 if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2331 // Oops, it failed, back up to pStart
2340 // During GetCharCount we had an invalid byte sequence
2341 // pSrc is used to find the index that points to the invalid bytes,
2342 // however the byte[] contains the fallback bytes (in case the index is -1)
// GetCharCount overload: runs the decoder fallback for one invalid byte sequence and
// obtains the number of chars the fallback would produce.
//   pSrc     - current read position; passed by ref to GetBytesUnknown (local copy only,
//              since pSrc is a by-value parameter here).
//   ch       - decoder state word describing the partially decoded sequence.
//   fallback - fallback buffer that receives the rebuilt bytes.
// NOTE(review): the return statement is not among the lines shown here; presumably the
// method returns "count" - confirm.
2343 private unsafe int FallbackInvalidByteSequence(
2344 byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2347 byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2349 // Do the actual fallback
2350 int count = fallback.InternalFallback(bytesUnknown, pSrc);
2352 // # of fallback chars expected.
2353 // Note that we only get here for "long" sequences, and have already unreserved
2354 // the count that we prereserved for the input bytes
2358 // Note that some of these bytes may have come from a previous fallback, so we cannot
2359 // just decrement the pointer and use the values we read. In those cases we have
2360 // to regenerate the original values.
// Rebuilds, from the decoder state word "ch", the raw UTF-8 bytes that were consumed for
// the sequence now being reported as invalid, and returns them as a new array.
//   pSrc - read position, passed by ref.
//   ch   - decoder state: payload bits in the low bits plus the FinalByte /
//          SupplimentarySeq / ThreeByteSeq flags in the high bits.
// The flag bits select which case applies (single byte, 2-, 3- or 4-byte sequence and how
// many of its bytes were already folded in); the lead byte marker (0xC0 / 0xE0 / 0xF0) and
// the 0x80 trail markers are then OR-ed back onto the payload bits.
// NOTE(review): any adjustments made to pSrc are not among the lines shown here - confirm
// against the full method.
2361 private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2364 byte[] bytesUnknown = null;
2366 // See if it was a plain char
2367 // (have to check >= 0 because we have all sorts of weird bit flags)
2368 if (ch < 0x100 && ch >= 0)
2371 bytesUnknown = new byte[] { unchecked((byte)ch) };
2373 // See if it's an unfinished 2 byte sequence
2374 else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
2377 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2379 // So now we're either 2nd byte of 3 or 4 byte sequence or
2380 // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2381 // 1st check if it's a 4 byte sequence
2382 else if ((ch & SupplimentarySeq) != 0)
2384 // 3rd byte of 4 byte sequence?
2385 if ((ch & (FinalByte >> 6)) != 0)
2387 // 3rd byte of 4 byte sequence (three bytes are rebuilt)
2389 bytesUnknown = new byte[] {
2390 unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2391 unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2392 unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2394 else if ((ch & (FinalByte >> 12)) != 0)
2396 // 2nd byte of a 4 byte sequence (two bytes are rebuilt)
2398 bytesUnknown = new byte[] {
2399 unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2400 unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2404 // 1st byte of a 4 byte sequence (only the lead byte is rebuilt)
2406 bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
2411 // 2nd byte of 3 byte sequence?
2412 if ((ch & (FinalByte >> 6)) != 0)
2414 // So it's the 2nd byte of a 3 byte sequence (two bytes are rebuilt)
2416 bytesUnknown = new byte[] {
2417 unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2421 // 1st byte of a 3 byte sequence (only the lead byte is rebuilt)
2423 bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
2427 return bytesUnknown;
// Returns a new UTF8Decoder constructed with this encoding instance.
2431 public override Decoder GetDecoder()
2433 return new UTF8Decoder(this);
// Returns a new UTF8Encoder constructed with this encoding instance.
2437 public override Encoder GetEncoder()
2439 return new UTF8Encoder(this);
// Returns an upper bound on the number of bytes needed to encode charCount chars.
// The arithmetic is done in a long so the overflow check at the end is meaningful.
// Throws ArgumentOutOfRangeException for an invalid charCount, or when the computed bound
// exceeds 0x7fffffff.
// NOTE(review): the condition guarding the first throw is not among the lines shown here;
// the NeedNonNegNum message suggests a negative-argument check - confirm.
2443 public override int GetMaxByteCount(int charCount)
2446 throw new ArgumentOutOfRangeException(nameof(charCount),
2447 SR.ArgumentOutOfRange_NeedNonNegNum);
2448 Contract.EndContractBlock();
2450 // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
2451 long byteCount = (long)charCount + 1;
// A fallback may replace one char with several, so scale by its maximum expansion.
2453 if (EncoderFallback.MaxCharCount > 1)
2454 byteCount *= EncoderFallback.MaxCharCount;
2456 // Max 3 bytes per char. (4 bytes per 2 chars for surrogates)
// NOTE(review): the statement applying this factor is not among the lines shown here - confirm.
2459 if (byteCount > 0x7fffffff)
2460 throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
2462 return (int)byteCount;
// Returns an upper bound on the number of chars produced by decoding byteCount bytes.
// The arithmetic is done in a long so the overflow check at the end is meaningful.
// Throws ArgumentOutOfRangeException for an invalid byteCount, or when the computed bound
// exceeds 0x7fffffff.
// NOTE(review): the condition guarding the first throw is not among the lines shown here;
// the NeedNonNegNum message suggests a negative-argument check - confirm.
2466 public override int GetMaxCharCount(int byteCount)
2469 throw new ArgumentOutOfRangeException(nameof(byteCount),
2470 SR.ArgumentOutOfRange_NeedNonNegNum);
2471 Contract.EndContractBlock();
2473 // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
2474 long charCount = ((long)byteCount + 1);
2476 // Non-shortest form would fall back, so get max count from fallback.
2477 // So would 11... followed by 11..., so you could fall back every byte
2478 if (DecoderFallback.MaxCharCount > 1)
2480 charCount *= DecoderFallback.MaxCharCount;
2483 if (charCount > 0x7fffffff)
2484 throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
2486 return (int)charCount;
// Returns the byte order mark for this encoding: the three bytes 0xEF 0xBB 0xBF when
// _emitUTF8Identifier is set, otherwise an empty array.
2490 public override byte[] GetPreamble()
2492 if (_emitUTF8Identifier)
2494 // Allocate new array to prevent users from modifying it.
2495 return new byte[3] { 0xEF, 0xBB, 0xBF };
2498 return Array.Empty<byte>();
// Two UTF8Encoding instances compare equal when they agree on _emitUTF8Identifier and
// their encoder and decoder fallbacks are equal.
// NOTE(review): "that" comes from an "as" cast and may be null; the guard for that case is
// not among the lines shown here - confirm.
2502 public override bool Equals(Object value)
2504 UTF8Encoding that = value as UTF8Encoding;
2507 return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
2508 (EncoderFallback.Equals(that.EncoderFallback)) &&
2509 (DecoderFallback.Equals(that.DecoderFallback));
// Hash built from the same inputs Equals compares (both fallbacks and _emitUTF8Identifier),
// plus the UTF8_CODEPAGE constant.
2515 public override int GetHashCode()
2517 //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
2518 return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
2519 UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
// Stateful encoder for UTF8Encoding. Its only state of its own is surrogateChar; Reset
// clears it (and resets the fallback buffer if one exists), and HasState reports whether
// it is non-zero.
2522 private sealed class UTF8Encoder : EncoderNLS, ISerializable
2524 // We must save a high surrogate value until the next call, looking
2525 // for a low surrogate value.
2526 internal int surrogateChar;
2528 public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2533 // ISerializable implementation
// Serialization is not supported: this always throws PlatformNotSupportedException.
2534 void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2536 throw new PlatformNotSupportedException();
// Drops any saved surrogate and resets the fallback buffer when one has been created.
2539 public override void Reset()
2542 this.surrogateChar = 0;
2543 if (m_fallbackBuffer != null)
2544 m_fallbackBuffer.Reset();
2547 // Anything left in our encoder?
2548 internal override bool HasState
2552 return (this.surrogateChar != 0);
// Stateful decoder for UTF8Encoding. HasState reports whether the saved state word "bits"
// is non-zero; Reset resets the fallback buffer if one exists.
// NOTE(review): the declaration of "bits" and the rest of Reset are not among the lines
// shown here - confirm against the full class.
2557 private sealed class UTF8Decoder : DecoderNLS, ISerializable
2559 // We'll need to remember the previous information. See the comments around definition
2560 // of FinalByte for details.
2563 public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2568 // Serialization constructor; not supported, always throws PlatformNotSupportedException
2569 internal UTF8Decoder(SerializationInfo info, StreamingContext context)
2571 throw new PlatformNotSupportedException();
2574 // ISerializable implementation; not supported, always throws PlatformNotSupportedException
2575 void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2577 throw new PlatformNotSupportedException();
2580 public override void Reset()
2583 if (m_fallbackBuffer != null)
2584 m_fallbackBuffer.Reset();
2587 // Anything left in our decoder?
2588 internal override bool HasState
2592 return (this.bits != 0);