Encoding code clean up (#12864)
[platform/upstream/coreclr.git] / src / mscorlib / shared / System / Text / UTF8Encoding.cs
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 // The worker functions in this file were optimized for performance. If you make changes
6 // you should take care to consider all of the interesting cases.
7
8 // The code of all worker functions in this file is written twice: once as a slow loop, and the
9 // second time as a fast loop. The slow loops handle all special cases, throw exceptions, etc.
10 // The fast loops attempt to blaze through as fast as possible with optimistic range checks,
11 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
12
13 // This define can be used to turn off the fast loops. Useful for finding whether
14 // the problem is fastloop-specific.
15 #define FASTLOOP
16
17 using System;
18 using System.Diagnostics;
19 using System.Diagnostics.Contracts;
20 using System.Globalization;
21
22 namespace System.Text
23 {
24     // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
25     // Unicode characters with variable numbers of bytes per character,
26     // optimized for the 128 ASCII characters (U+0000 to U+007F).  It's an efficient way
27     // of encoding US English in an internationalizable way.
28     //
29     // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
30     //
31     // The UTF-8 byte order mark is simply the Unicode byte order mark
32     // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
33     // used mostly to distinguish UTF-8 text from other encodings, and doesn't
34     // switch the byte orderings.
35
36     public class UTF8Encoding : Encoding
37     {
38         /*
39             bytes   bits    UTF-8 representation
40             -----   ----    -----------------------------------
41             1        7      0vvvvvvv
42             2       11      110vvvvv 10vvvvvv
43             3       16      1110vvvv 10vvvvvv 10vvvvvv
44             4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
45             -----   ----    -----------------------------------
46
47             Surrogate:
48             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
49         */
50
51         private const int UTF8_CODEPAGE = 65001;
52
53         // Sealed subclass; sealing allows for de-virtualization (see https://github.com/dotnet/coreclr/pull/9230)
54         internal sealed class UTF8EncodingSealed : UTF8Encoding
55         {
56             public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier)
57                 : base(encoderShouldEmitUTF8Identifier) { }
58         }
59
60         // Instance used by Encoding.UTF8 (lazy initialization): not created until a static member of the class is referenced
61         internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
62
63         // Yes, emitting U+FEFF as a UTF-8 identifier has made it into the standard.
64         // Set from the constructor's encoderShouldEmitUTF8Identifier argument (false until then).
65         private bool _emitUTF8Identifier;
66         // Set from throwOnInvalidBytes; SetDefaultFallbacks picks the exception fallbacks when it is true.
67         private bool _isThrowException;
68
69
70         // Default: no UTF-8 identifier is requested and throwOnInvalidBytes is false.
71         public UTF8Encoding()
72             : this(encoderShouldEmitUTF8Identifier: false) { }
73
74         // Chooses whether the UTF-8 identifier is requested; throwOnInvalidBytes stays false.
75         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
76             : this(encoderShouldEmitUTF8Identifier, throwOnInvalidBytes: false) { }
77
78         // Stores both options. When throwOnInvalidBytes is true the fallbacks are set again,
79         // which makes SetDefaultFallbacks pick the exception fallbacks.
80         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
81             : base(UTF8_CODEPAGE)
82         {
83             _isThrowException = throwOnInvalidBytes;
84             _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
85
86             // Encoding's constructor already set the fallbacks, but they'll be wrong
87             // if we're throwing exceptions, so set them again in that case.
88             if (throwOnInvalidBytes)
89                 SetDefaultFallbacks();
90         }
91
92         internal override void SetDefaultFallbacks()
93         {
94             // Without throwOnInvalidBytes, both directions use a replacement fallback whose
95             // replacement string is U+FFFD; otherwise both use the exception fallback.
96             if (!_isThrowException)
97             {
98                 encoderFallback = new EncoderReplacementFallback("\xFFFD");
99                 decoderFallback = new DecoderReplacementFallback("\xFFFD");
100                 return;
101             }
102
103             encoderFallback = EncoderFallback.ExceptionFallback;
104             decoderFallback = DecoderFallback.ExceptionFallback;
105         }
106
107
108         // WARNING: GetByteCount(string chars)
109         // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
110         // WARNING: otherwise it'll break VB's way of declaring these.
111         //
112         // The following methods are copied from EncodingNLS.cs.
113         // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
114         // These should be kept in sync for the following classes:
115         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
116
117         // Returns the number of bytes required to encode count characters of the
118         // character array chars, starting at index.
119         // Throws ArgumentNullException when chars is null, and ArgumentOutOfRangeException
120         // when index or count is negative or the range runs past the end of the array.
121         // Keep in sync with the equivalent method in the other public encodings:
122         // EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
123         // parent method is safe
124
125         public override unsafe int GetByteCount(char[] chars, int index, int count)
126         {
127             if (chars == null)
128                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
129
130             if (index < 0)
131                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
132             if (count < 0)
133                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
134
135             if (count > chars.Length - index)
136                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
137             Contract.EndContractBlock();
138
139             // An empty range needs no bytes; returning here also avoids the fixed empty array problem
140             if (count == 0)
141                 return 0;
142
143             fixed (char* pChars = chars)
144                 return GetByteCount(pChars + index, count, null);
145         }
146
147         // Returns the number of bytes required to encode every character of the string chars.
148         // Throws ArgumentNullException (ParamName "chars") when chars is null.
149         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
150         // parent method is safe
151
152         public override unsafe int GetByteCount(String chars)
153         {
154             // Validate input; the exception must name this method's own parameter, which is "chars"
155             if (chars == null)
156                 throw new ArgumentNullException("chars");
157             Contract.EndContractBlock();
158
159             fixed (char* pChars = chars)
160                 return GetByteCount(pChars, chars.Length, null);
161         }
162
163         // Counts the bytes needed to encode count characters starting at the pointer chars.
164         // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException for a negative count.
165         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
166
167         [CLSCompliant(false)]
168         public override unsafe int GetByteCount(char* chars, int count)
169         {
170             if (chars == null)
171                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
172
173             if (count < 0)
174                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
175             Contract.EndContractBlock();
176
177             // Call the worker overload with an empty (null) encoder
178             EncoderNLS noEncoder = null;
179             return GetByteCount(chars, count, noEncoder);
180         }
181
182         // Encodes charCount characters of the string s, starting at charIndex, into bytes
183         // starting at byteIndex, by handing pinned pointers to the pointer-based GetBytes
184         // overload and returning its result.
185         // Parent method is safe. Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
186
187         public override unsafe int GetBytes(String s, int charIndex, int charCount,
188                                               byte[] bytes, int byteIndex)
189         {
190             if (s == null)
191                 throw new ArgumentNullException("s", SR.ArgumentNull_Array);
192             if (bytes == null)
193                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
194
195             if (charIndex < 0)
196                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
197             if (charCount < 0)
198                 throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
199
200             if (charCount > s.Length - charIndex)
201                 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
202             if (byteIndex < 0 || byteIndex > bytes.Length)
203                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
204             Contract.EndContractBlock();
205
206             // Fixed doesn't like 0 length arrays, so pin a one-byte stand-in; byteCount still comes from the real array.
207             byte[] pinnable = bytes.Length == 0 ? new byte[1] : bytes;
208             int byteCount = bytes.Length - byteIndex;
209             fixed (char* pChars = s) fixed (byte* pBytes = &pinnable[0])
210                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
211         }
212
213         // Encodes charCount characters of chars, starting at charIndex, into bytes
214         // starting at byteIndex. An exception occurs if the byte array is not large
215         // enough to hold the complete encoding of the characters.
216         //
217         // GetByteCount can be used to determine the exact number of bytes produced for
218         // a given range of characters; GetMaxByteCount gives the maximum for a given
219         // number of characters, regardless of the actual character values.
220         //
221         // Keep in sync with the equivalent method in the other public encodings:
222         // EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
223         // parent method is safe
224
225         public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
226                                                byte[] bytes, int byteIndex)
227         {
228             if (chars == null)
229                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
230             if (bytes == null)
231                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
232
233             if (charIndex < 0)
234                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
235             if (charCount < 0)
236                 throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
237
238             if (charCount > chars.Length - charIndex)
239                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
240
241             if (byteIndex < 0 || byteIndex > bytes.Length)
242                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
243             Contract.EndContractBlock();
244
245             // Nothing to encode: return 0, which also avoids the fixed problem with empty arrays
246             if (charCount == 0)
247                 return 0;
248
249             // Room left from byteIndex on, measured on the caller's array (not the stand-in below)
250             int byteCount = bytes.Length - byteIndex;
251
252             // Fixed doesn't like 0 length arrays, so pin a one-byte stand-in instead.
253             byte[] pinnable = bytes.Length == 0 ? new byte[1] : bytes;
254
255             fixed (char* pChars = chars) fixed (byte* pBytes = &pinnable[0])
256                 // byteCount is the space available from byteIndex on, not the size of the array.
257                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
258         }
259
260         // Pointer-based GetBytes: validates the arguments, then calls the worker overload with a null encoder.
261         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
262
263         [CLSCompliant(false)]
264         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
265         {
266             if (bytes == null)
267                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
268             if (chars == null)
269                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
270
271             if (charCount < 0 || byteCount < 0)
272                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
273             Contract.EndContractBlock();
274
275             return GetBytes(chars, charCount, bytes, byteCount, null);
276         }
277
278         // Returns the number of characters produced by decoding count bytes of the
279         // byte array bytes, starting at index.
280         // Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException
281         // when index or count is negative or the range runs past the end of the array.
282         // Keep in sync with the equivalent method in the other public encodings:
283         // EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
284         // parent method is safe
285
286         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
287         {
288             if (bytes == null)
289                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
290
291             if (index < 0)
292                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
293             if (count < 0)
294                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
295
296             if (count > bytes.Length - index)
297                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
298             Contract.EndContractBlock();
299
300             // No input means no characters; fixed doesn't like 0 length arrays anyway.
301             if (count == 0)
302                 return 0;
303
304             fixed (byte* pBytes = bytes)
305                 return GetCharCount(pBytes + index, count, null);
306         }
307
308         // Pointer-based GetCharCount for count bytes starting at bytes.
309         // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException for a negative count.
310         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
311
312         [CLSCompliant(false)]
313         public override unsafe int GetCharCount(byte* bytes, int count)
314         {
315             if (bytes == null)
316                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
317
318             if (count < 0)
319                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
320             Contract.EndContractBlock();
321
322             // The worker overload is called without a decoder (null)
323             return GetCharCount(bytes, count, null);
324         }
325
326         // Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at
327         // charIndex, by handing pinned pointers to the pointer-based GetChars overload.
328         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
329         // parent method is safe
330
331         public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
332                                               char[] chars, int charIndex)
333         {
334             if (bytes == null)
335                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
336             if (chars == null)
337                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
338
339             if (byteIndex < 0)
340                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
341             if (byteCount < 0)
342                 throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
343
344             if (byteCount > bytes.Length - byteIndex)
345                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
346
347             if (charIndex < 0 || charIndex > chars.Length)
348                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
349             Contract.EndContractBlock();
350
351             // No input: return 0, which also avoids the fixed problem with empty arrays
352             if (byteCount == 0)
353                 return 0;
354
355             // Room left from charIndex on; fixed doesn't like 0 length arrays, so pin a one-char stand-in.
356             int charCount = chars.Length - charIndex;
357             char[] pinnable = chars.Length == 0 ? new char[1] : chars;
358
359             fixed (byte* pBytes = bytes) fixed (char* pChars = &pinnable[0])
360                 // charCount is the space available from charIndex on, not the size of the array
361                 return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
362         }
363
364         // Pointer-based GetChars: validates the arguments, then calls the worker overload with a null decoder.
365         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
366
367         [CLSCompliant(false)]
368         public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
369         {
370             if (bytes == null)
371                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
372             if (chars == null)
373                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
374
375             if (charCount < 0 || byteCount < 0)
376                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
377             Contract.EndContractBlock();
378
379             return GetChars(bytes, byteCount, chars, charCount, null);
380         }
381
382         // Returns a string containing the decoded representation of count bytes of
383         // the byte array bytes, starting at index. An empty range gives String.Empty.
384         // Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException
385         // when index or count is negative or the range runs past the end of the array.
386         // Keep in sync with: EncodingNLS, UTF7Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
387         // parent method is safe
388
389         public override unsafe String GetString(byte[] bytes, int index, int count)
390         {
391             if (bytes == null)
392                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
393
394             if (index < 0)
395                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
396             if (count < 0)
397                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
398
399             if (count > bytes.Length - index)
400                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
401             Contract.EndContractBlock();
402
403             // Avoid problems with an empty input buffer
404             if (count == 0)
405                 return String.Empty;
406
407             fixed (byte* pBytes = bytes)
408                 return String.CreateStringFromEncoding(pBytes + index, count, this);
409         }
410
411         //
412         // End of standard methods copied from EncodingNLS.cs
413         //
414
415         // To simplify maintenance, the structure of GetByteCount and GetBytes should be
416         // kept the same as much as possible
417         internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
418         {
419             // For fallback we may need a fallback buffer.
420             // We wait to initialize it though in case we don't have any broken input unicode
421             EncoderFallbackBuffer fallbackBuffer = null;
422             char* pSrcForFallback;
423
424             char* pSrc = chars;
425             char* pEnd = pSrc + count;
426
427             // Start by assuming we have as many as count
428             int byteCount = count;
429
430             int ch = 0;
431
432             if (baseEncoder != null)
433             {
434                 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
435                 ch = encoder.surrogateChar;
436
437                 // We mustn't have left over fallback data when counting
438                 if (encoder.InternalHasFallbackBuffer)
439                 {
440                     fallbackBuffer = encoder.FallbackBuffer;
441                     if (fallbackBuffer.Remaining > 0)
442                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
443
444                     // Set our internal fallback interesting things.
445                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
446                 }
447             }
448
449             for (;;)
450             {
451                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
452                 if (pSrc >= pEnd)
453                 {
454                     if (ch == 0)
455                     {
456                         // Unroll any fallback that happens at the end
457                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
458                         if (ch > 0)
459                         {
460                             byteCount++;
461                             goto ProcessChar;
462                         }
463                     }
464                     else
465                     {
466                         // Case of surrogates in the fallback.
467                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
468                         {
469                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
470                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
471
472                             ch = fallbackBuffer.InternalGetNextChar();
473                             byteCount++;
474
475                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
476                             {
477                                 ch = 0xfffd;
478                                 byteCount++;
479                                 goto EncodeChar;
480                             }
481                             else if (ch > 0)
482                             {
483                                 goto ProcessChar;
484                             }
485                             else
486                             {
487                                 byteCount--; // ignore last one.
488                                 break;
489                             }
490                         }
491                     }
492
493                     if (ch <= 0)
494                     {
495                         break;
496                     }
497                     if (baseEncoder != null && !baseEncoder.MustFlush)
498                     {
499                         break;
500                     }
501
502                     // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
503                     byteCount++;
504                     goto EncodeChar;
505                 }
506
507                 if (ch > 0)
508                 {
509                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
510                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
511
512                     // use separate helper variables for local contexts so that the jit optimizations
513                     // won't get confused about the variable lifetimes
514                     int cha = *pSrc;
515
516                     // count the pending surrogate
517                     byteCount++;
518
519                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
520                     // if (IsLowSurrogate(cha)) {
521                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
522                     {
523                         // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
524                         ch = 0xfffd;
525                         //                        ch = cha + (ch << 10) +
526                         //                            (0x10000
527                         //                            - CharUnicodeInfo.LOW_SURROGATE_START
528                         //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
529
530                         // Use this next char
531                         pSrc++;
532                     }
533                     // else ch is still high surrogate and encoding will fail (so don't add count)
534
535                     // attempt to encode the surrogate or partial surrogate
536                     goto EncodeChar;
537                 }
538
539                 // If we've used a fallback, then we have to check for it
540                 if (fallbackBuffer != null)
541                 {
542                     ch = fallbackBuffer.InternalGetNextChar();
543                     if (ch > 0)
544                     {
545                         // We have an extra byte we weren't expecting.
546                         byteCount++;
547                         goto ProcessChar;
548                     }
549                 }
550
551                 // read next char. The JIT optimization seems to be getting confused when
552                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
553                 ch = *pSrc;
554                 pSrc++;
555
556             ProcessChar:
557                 // if (IsHighSurrogate(ch)) {
558                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
559                 {
560                     // we will count this surrogate next time around
561                     byteCount--;
562                     continue;
563                 }
564             // either good char or partial surrogate
565
566             EncodeChar:
567                 // throw exception on partial surrogate if necessary
568                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
569                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
570                 {
571                     // Lone surrogates aren't allowed
572                     // Have to make a fallback buffer if we don't have one
573                     if (fallbackBuffer == null)
574                     {
575                         // wait on fallbacks if we can
576                         // For fallback we may need a fallback buffer
577                         if (baseEncoder == null)
578                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
579                         else
580                             fallbackBuffer = baseEncoder.FallbackBuffer;
581
582                         // Set our internal fallback interesting things.
583                         fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
584                     }
585
586                     // Do our fallback.  Actually we already know its a mixed up surrogate,
587                     // so the ref pSrc isn't gonna do anything.
588                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
589                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
590                     pSrc = pSrcForFallback;
591
592                     // Ignore it if we don't throw (we had preallocated this ch)
593                     byteCount--;
594                     ch = 0;
595                     continue;
596                 }
597
598                 // Count them
599                 if (ch > 0x7F)
600                 {
601                     if (ch > 0x7FF)
602                     {
603                         // the extra surrogate byte was compensated by the second surrogate character
604                         // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
605                         byteCount++;
606                     }
607                     byteCount++;
608                 }
609
610 #if BIT64
611                 // check for overflow
612                 if (byteCount < 0)
613                 {
614                     break;
615                 }
616 #endif
617
618 #if FASTLOOP
619                 // If still have fallback don't do fast loop
620                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
621                 {
622                     // We're reserving 1 byte for each char by default
623                     byteCount++;
624                     goto ProcessChar;
625                 }
626
627                 int availableChars = PtrDiff(pEnd, pSrc);
628
629                 // don't fall into the fast decoding loop if we don't have enough characters
630                 if (availableChars <= 13)
631                 {
632                     // try to get over the remainder of the ascii characters fast though
633                     char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
634                     while (pSrc < pLocalEnd)
635                     {
636                         ch = *pSrc;
637                         pSrc++;
638                         if (ch > 0x7F)
639                             goto ProcessChar;
640                     }
641
642                     // we are done
643                     break;
644                 }
645
646 #if BIT64
647                 // make sure that we won't get a silent overflow inside the fast loop
648                 // (Fall out to slow loop if we have this many characters)
649                 availableChars &= 0x0FFFFFFF;
650 #endif
651
652                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
653                 //  the boundary will be decreased for every non-ASCII character we encounter
654                 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
655                 char* pStop = pSrc + availableChars - (3 + 4);
656
657                 while (pSrc < pStop)
658                 {
659                     ch = *pSrc;
660                     pSrc++;
661
662                     if (ch > 0x7F)                                                  // Not ASCII
663                     {
664                         if (ch > 0x7FF)                                             // Not 2 Byte
665                         {
666                             if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
667                                 goto LongCode;
668                             byteCount++;
669                         }
670                         byteCount++;
671                     }
672
673                     // get pSrc aligned
674                     if ((unchecked((int)pSrc) & 0x2) != 0)
675                     {
676                         ch = *pSrc;
677                         pSrc++;
678                         if (ch > 0x7F)                                              // Not ASCII
679                         {
680                             if (ch > 0x7FF)                                         // Not 2 Byte
681                             {
682                                 if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
683                                     goto LongCode;
684                                 byteCount++;
685                             }
686                             byteCount++;
687                         }
688                     }
689
690                     // Run 2 * 4 characters at a time!
691                     while (pSrc < pStop)
692                     {
693                         ch = *(int*)pSrc;
694                         int chc = *(int*)(pSrc + 2);
695                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
696                         {
697                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
698                             {
699                                 goto LongCodeWithMask;
700                             }
701
702
703                             if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
704                                 byteCount++;
705                             if ((ch & unchecked((int)0xFF80)) != 0)
706                                 byteCount++;
707                             if ((chc & unchecked((int)0xFF800000)) != 0)
708                                 byteCount++;
709                             if ((chc & unchecked((int)0xFF80)) != 0)
710                                 byteCount++;
711                         }
712                         pSrc += 4;
713
714                         ch = *(int*)pSrc;
715                         chc = *(int*)(pSrc + 2);
716                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
717                         {
718                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
719                             {
720                                 goto LongCodeWithMask;
721                             }
722
723                             if ((ch & unchecked((int)0xFF800000)) != 0)
724                                 byteCount++;
725                             if ((ch & unchecked((int)0xFF80)) != 0)
726                                 byteCount++;
727                             if ((chc & unchecked((int)0xFF800000)) != 0)
728                                 byteCount++;
729                             if ((chc & unchecked((int)0xFF80)) != 0)
730                                 byteCount++;
731                         }
732                         pSrc += 4;
733                     }
734                     break;
735
736                 LongCodeWithMask:
737 #if BIGENDIAN
738                     // be careful about the sign extension
739                     ch = (int)(((uint)ch) >> 16);
740 #else // BIGENDIAN
741                     ch = (char)ch;
742 #endif // BIGENDIAN
743                     pSrc++;
744
745                     if (ch <= 0x7F)
746                     {
747                         continue;
748                     }
749
750                 LongCode:
751                     // use separate helper variables for slow and fast loop so that the jit optimizations
752                     // won't get confused about the variable lifetimes
753                     if (ch > 0x7FF)
754                     {
755                         // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
756                         if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
757                         {
758                             // 4 byte encoding - high surrogate + low surrogate
759
760                             int chd = *pSrc;
761                             if (
762                                 // !IsHighSurrogate(ch) // low without high -> bad
763                                 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
764                                 // !IsLowSurrogate(chd) // high not followed by low -> bad
765                                 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
766                             {
767                                 // Back up and drop out to slow loop to figure out error
768                                 pSrc--;
769                                 break;
770                             }
771                             pSrc++;
772
773                             // byteCount - this byte is compensated by the second surrogate character
774                         }
775                         byteCount++;
776                     }
777                     byteCount++;
778
779                     // byteCount - the last byte is already included
780                 }
781 #endif // FASTLOOP
782
783                 // no pending char at this point
784                 ch = 0;
785             }
786
787 #if BIT64
788             // check for overflow
789             if (byteCount < 0)
790             {
791                 throw new ArgumentException(
792                         SR.Argument_ConversionOverflow);
793             }
794 #endif
795
796             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
797                 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
798
799             return byteCount;
800         }
801
802         // Number of chars from b up to a. The byte distance is narrowed to an unsigned 32-bit value and
803         // halved; unsigned arithmetic is good enough for us and tends to generate better code than the default signed form.
804         unsafe private static int PtrDiff(char* a, char* b)
805         {
806             uint byteDistance = (uint)((byte*)a - (byte*)b);
807             return (int)(byteDistance >> 1);
808         }
809
810         unsafe private static int PtrDiff(byte* a, byte* b)
811         {
812             long byteDistance = a - b;      // byte* flavor of the char* overload, kept just for parity
813             return (int)byteDistance;
814         }
815
816         private static bool InRange(int ch, int start, int end)
817         {
818             return (uint)(end - start) >= (uint)(ch - start);   // one unsigned compare covers both bounds: ch - start wraps high when ch < start
819         }
820
821         // Our workhorse: encodes charCount chars from chars as UTF-8 into the byteCount-byte buffer at bytes and returns the number of bytes written.
822         // Note:  We ignore mismatched surrogates (they are routed through the fallback buffer), unless the exception flag is set in which case we throw
823         internal override unsafe int GetBytes(char* chars, int charCount,
824                                                 byte* bytes, int byteCount, EncoderNLS baseEncoder)
825         {
826             Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
827             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
828             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
829             Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
830
831             UTF8Encoder encoder = null;
832
833             // For fallback we may need a fallback buffer.
834             // We wait to initialize it though in case we don't have any broken input unicode
835             EncoderFallbackBuffer fallbackBuffer = null;
836             char* pSrcForFallback;
837
838             char* pSrc = chars;
839             byte* pTarget = bytes;
840
841             char* pEnd = pSrc + charCount;
842             byte* pAllocatedBufferEnd = pTarget + byteCount;
843
844             int ch = 0;
845
846             // assume that JIT will en-register pSrc, pTarget and ch
847
848             if (baseEncoder != null)
849             {
850                 encoder = (UTF8Encoder)baseEncoder;
851                 ch = encoder.surrogateChar;     // resume with whatever an earlier call stored here (written back at the end of this method)
852
853                 // Left over fallback data is only an error when the encoder throws on overflow (checked below)
854                 if (encoder.InternalHasFallbackBuffer)
855                 {
856                     // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
857                     fallbackBuffer = encoder.FallbackBuffer;
858                     if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
859                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
860
861                     // Set our internal fallback interesting things.
862                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
863                 }
864             }
865
866             for (;;)
867             {
868                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
869
870                 if (pSrc >= pEnd)
871                 {
872                     if (ch == 0)
873                     {
874                         // Check if there's anything left to get out of the fallback buffer
875                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
876                         if (ch > 0)
877                         {
878                             goto ProcessChar;
879                         }
880                     }
881                     else
882                     {
883                         // Case of leftover surrogates in the fallback buffer
884                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
885                         {
886                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
887                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
888
889                             int cha = ch;
890
891                             ch = fallbackBuffer.InternalGetNextChar();
892
893                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
894                             {
895                                 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
896                                 goto EncodeChar;
897                             }
898                             else if (ch > 0)
899                             {
900                                 goto ProcessChar;
901                             }
902                             else
903                             {
904                                 break;
905                             }
906                         }
907                     }
908
909                     // attempt to encode the partial surrogate (will fail or ignore)
910                     if (ch > 0 && (encoder == null || encoder.MustFlush))
911                         goto EncodeChar;
912
913                     // We're done
914                     break;
915                 }
916
917                 if (ch > 0)
918                 {
919                     // We have a high surrogate left over from a previous loop.
920                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
921                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
922
923                     // use separate helper variables for local contexts so that the jit optimizations
924                     // won't get confused about the variable lifetimes
925                     int cha = *pSrc;
926
927                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
928                     // if (IsLowSurrogate(cha)) {
929                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
930                     {
931                         ch = cha + (ch << 10) +
932                             (0x10000
933                             - CharUnicodeInfo.LOW_SURROGATE_START
934                             - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
935
936                         pSrc++;
937                     }
938                     // else ch is still high surrogate and encoding will fail
939
940                     // attempt to encode the surrogate or partial surrogate
941                     goto EncodeChar;
942                 }
943
944                 // If we've used a fallback, then we have to check for it
945                 if (fallbackBuffer != null)
946                 {
947                     ch = fallbackBuffer.InternalGetNextChar();
948                     if (ch > 0) goto ProcessChar;
949                 }
950
951                 // read next char. The JIT optimization seems to be getting confused when
952                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
953                 ch = *pSrc;
954                 pSrc++;
955
956             ProcessChar:
957                 // if (IsHighSurrogate(ch)) {
958                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
959                 {
960                     continue;   // high surrogate: leave it pending in ch; the top of the loop looks for its low surrogate
961                 }
962             // either good char or partial surrogate
963
964             EncodeChar:
965                 // throw exception on partial surrogate if necessary
966                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
967                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
968                 {
969                     // Lone surrogates aren't allowed, we have to do fallback for them
970                     // Have to make a fallback buffer if we don't have one
971                     if (fallbackBuffer == null)
972                     {
973                         // wait on fallbacks if we can
974                         // For fallback we may need a fallback buffer
975                         if (baseEncoder == null)
976                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
977                         else
978                             fallbackBuffer = baseEncoder.FallbackBuffer;
979
980                         // Set our internal fallback interesting things.
981                         fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
982                     }
983
984                     // Do our fallback.  Actually we already know its a mixed up surrogate,
985                     // so the ref pSrc isn't gonna do anything.
986                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
987                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
988                     pSrc = pSrcForFallback;
989
990                     // Ignore it if we don't throw
991                     ch = 0;
992                     continue;
993                 }
994
995                 // Count bytes needed
996                 int bytesNeeded = 1;
997                 if (ch > 0x7F)
998                 {
999                     if (ch > 0x7FF)
1000                     {
1001                         if (ch > 0xFFFF)
1002                         {
1003                             bytesNeeded++;  // 4 bytes (surrogate pair)
1004                         }
1005                         bytesNeeded++;      // 3 bytes (800-FFFF)
1006                     }
1007                     bytesNeeded++;          // 2 bytes (80-7FF)
1008                 }
1009
1010                 if (pTarget > pAllocatedBufferEnd - bytesNeeded)    // this char would run past the end of the output buffer
1011                 {
1012                     // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1013                     if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1014                     {
1015                         fallbackBuffer.MovePrevious();              // Didn't use this fallback char
1016                         if (ch > 0xFFFF)
1017                             fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
1018                     }
1019                     else
1020                     {
1021                         pSrc--;                                     // Didn't use this char
1022                         if (ch > 0xFFFF)
1023                             pSrc--;                                 // Was surrogate, didn't use 2nd part either
1024                     }
1025                     Debug.Assert(pSrc >= chars || pTarget == bytes,
1026                         "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1027                     ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
1028                     ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
1029                     break;
1030                 }
1031
1032                 if (ch <= 0x7F)
1033                 {
1034                     *pTarget = (byte)ch;
1035                 }
1036                 else
1037                 {
1038                     // use separate helper variables for local contexts so that the jit optimizations
1039                     // won't get confused about the variable lifetimes
1040                     int chb;
1041                     if (ch <= 0x7FF)
1042                     {
1043                         // 2 byte encoding
1044                         chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1045                     }
1046                     else
1047                     {
1048                         if (ch <= 0xFFFF)
1049                         {
1050                             chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));  // 3 byte encoding
1051                         }
1052                         else
1053                         {
1054                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));     // 4 byte encoding: lead byte
1055                             pTarget++;
1056
1057                             chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1058                         }
1059                         *pTarget = (byte)chb;
1060                         pTarget++;
1061
1062                         chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1063                     }
1064                     *pTarget = (byte)chb;
1065                     pTarget++;
1066
1067                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1068                 }
1069                 pTarget++;
1070
1071
1072 #if FASTLOOP
1073                 // If still have fallback don't do fast loop
1074                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1075                     goto ProcessChar;
1076
1077                 int availableChars = PtrDiff(pEnd, pSrc);
1078                 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1079
1080                 // don't fall into the fast encoding loop if we don't have enough characters
1081                 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1082                 if (availableChars <= 13)
1083                 {
1084                     // we are hoping for 1 byte per char
1085                     if (availableBytes < availableChars)
1086                     {
1087                         // not enough output room.  no pending bits at this point
1088                         ch = 0;
1089                         continue;
1090                     }
1091
1092                     // try to get over the remainder of the ascii characters fast though
1093                     char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
1094                     while (pSrc < pLocalEnd)
1095                     {
1096                         ch = *pSrc;
1097                         pSrc++;
1098
1099                         // Not ASCII, need more than 1 byte per char
1100                         if (ch > 0x7F)
1101                             goto ProcessChar;
1102
1103                         *pTarget = (byte)ch;
1104                         pTarget++;
1105                     }
1106                     // we are done, let ch be 0 to clear encoder
1107                     ch = 0;
1108                     break;
1109                 }
1110
1111                 // we need at least 1 byte per character, but Convert might allow us to convert
1112                 // only part of the input, so try as much as we can.  Reduce charCount if necessary
1113                 if (availableBytes < availableChars)
1114                 {
1115                     availableChars = availableBytes;
1116                 }
1117
1118                 // FASTLOOP:
1119                 // - optimistic range checks
1120                 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1121
1122                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1123                 //  the boundary will be decreased for every non-ASCII character we encounter
1124                 // Also, we need 5 chars reserve for the unrolled ASCII encoding loop and for encoding of surrogates
1125                 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1126                 char* pStop = pSrc + availableChars - 5;
1127
1128                 while (pSrc < pStop)
1129                 {
1130                     ch = *pSrc;
1131                     pSrc++;
1132
1133                     if (ch > 0x7F)
1134                     {
1135                         goto LongCode;
1136                     }
1137                     *pTarget = (byte)ch;
1138                     pTarget++;
1139
1140                     // get pSrc aligned
1141                     if ((unchecked((int)pSrc) & 0x2) != 0)
1142                     {
1143                         ch = *pSrc;
1144                         pSrc++;
1145                         if (ch > 0x7F)
1146                         {
1147                             goto LongCode;
1148                         }
1149                         *pTarget = (byte)ch;
1150                         pTarget++;
1151                     }
1152
1153                     // Run 4 characters at a time!
1154                     while (pSrc < pStop)
1155                     {
1156                         ch = *(int*)pSrc;
1157                         int chc = *(int*)(pSrc + 2);
1158                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)     // at least one of the 4 chars is not ASCII
1159                         {
1160                             goto LongCodeWithMask;
1161                         }
1162
1163                         // Unfortunately, this is endianness sensitive
1164 #if BIGENDIAN
1165                         *pTarget = (byte)(ch>>16);
1166                         *(pTarget+1) = (byte)ch;
1167                         pSrc += 4;
1168                         *(pTarget+2) = (byte)(chc>>16);
1169                         *(pTarget+3) = (byte)chc;
1170                         pTarget += 4;
1171 #else // BIGENDIAN
1172                         *pTarget = (byte)ch;
1173                         *(pTarget + 1) = (byte)(ch >> 16);
1174                         pSrc += 4;
1175                         *(pTarget + 2) = (byte)chc;
1176                         *(pTarget + 3) = (byte)(chc >> 16);
1177                         pTarget += 4;
1178 #endif // BIGENDIAN
1179                     }
1180                     continue;
1181
1182                 LongCodeWithMask:   // ch holds two packed chars: pull out the one at pSrc and handle it on its own
1183 #if BIGENDIAN
1184                     // be careful about the sign extension
1185                     ch = (int)(((uint)ch) >> 16);
1186 #else // BIGENDIAN
1187                     ch = (char)ch;
1188 #endif // BIGENDIAN
1189                     pSrc++;
1190
1191                     if (ch > 0x7F)
1192                     {
1193                         goto LongCode;
1194                     }
1195                     *pTarget = (byte)ch;
1196                     pTarget++;
1197                     continue;
1198
1199                 LongCode:
1200                     // use separate helper variables for slow and fast loop so that the jit optimizations
1201                     // won't get confused about the variable lifetimes
1202                     int chd;
1203                     if (ch <= 0x7FF)
1204                     {
1205                         // 2 byte encoding
1206                         chd = unchecked((sbyte)0xC0) | (ch >> 6);
1207                     }
1208                     else
1209                     {
1210                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1211                         if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1212                         {
1213                             // 3 byte encoding
1214                             chd = unchecked((sbyte)0xE0) | (ch >> 12);
1215                         }
1216                         else
1217                         {
1218                             // 4 byte encoding - high surrogate + low surrogate
1219                             // if (!IsHighSurrogate(ch))
1220                             if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
1221                             {
1222                                 // low without high -> bad, try again in slow loop
1223                                 pSrc -= 1;
1224                                 break;
1225                             }
1226
1227                             chd = *pSrc;
1228                             pSrc++;
1229
1230                             // if (!IsLowSurrogate(chd)) {
1231                             if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1232                             {
1233                                 // high not followed by low -> bad, try again in slow loop
1234                                 pSrc -= 2;
1235                                 break;
1236                             }
1237
1238                             ch = chd + (ch << 10) +
1239                                 (0x10000
1240                                 - CharUnicodeInfo.LOW_SURROGATE_START
1241                                 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
1242
1243                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1244                             // pStop - this byte is compensated by the second surrogate character
1245                             // 2 input chars require 4 output bytes.  2 have been anticipated already
1246                             // and 2 more will be accounted for by the 2 pStop-- calls below.
1247                             pTarget++;
1248
1249                             chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1250                         }
1251                         *pTarget = (byte)chd;
1252                         pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1253                         pTarget++;
1254
1255                         chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1256                     }
1257                     *pTarget = (byte)chd;
1258                     pStop--;                        // 2 byte sequence for 1 char so need pStop--.
1259                     pTarget++;
1260
1261                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1262                     // pStop - this byte is already included
1263                     pTarget++;
1264                 }
1265
1266                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1267
1268 #endif // FASTLOOP
1269
1270                 // no pending char at this point
1271                 ch = 0;
1272             }
1273
1274             // Save the pending char and the number of chars consumed back into the encoder, if we have one
1275             if (encoder != null)
1276             {
1277                 Debug.Assert(!encoder.MustFlush || ch == 0,
1278                     "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1279
1280                 encoder.surrogateChar = ch;
1281                 encoder._charsUsed = (int)(pSrc - chars);
1282             }
1283
1284             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1285                 baseEncoder == null || !baseEncoder._throwOnOverflow,
1286                 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1287
1288             return (int)(pTarget - bytes);
1289         }
1290
1291
1292         // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
1293         // while the actual character is being built in the lower bits. They are shifted together
1294         // with the actual bits of the character.
1295
1296         // Decoder state flags; bits 30 & 31 are used for pending bits fixup
1297         private const int FinalByte = 0x20000000;           // bit 29
1298         private const int SupplimentarySeq = 0x10000000;    // bit 28
1299         private const int ThreeByteSeq = 0x08000000;        // bit 27
1300
1301         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1302         //        If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1303         //
1304         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1305         // kept the same as much as possible
1306         internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1307         {
                 // Counts the chars that decoding 'count' bytes of UTF-8 starting at 'bytes' would produce.
                 // Nothing is written; only the running total 'charCount' is maintained.
                 //
                 //   bytes       - start of the input (asserted non-null).
                 //   count       - number of input bytes (asserted non-negative).
                 //   baseDecoder - optional stateful decoder. When non-null, its pending 'bits' seed 'ch', its
                 //                 FallbackBuffer is used for invalid input, and a trailing incomplete sequence is
                 //                 only sent to the fallback if MustFlush is set. When null, this.decoderFallback
                 //                 supplies the fallback buffer and a trailing incomplete sequence always goes to it.
                 //
                 // Strategy: start from one char per byte (charCount = count) and subtract one for every byte that
                 // turns out not to yield a char of its own. Invalid sequences add whatever
                 // FallbackInvalidByteSequence returns (its behavior, including whether it throws, is defined
                 // outside this method).
1308             Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1309             Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
1310
1311             // Set up the read cursor and the end-of-input sentinel
1312             byte* pSrc = bytes;
1313             byte* pEnd = pSrc + count;
1314
1315             // Start by assuming we have as many as count, charCount always includes the adjustment
1316             // for the character being decoded
1317             int charCount = count;
                 // 'ch' accumulates the partially decoded sequence plus flag bits; 0 means nothing is pending
1318             int ch = 0;
1319             DecoderFallbackBuffer fallback = null;
1320
1321             if (baseDecoder != null)
1322             {
1323                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
                     // Resume from whatever partial sequence the decoder is carrying
1324                 ch = decoder.bits;
                     // ch >> 30 is the pending count correction that was seeded when the lead byte was read
                     // (see the 3-byte and 4-byte cases under ProcessChar).
1325                 charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
1326
1327                 // Shouldn't have anything in fallback buffer for GetCharCount
1328                 // (don't have to check _throwOnOverflow for count)
1329                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1330                     "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1331             }
1332
1333             for (;;)
1334             {
1335                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1336
1337                 if (pSrc >= pEnd)
1338                 {
                         // Input exhausted; any pending bits in ch are dealt with after the loop
1339                     break;
1340                 }
1341
1342                 if (ch == 0)
1343                 {
1344                     // no pending bits
1345                     goto ReadChar;
1346                 }
1347
1348                 // read next byte. The JIT optimization seems to be getting confused when
1349                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1350                 int cha = *pSrc;
1351                 pSrc++;
1352
1353                 // we are expecting to see trailing bytes like 10vvvvvv
1354                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1355                 {
1356                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1357                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1358                     pSrc--;
                         // Undo the count correction that was applied for the sequence we are abandoning
1359                     charCount += (ch >> 30);
1360                     goto InvalidByteSequence;
1361                 }
1362
1363                 // fold in the new byte
1364                 ch = (ch << 6) | (cha & 0x3F);
1365
                     // FinalByte (declared outside this method) is seeded pre-shifted at the lead byte so that it
                     // is set in ch only once the last trail byte has been folded in.
1366                 if ((ch & FinalByte) == 0)
1367                 {
                         // Not at the last byte yet. A 2-byte sequence finishes on its first trail byte, so only
                         // 3-byte and 4-byte sequences can reach this point.
                         // NOTE(review): the assert message below names GetChars and misspells "violation";
                         // left unchanged because it is runtime text.
1368                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1369                         "[UTF8Encoding.GetChars]Invariant volation");
1370
1371                     if ((ch & SupplimentarySeq) != 0)
1372                     {
1373                         if ((ch & (FinalByte >> 6)) != 0)
1374                         {
1375                             // this is 3rd byte (of 4 byte supplementary) - nothing to do
1376                             continue;
1377                         }
1378
1379                         // 2nd byte, check for non-shortest form of supplementary char and the valid
1380                         // supplementary characters in range 0x010000 - 0x10FFFF at the same time
1381                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1382                         {
1383                             goto InvalidByteSequence;
1384                         }
1385                     }
1386                     else
1387                     {
1388                         // Must be 2nd byte of a 3-byte sequence
1389                         // check for non-shortest form of 3 byte seq
1390                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1391                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1392                         {
1393                             goto InvalidByteSequence;
1394                         }
1395                     }
1396                     continue;
1397                 }
1398
1399                 // ready to punch
1400
1401                 // adjust for surrogates in non-shortest form
                     // (4-byte sequence flag set but code point bits 16-20 all zero: it yields one fewer char)
1402                 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
1403                 {
1404                     charCount--;
1405                 }
1406                 goto EncodeChar;
1407
1408             InvalidByteSequence:
1409                 // this code fragment should be close to the goto referencing it
1410                 // Have to do fallback for invalid bytes
1411                 if (fallback == null)
1412                 {
                         // Create or fetch the fallback buffer lazily, only once invalid input is actually seen
1413                     if (baseDecoder == null)
1414                         fallback = this.decoderFallback.CreateFallbackBuffer();
1415                     else
1416                         fallback = baseDecoder.FallbackBuffer;
1417                     fallback.InternalInitialize(bytes, null);
1418                 }
                     // Add however many chars the fallback reports for the bad bytes described by ch
1419                 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1420
                     // Discard the bad sequence and resume with no pending bits
1421                 ch = 0;
1422                 continue;
1423
1424             ReadChar:
1425                 ch = *pSrc;
1426                 pSrc++;
1427
1428             ProcessChar:
                     // ch holds a single freshly read byte here (also entered from the short ASCII loop below)
1429                 if (ch > 0x7F)
1430                 {
1431                     // If its > 0x7F, its start of a new multi-byte sequence
1432
1433                     // Long sequence, so unreserve our char.
1434                     charCount--;
1435
1436                     // bit 6 has to be non-zero for start of multibyte chars.
1437                     if ((ch & 0x40) == 0)
1438                     {
1439                         // Unexpected trail byte
1440                         goto InvalidByteSequence;
1441                     }
1442
1443                     // start a new long code
1444                     if ((ch & 0x20) != 0)
1445                     {
1446                         if ((ch & 0x10) != 0)
1447                         {
1448                             // 4 byte encoding - supplementary character (2 surrogates)
1449
1450                             ch &= 0x0F;
1451
1452                             // check that bit 4 is zero and the valid supplementary character
1453                             // range 0x000000 - 0x10FFFF at the same time
1454                             if (ch > 0x04)
1455                             {
                                     // Restore the lead byte's high nibble before handing ch to the fallback
1456                                 ch |= 0xf0;
1457                                 goto InvalidByteSequence;
1458                             }
1459
1460                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1461                             // Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
1462                             ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
1463                                   (1 << 30) |           // If it dies on next byte we'll need an extra char
1464                                   (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
1465                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1466                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1467
1468                             // Our character count will be 2 characters for these 4 bytes, so subtract another char
1469                             charCount--;
1470                         }
1471                         else
1472                         {
1473                             // 3 byte encoding
1474                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1475                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1476                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1477
1478                             // We'll expect 1 character for these 3 bytes, so subtract another char.
1479                             charCount--;
1480                         }
1481                     }
1482                     else
1483                     {
1484                         // 2 byte encoding
1485
1486                         ch &= 0x1F;
1487
1488                         // check for non-shortest form
1489                         if (ch <= 1)
1490                         {
                                 // Restore the lead byte's marker bits before handing ch to the fallback
1491                             ch |= 0xc0;
1492                             goto InvalidByteSequence;
1493                         }
1494
1495                         // Add bit flags so we'll be flagged correctly
1496                         ch |= (FinalByte >> 6);
1497                     }
                         // Lead byte accepted; go back to the top to read its trail bytes
1498                     continue;
1499                 }
1500
1501             EncodeChar:
                     // A complete character has been accounted for at this point; nothing is stored when counting.
1502
1503 #if FASTLOOP
1504                 int availableBytes = PtrDiff(pEnd, pSrc);
1505
1506                 // don't fall into the fast decoding loop if we don't have enough bytes
1507                 if (availableBytes <= 13)
1508                 {
1509                     // try to get over the remainder of the ascii characters fast though
1510                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
1511                     while (pSrc < pLocalEnd)
1512                     {
1513                         ch = *pSrc;
1514                         pSrc++;
1515
                             // Any non-ASCII byte goes back to the fully checked slow path
1516                         if (ch > 0x7F)
1517                             goto ProcessChar;
1518                     }
1519                     // we are done
1520                     ch = 0;
1521                     break;
1522                 }
1523
1524                 // pStop is the upper bound for the fast loop. In this method it is fixed once computed here:
1525                 //  it sits 7 bytes before the end of the input.
1526                 // We need that 7 byte reserve for the unrolled ASCII decoding loop and for decoding of multibyte sequences
1527                 byte* pStop = pSrc + availableBytes - 7;
1528
1529                 while (pSrc < pStop)
1530                 {
1531                     ch = *pSrc;
1532                     pSrc++;
1533
1534                     if (ch > 0x7F)
1535                     {
1536                         goto LongCode;
1537                     }
1538
1539                     // get pSrc 2-byte aligned
1540                     if ((unchecked((int)pSrc) & 0x1) != 0)
1541                     {
1542                         ch = *pSrc;
1543                         pSrc++;
1544                         if (ch > 0x7F)
1545                         {
1546                             goto LongCode;
1547                         }
1548                     }
1549
1550                     // get pSrc 4-byte aligned
1551                     if ((unchecked((int)pSrc) & 0x2) != 0)
1552                     {
                             // Test two bytes at once: a set high bit in either byte means non-ASCII
1553                         ch = *(ushort*)pSrc;
1554                         if ((ch & 0x8080) != 0)
1555                         {
1556                             goto LongCodeWithMask16;
1557                         }
1558                         pSrc += 2;
1559                     }
1560
1561                     // Run 8 + 8 characters at a time!
1562                     while (pSrc < pStop)
1563                     {
                             // Test eight bytes at once: a set high bit in any byte means non-ASCII
1564                         ch = *(int*)pSrc;
1565                         int chb = *(int*)(pSrc + 4);
1566                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1567                         {
1568                             goto LongCodeWithMask32;
1569                         }
1570                         pSrc += 8;
1571
1572                         // This is a really small loop - unroll it
1573                         if (pSrc >= pStop)
1574                             break;
1575
1576                         ch = *(int*)pSrc;
1577                         chb = *(int*)(pSrc + 4);
1578                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1579                         {
1580                             goto LongCodeWithMask32;
1581                         }
1582                         pSrc += 8;
1583                     }
                         // Reached pStop with only ASCII seen; leave the fast loop
1584                     break;
1585
                     // The multi-byte read hit a non-ASCII byte somewhere. Reduce ch to the byte at pSrc (the
                     // first byte of the group just read) and carry on one byte at a time from there.
1586 #if BIGENDIAN
1587                 LongCodeWithMask32:
1588                     // be careful about the sign extension
1589                     ch = (int)(((uint)ch) >> 16);
1590                 LongCodeWithMask16:
1591                     ch = (int)(((uint)ch) >> 8);
1592 #else // BIGENDIAN
1593                 LongCodeWithMask32:
1594                 LongCodeWithMask16:
1595                     ch &= 0xFF;
1596 #endif // BIGENDIAN
1597                     pSrc++;
1598                     if (ch <= 0x7F)
1599                     {
                             // That first byte was ASCII; resume the outer fast loop at the next byte
1600                         continue;
1601                     }
1602
1603                 LongCode:
                         // ch is a non-ASCII lead byte that has already been consumed; chc is the byte after it
1604                     int chc = *pSrc;
1605                     pSrc++;
1606
1607                     if (
1608                         // bit 6 of the lead byte has to be non-zero (otherwise it is an unexpected trail byte)
1609                         (ch & 0x40) == 0 ||
1610                         // we are expecting to see trailing bytes like 10vvvvvv
1611                         (chc & unchecked((sbyte)0xC0)) != 0x80)
1612                     {
1613                         goto BadLongCode;
1614                     }
1615
1616                     chc &= 0x3F;
1617
1618                     // start a new long code
1619                     if ((ch & 0x20) != 0)
1620                     {
1621                         // fold the first two bytes together
1622                         chc |= (ch & 0x0F) << 6;
1623
1624                         if ((ch & 0x10) != 0)
1625                         {
1626                             // 4 byte encoding - supplementary character (surrogate pair)
                                 // Peek at the 3rd byte; pSrc is only advanced once the whole sequence validates
1627                             ch = *pSrc;
1628                             if (
1629                                 // check that bit 4 is zero, the non-shortest form of surrogate
1630                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1631                                 !InRange(chc >> 4, 0x01, 0x10) ||
1632                                 // we are expecting to see trailing bytes like 10vvvvvv
1633                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1634                             {
1635                                 goto BadLongCode;
1636                             }
1637
1638                             chc = (chc << 6) | (ch & 0x3F);
1639
                                 // Peek at the 4th byte
1640                             ch = *(pSrc + 1);
1641                             // we are expecting to see trailing bytes like 10vvvvvv
1642                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
1643                             {
1644                                 goto BadLongCode;
1645                             }
1646                             pSrc += 2;
1647
1648                             // extra byte (together with the decrement at the end of the loop body,
1649                             // the 4 bytes are counted as 2 chars)
1649                             charCount--;
1650                         }
1651                         else
1652                         {
1653                             // 3 byte encoding
                                 // Peek at the 3rd byte; pSrc is only advanced once the sequence validates
1654                             ch = *pSrc;
1655                             if (
1656                                 // check for non-shortest form of 3 byte seq
1657                                 (chc & (0x1F << 5)) == 0 ||
1658                                 // Can't have surrogates here.
1659                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1660                                 // we are expecting to see trailing bytes like 10vvvvvv
1661                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1662                             {
1663                                 goto BadLongCode;
1664                             }
1665                             pSrc++;
1666
1667                             // extra byte (together with the decrement at the end of the loop body,
1667                             // the 3 bytes are counted as 1 char)
1668                             charCount--;
1669                         }
1670                     }
1671                     else
1672                     {
1673                         // 2 byte encoding
1674
1675                         // check for non-shortest form
1676                         if ((ch & 0x1E) == 0)
1677                         {
1678                             goto BadLongCode;
1679                         }
1680                     }
1681
1682                     // extra byte (applies to every multi-byte sequence decoded by the fast loop)
1683                     charCount--;
1684                 }
1685 #endif // FASTLOOP
1686
1687                 // no pending bits at this point
1688                 ch = 0;
1689                 continue;
1690
1691             BadLongCode:
                     // The fast loop consumed exactly two bytes (lead byte and the byte after it) before
                     // bailing out. Rewind both so the slow loop re-reads the sequence and does the error handling.
1692                 pSrc -= 2;
1693                 ch = 0;
1694                 continue;
1695             }
1696
1697             // May have a problem if we have to flush
                 // (ch != 0 means the input ended in the middle of a multi-byte sequence)
1698             if (ch != 0)
1699             {
1700                 // We were already adjusting for these, so need to un-adjust
1701                 charCount += (ch >> 30);
                     // Without a decoder, or when the decoder must flush, the leftover bytes are invalid input;
                     // otherwise they are left uncounted here.
1702                 if (baseDecoder == null || baseDecoder.MustFlush)
1703                 {
1704                     // Have to do fallback for invalid bytes
1705                     if (fallback == null)
1706                     {
1707                         if (baseDecoder == null)
1708                             fallback = this.decoderFallback.CreateFallbackBuffer();
1709                         else
1710                             fallback = baseDecoder.FallbackBuffer;
1711                         fallback.InternalInitialize(bytes, null);
1712                     }
1713                     charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1714                 }
1715             }
1716
1717             // Shouldn't have anything in fallback buffer for GetCharCount
1718             // (don't have to check _throwOnOverflow for count)
1719             Debug.Assert(fallback == null || fallback.Remaining == 0,
1720                 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1721
1722             return charCount;
1723         }
1724
1725         // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
1726         //           So if we're really broken, then that could also throw an error... recursively.
1727         //           So try to make sure GetChars can at least process all uses by
1728         //           System.Resources.ResourceReader!
1729         //
1730         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1731         //        If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1732         //
1733         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1734         // kept the same as much as possible
1735         internal override unsafe int GetChars(byte* bytes, int byteCount,
1736                                                 char* chars, int charCount, DecoderNLS baseDecoder)
1737         {
1738             Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
1739             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
1740             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1741             Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
1742
1743             byte* pSrc = bytes;
1744             char* pTarget = chars;
1745
1746             byte* pEnd = pSrc + byteCount;
1747             char* pAllocatedBufferEnd = pTarget + charCount;
1748
1749             int ch = 0;
1750
1751             DecoderFallbackBuffer fallback = null;
1752             byte* pSrcForFallback;
1753             char* pTargetForFallback;
1754             if (baseDecoder != null)
1755             {
1756                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1757                 ch = decoder.bits;
1758
1759                 // Shouldn't have anything in fallback buffer for GetChars
1760                 // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty)
1761                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1762                     "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1763             }
1764
1765             for (;;)
1766             {
1767                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1768
1769                 if (pSrc >= pEnd)
1770                 {
1771                     break;
1772                 }
1773
1774                 if (ch == 0)
1775                 {
1776                     // no pending bits
1777                     goto ReadChar;
1778                 }
1779
1780                 // read next byte. The JIT optimization seems to be getting confused when
1781                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1782                 int cha = *pSrc;
1783                 pSrc++;
1784
1785                 // we are expecting to see trailing bytes like 10vvvvvv
1786                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1787                 {
1788                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1789                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1790                     pSrc--;
1791                     goto InvalidByteSequence;
1792                 }
1793
1794                 // fold in the new byte
1795                 ch = (ch << 6) | (cha & 0x3F);
1796
1797                 if ((ch & FinalByte) == 0)
1798                 {
1799                     // Not at last byte yet
1800                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1801                         "[UTF8Encoding.GetChars]Invariant volation");
1802
1803                     if ((ch & SupplimentarySeq) != 0)
1804                     {
1805                         // Its a 4-byte supplimentary sequence
1806                         if ((ch & (FinalByte >> 6)) != 0)
1807                         {
1808                             // this is 3rd byte of 4 byte sequence - nothing to do
1809                             continue;
1810                         }
1811
1812                         // 2nd byte of 4 bytes
1813                         // check for non-shortest form of surrogate and the valid surrogate
1814                         // range 0x000000 - 0x10FFFF at the same time
1815                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1816                         {
1817                             goto InvalidByteSequence;
1818                         }
1819                     }
1820                     else
1821                     {
1822                         // Must be 2nd byte of a 3-byte sequence
1823                         // check for non-shortest form of 3 byte seq
1824                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1825                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1826                         {
1827                             goto InvalidByteSequence;
1828                         }
1829                     }
1830                     continue;
1831                 }
1832
1833                 // ready to punch
1834
1835                 // surrogate in shortest form?
1836                 // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1837                 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
1838                 {
1839                     // let the range check for the second char throw the exception
1840                     if (pTarget < pAllocatedBufferEnd)
1841                     {
1842                         *pTarget = (char)(((ch >> 10) & 0x7FF) +
1843                             unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
1844                         pTarget++;
1845
1846                         ch = (ch & 0x3FF) +
1847                             unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1848                     }
1849                 }
1850
1851                 goto EncodeChar;
1852
1853             InvalidByteSequence:
1854                 // this code fragment should be close to the gotos referencing it
1855                 // Have to do fallback for invalid bytes
1856                 if (fallback == null)
1857                 {
1858                     if (baseDecoder == null)
1859                         fallback = this.decoderFallback.CreateFallbackBuffer();
1860                     else
1861                         fallback = baseDecoder.FallbackBuffer;
1862                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1863                 }
1864                 // That'll back us up the appropriate # of bytes if we didn't get anywhere
1865                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
1866                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
1867                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
1868                 pSrc = pSrcForFallback;
1869                 pTarget = pTargetForFallback;
1870
1871                 if (!fallbackResult)
1872                 {
1873                     // Ran out of buffer space
1874                     // Need to throw an exception?
1875                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1876                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1877                     fallback.InternalReset();
1878                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1879                     ch = 0;
1880                     break;
1881                 }
1882                 Debug.Assert(pSrc >= bytes,
1883                     "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1884                 ch = 0;
1885                 continue;
1886
1887             ReadChar:
1888                 ch = *pSrc;
1889                 pSrc++;
1890
1891             ProcessChar:
1892                 if (ch > 0x7F)
1893                 {
1894                     // If its > 0x7F, its start of a new multi-byte sequence
1895
1896                     // bit 6 has to be non-zero
1897                     if ((ch & 0x40) == 0)
1898                     {
1899                         goto InvalidByteSequence;
1900                     }
1901
1902                     // start a new long code
1903                     if ((ch & 0x20) != 0)
1904                     {
1905                         if ((ch & 0x10) != 0)
1906                         {
1907                             // 4 byte encoding - supplimentary character (2 surrogates)
1908
1909                             ch &= 0x0F;
1910
1911                             // check that bit 4 is zero and the valid supplimentary character
1912                             // range 0x000000 - 0x10FFFF at the same time
1913                             if (ch > 0x04)
1914                             {
1915                                 ch |= 0xf0;
1916                                 goto InvalidByteSequence;
1917                             }
1918
1919                             ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1920                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1921                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1922                         }
1923                         else
1924                         {
1925                             // 3 byte encoding
1926                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1927                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1928                         }
1929                     }
1930                     else
1931                     {
1932                         // 2 byte encoding
1933
1934                         ch &= 0x1F;
1935
1936                         // check for non-shortest form
1937                         if (ch <= 1)
1938                         {
1939                             ch |= 0xc0;
1940                             goto InvalidByteSequence;
1941                         }
1942
1943                         ch |= (FinalByte >> 6);
1944                     }
1945                     continue;
1946                 }
1947
1948             EncodeChar:
1949                 // write the pending character
1950                 if (pTarget >= pAllocatedBufferEnd)
1951                 {
1952                     // Fix chars so we make sure to throw if we didn't output anything
1953                     ch &= 0x1fffff;
1954                     if (ch > 0x7f)
1955                     {
1956                         if (ch > 0x7ff)
1957                         {
1958                             if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1959                                 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1960                             {
1961                                 pSrc--;     // It was 4 bytes
1962                                 pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
1963                             }
1964                             else if (ch > 0xffff)
1965                             {
1966                                 pSrc--;     // It was 4 bytes, nothing was stored
1967                             }
1968                             pSrc--;         // It was at least 3 bytes
1969                         }
1970                         pSrc--;             // It was at least 2 bytes
1971                     }
1972                     pSrc--;
1973
1974                     // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1975                     // a 4 byte sequence already)
1976                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1977                         "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1978                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1979
1980                     // Don't store ch in decoder, we already backed up to its start
1981                     ch = 0;
1982
1983                     // Didn't throw, just use this buffer size.
1984                     break;
1985                 }
1986                 *pTarget = (char)ch;
1987                 pTarget++;
1988
1989 #if FASTLOOP
1990                 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1991                 int availableBytes = PtrDiff(pEnd, pSrc);
1992
1993                 // don't fall into the fast decoding loop if we don't have enough bytes
1994                 // Test for availableChars is done because pStop would be <= pTarget.
1995                 if (availableBytes <= 13)
1996                 {
1997                     // we may need as many as 1 character per byte
1998                     if (availableChars < availableBytes)
1999                     {
2000                         // not enough output room.  no pending bits at this point
2001                         ch = 0;
2002                         continue;
2003                     }
2004
2005                     // try to get over the remainder of the ascii characters fast though
2006                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2007                     while (pSrc < pLocalEnd)
2008                     {
2009                         ch = *pSrc;
2010                         pSrc++;
2011
2012                         if (ch > 0x7F)
2013                             goto ProcessChar;
2014
2015                         *pTarget = (char)ch;
2016                         pTarget++;
2017                     }
2018                     // we are done
2019                     ch = 0;
2020                     break;
2021                 }
2022
2023                 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
2024                 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
2025                 if (availableChars < availableBytes)
2026                 {
2027                     availableBytes = availableChars;
2028                 }
2029
2030                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2031                 //  the boundary will be decreased for every non-ASCII character we encounter
2032                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
2033                 char* pStop = pTarget + availableBytes - 7;
2034
2035                 while (pTarget < pStop)
2036                 {
2037                     ch = *pSrc;
2038                     pSrc++;
2039
2040                     if (ch > 0x7F)
2041                     {
2042                         goto LongCode;
2043                     }
2044                     *pTarget = (char)ch;
2045                     pTarget++;
2046
2047                     // get pSrc to be 2-byte aligned
2048                     if ((unchecked((int)pSrc) & 0x1) != 0)
2049                     {
2050                         ch = *pSrc;
2051                         pSrc++;
2052                         if (ch > 0x7F)
2053                         {
2054                             goto LongCode;
2055                         }
2056                         *pTarget = (char)ch;
2057                         pTarget++;
2058                     }
2059
2060                     // get pSrc to be 4-byte aligned
2061                     if ((unchecked((int)pSrc) & 0x2) != 0)
2062                     {
2063                         ch = *(ushort*)pSrc;
2064                         if ((ch & 0x8080) != 0)
2065                         {
2066                             goto LongCodeWithMask16;
2067                         }
2068
2069                         // Unfortunately, this is endianess sensitive
2070 #if BIGENDIAN
2071                         *pTarget = (char)((ch >> 8) & 0x7F);
2072                         pSrc += 2;
2073                         *(pTarget+1) = (char)(ch & 0x7F);
2074                         pTarget += 2;
2075 #else // BIGENDIAN
2076                         *pTarget = (char)(ch & 0x7F);
2077                         pSrc += 2;
2078                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2079                         pTarget += 2;
2080 #endif // BIGENDIAN
2081                     }
2082
2083                     // Run 8 characters at a time!
2084                     while (pTarget < pStop)
2085                     {
2086                         ch = *(int*)pSrc;
2087                         int chb = *(int*)(pSrc + 4);
2088                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
2089                         {
2090                             goto LongCodeWithMask32;
2091                         }
2092
2093                         // Unfortunately, this is endianess sensitive
2094 #if BIGENDIAN
2095                         *pTarget = (char)((ch >> 24) & 0x7F);
2096                         *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2097                         *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2098                         *(pTarget+3) = (char)(ch & 0x7F);
2099                         pSrc += 8;
2100                         *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2101                         *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2102                         *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2103                         *(pTarget+7) = (char)(chb & 0x7F);
2104                         pTarget += 8;
2105 #else // BIGENDIAN
2106                         *pTarget = (char)(ch & 0x7F);
2107                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2108                         *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
2109                         *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
2110                         pSrc += 8;
2111                         *(pTarget + 4) = (char)(chb & 0x7F);
2112                         *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
2113                         *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
2114                         *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
2115                         pTarget += 8;
2116 #endif // BIGENDIAN
2117                     }
2118                     break;
2119
2120 #if BIGENDIAN
2121                 LongCodeWithMask32:
2122                     // be careful about the sign extension
2123                     ch = (int)(((uint)ch) >> 16);
2124                 LongCodeWithMask16:
2125                     ch = (int)(((uint)ch) >> 8);
2126 #else // BIGENDIAN
2127                 LongCodeWithMask32:
2128                 LongCodeWithMask16:
2129                     ch &= 0xFF;
2130 #endif // BIGENDIAN
2131                     pSrc++;
2132                     if (ch <= 0x7F)
2133                     {
2134                         *pTarget = (char)ch;
2135                         pTarget++;
2136                         continue;
2137                     }
2138
2139                 LongCode:
2140                     int chc = *pSrc;
2141                     pSrc++;
2142
2143                     if (
2144                         // bit 6 has to be zero
2145                         (ch & 0x40) == 0 ||
2146                         // we are expecting to see trailing bytes like 10vvvvvv
2147                         (chc & unchecked((sbyte)0xC0)) != 0x80)
2148                     {
2149                         goto BadLongCode;
2150                     }
2151
2152                     chc &= 0x3F;
2153
2154                     // start a new long code
2155                     if ((ch & 0x20) != 0)
2156                     {
2157                         // fold the first two bytes together
2158                         chc |= (ch & 0x0F) << 6;
2159
2160                         if ((ch & 0x10) != 0)
2161                         {
2162                             // 4 byte encoding - surrogate
2163                             ch = *pSrc;
2164                             if (
2165                                 // check that bit 4 is zero, the non-shortest form of surrogate
2166                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2167                                 !InRange(chc >> 4, 0x01, 0x10) ||
2168                                 // we are expecting to see trailing bytes like 10vvvvvv
2169                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2170                             {
2171                                 goto BadLongCode;
2172                             }
2173
2174                             chc = (chc << 6) | (ch & 0x3F);
2175
2176                             ch = *(pSrc + 1);
2177                             // we are expecting to see trailing bytes like 10vvvvvv
2178                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
2179                             {
2180                                 goto BadLongCode;
2181                             }
2182                             pSrc += 2;
2183
2184                             ch = (chc << 6) | (ch & 0x3F);
2185
2186                             *pTarget = (char)(((ch >> 10) & 0x7FF) +
2187                                 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
2188                             pTarget++;
2189
2190                             ch = (ch & 0x3FF) +
2191                                 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2192
2193                             // extra byte, we're already planning 2 chars for 2 of these bytes,
2194                             // but the big loop is testing the target against pStop, so we need
2195                             // to subtract 2 more or we risk overrunning the input.  Subtract 
2196                             // one here and one below.
2197                             pStop--;
2198                         }
2199                         else
2200                         {
2201                             // 3 byte encoding
2202                             ch = *pSrc;
2203                             if (
2204                                 // check for non-shortest form of 3 byte seq
2205                                 (chc & (0x1F << 5)) == 0 ||
2206                                 // Can't have surrogates here.
2207                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2208                                 // we are expecting to see trailing bytes like 10vvvvvv
2209                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2210                             {
2211                                 goto BadLongCode;
2212                             }
2213                             pSrc++;
2214
2215                             ch = (chc << 6) | (ch & 0x3F);
2216
2217                             // extra byte, we're only expecting 1 char for each of these 3 bytes,
2218                             // but the loop is testing the target (not source) against pStop, so
2219                             // we need to subtract 2 more or we risk overrunning the input.
2220                             // Subtract 1 here and one more below
2221                             pStop--;
2222                         }
2223                     }
2224                     else
2225                     {
2226                         // 2 byte encoding
2227
2228                         ch &= 0x1F;
2229
2230                         // check for non-shortest form
2231                         if (ch <= 1)
2232                         {
2233                             goto BadLongCode;
2234                         }
2235                         ch = (ch << 6) | chc;
2236                     }
2237
2238                     *pTarget = (char)ch;
2239                     pTarget++;
2240
2241                     // extra byte, we're only expecting 1 char for each of these 2 bytes,
2242                     // but the loop is testing the target (not source) against pStop.
2243                     // subtract an extra count from pStop so that we don't overrun the input.
2244                     pStop--;
2245                 }
2246 #endif // FASTLOOP
2247
2248                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2249
2250                 // no pending bits at this point
2251                 ch = 0;
2252                 continue;
2253
2254             BadLongCode:
2255                 pSrc -= 2;
2256                 ch = 0;
2257                 continue;
2258             }
2259
2260             if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2261             {
2262                 // Have to do fallback for invalid bytes
2263                 if (fallback == null)
2264                 {
2265                     if (baseDecoder == null)
2266                         fallback = this.decoderFallback.CreateFallbackBuffer();
2267                     else
2268                         fallback = baseDecoder.FallbackBuffer;
2269                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2270                 }
2271
2272                 // That'll back us up the appropriate # of bytes if we didn't get anywhere
2273                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
2274                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
2275                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
2276                 pSrc = pSrcForFallback;
2277                 pTarget = pTargetForFallback;
2278
2279                 if (!fallbackResult)
2280                 {
2281                     Debug.Assert(pSrc >= bytes || pTarget == chars,
2282                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2283
2284                     // Ran out of buffer space
2285                     // Need to throw an exception?
2286                     fallback.InternalReset();
2287                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
2288                 }
2289                 Debug.Assert(pSrc >= bytes,
2290                     "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2291                 ch = 0;
2292             }
2293
2294             if (baseDecoder != null)
2295             {
2296                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2297
2298                 // If we're storing flush data we expect all bits to be used or else
2299                 // we're stuck in the middle of a conversion
2300                 Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow,
2301                     "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2302
2303                 // Remember our leftover bits.
2304                 decoder.bits = ch;
2305
2306                 baseDecoder._bytesUsed = (int)(pSrc - bytes);
2307             }
2308
2309             // Shouldn't have anything in fallback buffer for GetChars
2310             // (don't have to check _throwOnOverflow for chars)
2311             Debug.Assert(fallback == null || fallback.Remaining == 0,
2312                 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2313
2314             return PtrDiff(pTarget, chars);
2315         }
2316
2317         // During GetChars we had an invalid byte sequence
2318         // pSrc is backed up to the start of the bad sequence if we didn't have room to
2319         // fall it back.  Otherwise pSrc remains where it is.
2320         private unsafe bool FallbackInvalidByteSequence(
2321             ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2322         {
2323             // Get our byte[]
2324             byte* pStart = pSrc;
2325             byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2326
2327             // Do the actual fallback
2328             if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2329             {
2330                 // Oops, it failed, back up to pStart
2331                 pSrc = pStart;
2332                 return false;
2333             }
2334
2335             // It worked
2336             return true;
2337         }
2338
2339         // During GetCharCount we had an invalid byte sequence
2340         // pSrc is used to find the index that points to the invalid bytes,
2341         // however the byte[] contains the fallback bytes (in case the index is -1)
2342         private unsafe int FallbackInvalidByteSequence(
2343             byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2344         {
2345             // Get our byte[]
2346             byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2347
2348             // Do the actual fallback
2349             int count = fallback.InternalFallback(bytesUnknown, pSrc);
2350
2351             // # of fallback chars expected.
2352             // Note that we only get here for "long" sequences, and have already unreserved
2353             // the count that we prereserved for the input bytes
2354             return count;
2355         }
2356
2357         // Note that some of these bytes may have come from a previous fallback, so we cannot
2358         // just decrement the pointer and use the values we read.  In those cases we have 
2359         // to regenerate the original values.
2360         private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2361         {
2362             // Get our byte[]
2363             byte[] bytesUnknown = null;
2364
2365             // See if it was a plain char
2366             // (have to check >= 0 because we have all sorts of wierd bit flags)
2367             if (ch < 0x100 && ch >= 0)
2368             {
2369                 pSrc--;
2370                 bytesUnknown = new byte[] { unchecked((byte)ch) };
2371             }
2372             // See if its an unfinished 2 byte sequence
2373             else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
2374             {
2375                 pSrc--;
2376                 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2377             }
2378             // So now we're either 2nd byte of 3 or 4 byte sequence or
2379             // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2380             // 1st check if its a 4 byte sequence
2381             else if ((ch & SupplimentarySeq) != 0)
2382             {
2383                 //  3rd byte of 4 byte sequence?
2384                 if ((ch & (FinalByte >> 6)) != 0)
2385                 {
2386                     // 3rd byte of 4 byte sequence
2387                     pSrc -= 3;
2388                     bytesUnknown = new byte[] {
2389                         unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2390                         unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2391                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2392                 }
2393                 else if ((ch & (FinalByte >> 12)) != 0)
2394                 {
2395                     // 2nd byte of a 4 byte sequence
2396                     pSrc -= 2;
2397                     bytesUnknown = new byte[] {
2398                         unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2399                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2400                 }
2401                 else
2402                 {
2403                     // 4th byte of a 4 byte sequence
2404                     pSrc--;
2405                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
2406                 }
2407             }
2408             else
2409             {
2410                 // 2nd byte of 3 byte sequence?
2411                 if ((ch & (FinalByte >> 6)) != 0)
2412                 {
2413                     // So its 2nd byte of a 3 byte sequence
2414                     pSrc -= 2;
2415                     bytesUnknown = new byte[] {
2416                         unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2417                 }
2418                 else
2419                 {
2420                     // 1st byte of a 3 byte sequence
2421                     pSrc--;
2422                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
2423                 }
2424             }
2425
2426             return bytesUnknown;
2427         }
2428
2429
2430         public override Decoder GetDecoder()
2431         {
2432             return new UTF8Decoder(this);
2433         }
2434
2435
2436         public override Encoder GetEncoder()
2437         {
2438             return new UTF8Encoder(this);
2439         }
2440
2441
2442         public override int GetMaxByteCount(int charCount)
2443         {
2444             if (charCount < 0)
2445                 throw new ArgumentOutOfRangeException(nameof(charCount),
2446                      SR.ArgumentOutOfRange_NeedNonNegNum);
2447             Contract.EndContractBlock();
2448
2449             // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
2450             long byteCount = (long)charCount + 1;
2451
2452             if (EncoderFallback.MaxCharCount > 1)
2453                 byteCount *= EncoderFallback.MaxCharCount;
2454
2455             // Max 3 bytes per char.  (4 bytes per 2 chars for surrogates)
2456             byteCount *= 3;
2457
2458             if (byteCount > 0x7fffffff)
2459                 throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
2460
2461             return (int)byteCount;
2462         }
2463
2464
2465         public override int GetMaxCharCount(int byteCount)
2466         {
2467             if (byteCount < 0)
2468                 throw new ArgumentOutOfRangeException(nameof(byteCount),
2469                      SR.ArgumentOutOfRange_NeedNonNegNum);
2470             Contract.EndContractBlock();
2471
2472             // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
2473             long charCount = ((long)byteCount + 1);
2474
2475             // Non-shortest form would fall back, so get max count from fallback.
2476             // So would 11... followed by 11..., so you could fall back every byte
2477             if (DecoderFallback.MaxCharCount > 1)
2478             {
2479                 charCount *= DecoderFallback.MaxCharCount;
2480             }
2481
2482             if (charCount > 0x7fffffff)
2483                 throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
2484
2485             return (int)charCount;
2486         }
2487
2488
2489         public override byte[] GetPreamble()
2490         {
2491             if (_emitUTF8Identifier)
2492             {
2493                 // Allocate new array to prevent users from modifying it.
2494                 return new byte[3] { 0xEF, 0xBB, 0xBF };
2495             }
2496             else
2497                 return Array.Empty<byte>();
2498         }
2499
2500
2501         public override bool Equals(Object value)
2502         {
2503             UTF8Encoding that = value as UTF8Encoding;
2504             if (that != null)
2505             {
2506                 return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
2507                        (EncoderFallback.Equals(that.EncoderFallback)) &&
2508                        (DecoderFallback.Equals(that.DecoderFallback));
2509             }
2510             return (false);
2511         }
2512
2513
2514         public override int GetHashCode()
2515         {
2516             //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
2517             return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
2518                    UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
2519         }
2520
2521         private sealed class UTF8Encoder : EncoderNLS
2522         {
2523             // We must save a high surrogate value until the next call, looking
2524             // for a low surrogate value.
2525             internal int surrogateChar;
2526
2527             public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2528             {
2529                 // base calls reset
2530             }
2531
2532             public override void Reset()
2533
2534             {
2535                 this.surrogateChar = 0;
2536                 if (_fallbackBuffer != null)
2537                     _fallbackBuffer.Reset();
2538             }
2539
2540             // Anything left in our encoder?
2541             internal override bool HasState
2542             {
2543                 get
2544                 {
2545                     return (this.surrogateChar != 0);
2546                 }
2547             }
2548         }
2549
2550         private sealed class UTF8Decoder : DecoderNLS
2551         {
2552             // We'll need to remember the previous information. See the comments around definition
2553             // of FinalByte for details.
2554             internal int bits;
2555
2556             public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2557             {
2558                 // base calls reset
2559             }
2560
2561             public override void Reset()
2562             {
2563                 this.bits = 0;
2564                 if (_fallbackBuffer != null)
2565                     _fallbackBuffer.Reset();
2566             }
2567
2568             // Anything left in our decoder?
2569             internal override bool HasState
2570             {
2571                 get
2572                 {
2573                     return (this.bits != 0);
2574                 }
2575             }
2576         }
2577     }
2578 }