ee5c92c3a869f9f71a0c5893438a42c99fb94439
[platform/upstream/coreclr.git] / src / mscorlib / shared / System / Text / UTF8Encoding.cs
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 // The worker functions in this file were optimized for performance. If you make changes,
6 // take care to consider all of the interesting cases.
7
8 // The code of all worker functions in this file is written twice: once as a slow loop, and the
9 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
10 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
11 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
12
13 // This define can be used to turn off the fast loops. Useful for finding whether
14 // the problem is fastloop-specific.
15 #define FASTLOOP
16
17 using System;
18 using System.Runtime.Serialization;
19 using System.Diagnostics;
20 using System.Diagnostics.Contracts;
21 using System.Globalization;
22
23 namespace System.Text
24 {
25     // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
26     // Unicode characters with variable numbers of bytes per character,
27     // optimized for the lower 127 ASCII characters.  It's an efficient way
28     // of encoding US English in an internationalizable way.
29     //
30     // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
31     //
32     // The UTF-8 byte order mark is simply the Unicode byte order mark
33     // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
34     // used mostly to distinguish UTF-8 text from other encodings, and doesn't
35     // switch the byte orderings.
36
37     public class UTF8Encoding : Encoding
38     {
39         /*
40             bytes   bits    UTF-8 representation
41             -----   ----    -----------------------------------
42             1        7      0vvvvvvv
43             2       11      110vvvvv 10vvvvvv
44             3       16      1110vvvv 10vvvvvv 10vvvvvv
45             4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
46             -----   ----    -----------------------------------
47
48             Surrogate:
49             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
50         */
51
52         private const int UTF8_CODEPAGE = 65001;
53
54         // Allow for devirtualization (see https://github.com/dotnet/coreclr/pull/9230)
55         internal sealed class UTF8EncodingSealed : UTF8Encoding
56         {
57             public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { }
58         }
59
60         // Used by Encoding.UTF8 for lazy initialization
61         // The initialization code will not be run until a static member of the class is referenced
62         internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
63
64         // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
65         // the standard.
66         private bool _emitUTF8Identifier = false;
67
68         private bool _isThrowException = false;
69
70
71         public UTF8Encoding() : this(false)
72         {
73         }
74
75
76         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) :
77             this(encoderShouldEmitUTF8Identifier, false)
78         {
79         }
80
81
82         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) :
83             base(UTF8_CODEPAGE)
84         {
85             _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
86             _isThrowException = throwOnInvalidBytes;
87
88             // Encoding's constructor already did this, but it'll be wrong if we're throwing exceptions
89             if (_isThrowException)
90                 SetDefaultFallbacks();
91         }
92
93         internal override void SetDefaultFallbacks()
94         {
95             // For UTF-X encodings, we use a replacement fallback with an empty string
96             if (_isThrowException)
97             {
98                 this.encoderFallback = EncoderFallback.ExceptionFallback;
99                 this.decoderFallback = DecoderFallback.ExceptionFallback;
100             }
101             else
102             {
103                 this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
104                 this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
105             }
106         }
107
108
109         // WARNING: GetByteCount(string chars)
110         // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
111         // WARNING: otherwise it'll break VB's way of declaring these.
112         //
113         // The following methods are copied from EncodingNLS.cs.
114         // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
115         // These should be kept in sync for the following classes:
116         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
117
118         // Returns the number of bytes required to encode a range of characters in
119         // a character array.
120         //
121         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
122         // So if you fix this, fix the others.  Currently those include:
123         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
124         // parent method is safe
125
126         public override unsafe int GetByteCount(char[] chars, int index, int count)
127         {
128             // Validate input parameters
129             if (chars == null)
130                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
131
132             if (index < 0 || count < 0)
133                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
134
135             if (chars.Length - index < count)
136                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
137             Contract.EndContractBlock();
138
139             // If no input, return 0, avoid fixed empty array problem
140             if (count == 0)
141                 return 0;
142
143             // Just call the pointer version
144             fixed (char* pChars = chars)
145                 return GetByteCount(pChars + index, count, null);
146         }
147
148         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
149         // So if you fix this, fix the others.  Currently those include:
150         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
151         // parent method is safe
152
153         public override unsafe int GetByteCount(String chars)
154         {
155             // Validate input
156             if (chars==null)
157                 throw new ArgumentNullException("s");
158             Contract.EndContractBlock();
159
160             fixed (char* pChars = chars)
161                 return GetByteCount(pChars, chars.Length, null);
162         }
163
164         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
165         // So if you fix this, fix the others.  Currently those include:
166         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
167
168         [CLSCompliant(false)]
169         public override unsafe int GetByteCount(char* chars, int count)
170         {
171             // Validate Parameters
172             if (chars == null)
173                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
174
175             if (count < 0)
176                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
177             Contract.EndContractBlock();
178
179             // Call it with empty encoder
180             return GetByteCount(chars, count, null);
181         }
182
183         // Parent method is safe.
184         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
185         // So if you fix this, fix the others.  Currently those include:
186         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
187
188         public override unsafe int GetBytes(String s, int charIndex, int charCount,
189                                               byte[] bytes, int byteIndex)
190         {
191             if (s == null || bytes == null)
192                 throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array);
193
194             if (charIndex < 0 || charCount < 0)
195                 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
196
197             if (s.Length - charIndex < charCount)
198                 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
199
200             if (byteIndex < 0 || byteIndex > bytes.Length)
201                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
202             Contract.EndContractBlock();
203
204             int byteCount = bytes.Length - byteIndex;
205
206             // Fixed doesn't like 0 length arrays.
207             if (bytes.Length == 0)
208                 bytes = new byte[1];
209
210             fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
211                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
212         }
213
214         // Encodes a range of characters in a character array into a range of bytes
215         // in a byte array. An exception occurs if the byte array is not large
216         // enough to hold the complete encoding of the characters. The
217         // GetByteCount method can be used to determine the exact number of
218         // bytes that will be produced for a given range of characters.
219         // Alternatively, the GetMaxByteCount method can be used to
220         // determine the maximum number of bytes that will be produced for a given
221         // number of characters, regardless of the actual character values.
222         //
223         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
224         // So if you fix this, fix the others.  Currently those include:
225         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
226         // parent method is safe
227
228         public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
229                                                byte[] bytes, int byteIndex)
230         {
231             // Validate parameters
232             if (chars == null || bytes == null)
233                 throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array);
234
235             if (charIndex < 0 || charCount < 0)
236                 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
237
238             if (chars.Length - charIndex < charCount)
239                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
240
241             if (byteIndex < 0 || byteIndex > bytes.Length)
242                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
243             Contract.EndContractBlock();
244
245             // If nothing to encode return 0, avoid fixed problem
246             if (charCount == 0)
247                 return 0;
248
249             // Just call pointer version
250             int byteCount = bytes.Length - byteIndex;
251
252             // Fixed doesn't like 0 length arrays.
253             if (bytes.Length == 0)
254                 bytes = new byte[1];
255
256             fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
257                 // Remember that byteCount is # to decode, not size of array.
258                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
259         }
260
261         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
262         // So if you fix this, fix the others.  Currently those include:
263         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
264
265         [CLSCompliant(false)]
266         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
267         {
268             // Validate Parameters
269             if (bytes == null || chars == null)
270                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
271
272             if (charCount < 0 || byteCount < 0)
273                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
274             Contract.EndContractBlock();
275
276             return GetBytes(chars, charCount, bytes, byteCount, null);
277         }
278
279         // Returns the number of characters produced by decoding a range of bytes
280         // in a byte array.
281         //
282         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
283         // So if you fix this, fix the others.  Currently those include:
284         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
285         // parent method is safe
286
287         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
288         {
289             // Validate Parameters
290             if (bytes == null)
291                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
292
293             if (index < 0 || count < 0)
294                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
295
296             if (bytes.Length - index < count)
297                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
298             Contract.EndContractBlock();
299
300             // If no input just return 0, fixed doesn't like 0 length arrays.
301             if (count == 0)
302                 return 0;
303
304             // Just call pointer version
305             fixed (byte* pBytes = bytes)
306                 return GetCharCount(pBytes + index, count, null);
307         }
308
309         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
310         // So if you fix this, fix the others.  Currently those include:
311         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
312
313         [CLSCompliant(false)]
314         public override unsafe int GetCharCount(byte* bytes, int count)
315         {
316             // Validate Parameters
317             if (bytes == null)
318                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
319
320             if (count < 0)
321                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
322             Contract.EndContractBlock();
323
324             return GetCharCount(bytes, count, null);
325         }
326
327         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
328         // So if you fix this, fix the others.  Currently those include:
329         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
330         // parent method is safe
331
332         public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
333                                               char[] chars, int charIndex)
334         {
335             // Validate Parameters
336             if (bytes == null || chars == null)
337                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
338
339             if (byteIndex < 0 || byteCount < 0)
340                 throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
341
342             if ( bytes.Length - byteIndex < byteCount)
343                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
344
345             if (charIndex < 0 || charIndex > chars.Length)
346                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
347             Contract.EndContractBlock();
348
349             // If no input, return 0 & avoid fixed problem
350             if (byteCount == 0)
351                 return 0;
352
353             // Just call pointer version
354             int charCount = chars.Length - charIndex;
355
356             // Fixed doesn't like 0 length arrays.
357             if (chars.Length == 0)
358                 chars = new char[1];
359
360             fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
361                 // Remember that charCount is # to decode, not size of array
362                 return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
363         }
364
365         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
366         // So if you fix this, fix the others.  Currently those include:
367         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
368
369         [CLSCompliant(false)]
370         public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
371         {
372             // Validate Parameters
373             if (bytes == null || chars == null)
374                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
375
376             if (charCount < 0 || byteCount < 0)
377                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
378             Contract.EndContractBlock();
379
380             return GetChars(bytes, byteCount, chars, charCount, null);
381         }
382
383         // Returns a string containing the decoded representation of a range of
384         // bytes in a byte array.
385         //
386         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
387         // So if you fix this, fix the others.  Currently those include:
388         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
389         // parent method is safe
390
391         public override unsafe String GetString(byte[] bytes, int index, int count)
392         {
393             // Validate Parameters
394             if (bytes == null)
395                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
396
397             if (index < 0 || count < 0)
398                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
399
400             if (bytes.Length - index < count)
401                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
402             Contract.EndContractBlock();
403
404             // Avoid problems with empty input buffer
405             if (count == 0) return String.Empty;
406
407             fixed (byte* pBytes = bytes)
408                 return String.CreateStringFromEncoding(
409                     pBytes + index, count, this);
410         }
411
412         //
413         // End of standard methods copied from EncodingNLS.cs
414         //
415
416         // To simplify maintenance, the structure of GetByteCount and GetBytes should be
417         // kept the same as much as possible
418         internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
419         {
420             // For fallback we may need a fallback buffer.
421             // We wait to initialize it though in case we don't have any broken input unicode
422             EncoderFallbackBuffer fallbackBuffer = null;
423             char* pSrcForFallback;
424
425             char* pSrc = chars;
426             char* pEnd = pSrc + count;
427
428             // Start by assuming we have as many as count
429             int byteCount = count;
430
431             int ch = 0;
432
433             if (baseEncoder != null)
434             {
435                 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
436                 ch = encoder.surrogateChar;
437
438                 // We mustn't have left over fallback data when counting
439                 if (encoder.InternalHasFallbackBuffer)
440                 {
441                     fallbackBuffer = encoder.FallbackBuffer;
442                     if (fallbackBuffer.Remaining > 0)
443                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
444
445                     // Set our internal fallback interesting things.
446                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
447                 }
448             }
449
450             for (;;)
451             {
452                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
453                 if (pSrc >= pEnd)
454                 {
455                     if (ch == 0)
456                     {
457                         // Unroll any fallback that happens at the end
458                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
459                         if (ch > 0)
460                         {
461                             byteCount++;
462                             goto ProcessChar;
463                         }
464                     }
465                     else
466                     {
467                         // Case of surrogates in the fallback.
468                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
469                         {
470                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
471                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
472
473                             ch = fallbackBuffer.InternalGetNextChar();
474                             byteCount++;
475
476                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
477                             {
478                                 ch = 0xfffd;
479                                 byteCount++;
480                                 goto EncodeChar;
481                             }
482                             else if (ch > 0)
483                             {
484                                 goto ProcessChar;
485                             }
486                             else
487                             {
488                                 byteCount--; // ignore last one.
489                                 break;
490                             }
491                         }
492                     }
493
494                     if (ch <= 0)
495                     {
496                         break;
497                     }
498                     if (baseEncoder != null && !baseEncoder.MustFlush)
499                     {
500                         break;
501                     }
502
503                     // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
504                     byteCount++;
505                     goto EncodeChar;
506                 }
507
508                 if (ch > 0)
509                 {
510                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
511                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
512
513                     // use separate helper variables for local contexts so that the jit optimizations
514                     // won't get confused about the variable lifetimes
515                     int cha = *pSrc;
516
517                     // count the pending surrogate
518                     byteCount++;
519
520                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
521                     // if (IsLowSurrogate(cha)) {
522                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
523                     {
524                         // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
525                         ch = 0xfffd;
526                         //                        ch = cha + (ch << 10) +
527                         //                            (0x10000
528                         //                            - CharUnicodeInfo.LOW_SURROGATE_START
529                         //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
530
531                         // Use this next char
532                         pSrc++;
533                     }
534                     // else ch is still high surrogate and encoding will fail (so don't add count)
535
536                     // attempt to encode the surrogate or partial surrogate
537                     goto EncodeChar;
538                 }
539
540                 // If we've used a fallback, then we have to check for it
541                 if (fallbackBuffer != null)
542                 {
543                     ch = fallbackBuffer.InternalGetNextChar();
544                     if (ch > 0)
545                     {
546                         // We have an extra byte we weren't expecting.
547                         byteCount++;
548                         goto ProcessChar;
549                     }
550                 }
551
552                 // read next char. The JIT optimization seems to be getting confused when
553                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
554                 ch = *pSrc;
555                 pSrc++;
556
557             ProcessChar:
558                 // if (IsHighSurrogate(ch)) {
559                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
560                 {
561                     // we will count this surrogate next time around
562                     byteCount--;
563                     continue;
564                 }
565             // either good char or partial surrogate
566
567             EncodeChar:
568                 // throw exception on partial surrogate if necessary
569                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
570                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
571                 {
572                     // Lone surrogates aren't allowed
573                     // Have to make a fallback buffer if we don't have one
574                     if (fallbackBuffer == null)
575                     {
576                         // wait on fallbacks if we can
577                         // For fallback we may need a fallback buffer
578                         if (baseEncoder == null)
579                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
580                         else
581                             fallbackBuffer = baseEncoder.FallbackBuffer;
582
583                         // Set our internal fallback interesting things.
584                         fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
585                     }
586
587                     // Do our fallback.  Actually we already know its a mixed up surrogate,
588                     // so the ref pSrc isn't gonna do anything.
589                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
590                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
591                     pSrc = pSrcForFallback;
592
593                     // Ignore it if we don't throw (we had preallocated this ch)
594                     byteCount--;
595                     ch = 0;
596                     continue;
597                 }
598
599                 // Count them
600                 if (ch > 0x7F)
601                 {
602                     if (ch > 0x7FF)
603                     {
604                         // the extra surrogate byte was compensated by the second surrogate character
605                         // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
606                         byteCount++;
607                     }
608                     byteCount++;
609                 }
610
611 #if BIT64
612                 // check for overflow
613                 if (byteCount < 0)
614                 {
615                     break;
616                 }
617 #endif
618
619 #if FASTLOOP
620                 // If still have fallback don't do fast loop
621                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
622                 {
623                     // We're reserving 1 byte for each char by default
624                     byteCount++;
625                     goto ProcessChar;
626                 }
627
628                 int availableChars = PtrDiff(pEnd, pSrc);
629
630                 // don't fall into the fast decoding loop if we don't have enough characters
631                 if (availableChars <= 13)
632                 {
633                     // try to get over the remainder of the ascii characters fast though
634                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
635                     while (pSrc < pLocalEnd)
636                     {
637                         ch = *pSrc;
638                         pSrc++;
639                         if (ch > 0x7F)
640                             goto ProcessChar;
641                     }
642
643                     // we are done
644                     break;
645                 }
646
647 #if BIT64
648                 // make sure that we won't get a silent overflow inside the fast loop
649                 // (Fall out to slow loop if we have this many characters)
650                 availableChars &= 0x0FFFFFFF;
651 #endif
652
653                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
654                 //  the boundary will be decreased for every non-ASCII character we encounter
655                 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
656                 char* pStop = pSrc + availableChars - (3 + 4);
657
658                 while (pSrc < pStop)
659                 {
660                     ch = *pSrc;
661                     pSrc++;
662
663                     if (ch > 0x7F)                                                  // Not ASCII
664                     {
665                         if (ch > 0x7FF)                                             // Not 2 Byte
666                         {
667                             if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
668                                 goto LongCode;
669                             byteCount++;
670                         }
671                         byteCount++;
672                     }
673
674                     // get pSrc aligned
675                     if ((unchecked((int)pSrc) & 0x2) != 0)
676                     {
677                         ch = *pSrc;
678                         pSrc++;
679                         if (ch > 0x7F)                                              // Not ASCII
680                         {
681                             if (ch > 0x7FF)                                         // Not 2 Byte
682                             {
683                                 if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
684                                     goto LongCode;
685                                 byteCount++;
686                             }
687                             byteCount++;
688                         }
689                     }
690
691                     // Run 2 * 4 characters at a time!
692                     while (pSrc < pStop)
693                     {
694                         ch = *(int*)pSrc;
695                         int chc = *(int*)(pSrc + 2);
696                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
697                         {
698                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
699                             {
700                                 goto LongCodeWithMask;
701                             }
702
703
704                             if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
705                                 byteCount++;
706                             if ((ch & unchecked((int)0xFF80)) != 0)
707                                 byteCount++;
708                             if ((chc & unchecked((int)0xFF800000)) != 0)
709                                 byteCount++;
710                             if ((chc & unchecked((int)0xFF80)) != 0)
711                                 byteCount++;
712                         }
713                         pSrc += 4;
714
715                         ch = *(int*)pSrc;
716                         chc = *(int*)(pSrc + 2);
717                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
718                         {
719                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
720                             {
721                                 goto LongCodeWithMask;
722                             }
723
724                             if ((ch & unchecked((int)0xFF800000)) != 0)
725                                 byteCount++;
726                             if ((ch & unchecked((int)0xFF80)) != 0)
727                                 byteCount++;
728                             if ((chc & unchecked((int)0xFF800000)) != 0)
729                                 byteCount++;
730                             if ((chc & unchecked((int)0xFF80)) != 0)
731                                 byteCount++;
732                         }
733                         pSrc += 4;
734                     }
735                     break;
736
737                 LongCodeWithMask:
738 #if BIGENDIAN
739                     // be careful about the sign extension
740                     ch = (int)(((uint)ch) >> 16);
741 #else // BIGENDIAN
742                     ch = (char)ch;
743 #endif // BIGENDIAN
744                     pSrc++;
745
746                     if (ch <= 0x7F)
747                     {
748                         continue;
749                     }
750
751                 LongCode:
752                     // use separate helper variables for slow and fast loop so that the jit optimizations
753                     // won't get confused about the variable lifetimes
754                     if (ch > 0x7FF)
755                     {
756                         // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
757                         if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
758                         {
759                             // 4 byte encoding - high surrogate + low surrogate
760
761                             int chd = *pSrc;
762                             if (
763                                 // !IsHighSurrogate(ch) // low without high -> bad
764                                 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
765                                 // !IsLowSurrogate(chd) // high not followed by low -> bad
766                                 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
767                             {
768                                 // Back up and drop out to slow loop to figure out error
769                                 pSrc--;
770                                 break;
771                             }
772                             pSrc++;
773
774                             // byteCount - this byte is compensated by the second surrogate character
775                         }
776                         byteCount++;
777                     }
778                     byteCount++;
779
780                     // byteCount - the last byte is already included
781                 }
782 #endif // FASTLOOP
783
784                 // no pending char at this point
785                 ch = 0;
786             }
787
788 #if BIT64
789             // check for overflow
790             if (byteCount < 0)
791             {
792                 throw new ArgumentException(
793                         SR.Argument_ConversionOverflow);
794             }
795 #endif
796
797             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
798                 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
799
800             return byteCount;
801         }
802
803         // Returns the distance from b to a, measured in chars, using unsigned arithmetic.
804         // The unsigned arithmetic is good enough for us, and it tends to generate better
805         // code than the signed arithmetic generated by default.
            //
            // The byte distance (a - b) is narrowed to a uint and shifted right by one, i.e. halved
            // to turn a byte count into a count of 2-byte chars. Callers in this file pass (pEnd, pSrc).
            // If a were below b the narrowed difference would wrap around to a large value, so callers
            // are expected to pass a >= b.
806         unsafe private static int PtrDiff(char* a, char* b)
807         {
808             return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
809         }
810
811         // byte* flavor just for parity with the char* overload.
            // Returns a - b in bytes, narrowed to int; unlike the char* overload there is no
            // unsigned step and no shift.
812         unsafe private static int PtrDiff(byte* a, byte* b)
813         {
814             return (int)(a - b);
815         }
816
            // Returns true when start <= ch <= end, using a single unsigned comparison.
            // A ch below start makes (ch - start) negative, which becomes a large value once cast
            // to uint and therefore fails the test. Assumes start <= end.
817         private static bool InRange(int ch, int start, int end)
818         {
819             return (uint)(ch - start) <= (uint)(end - start);
820         }
821
822         // Our workhorse
823         // Note:  We ignore mismatched surrogates, unless the exception flag is set in which case we throw
            //
            // Encodes the charCount chars starting at 'chars' as UTF-8 into the buffer of byteCount
            // bytes starting at 'bytes', and returns the number of bytes written (pTarget - bytes).
            //
            // baseEncoder may be null. When it is supplied it is cast to UTF8Encoder: its surrogateChar
            // seeds the pending char on entry, and on exit surrogateChar and m_charsUsed are written
            // back with the leftover char and the number of chars consumed (pSrc - chars).
            //
            // Lone or mismatched surrogates are handed to the encoder fallback buffer. When the output
            // buffer cannot take the next character, the source (or fallback) position is backed up and
            // ThrowBytesOverflow is called; if that call returns, encoding stops with what was written.
            //
            // Throws ArgumentException when the encoder's fallback buffer still holds data on entry
            // and encoder.m_throwOnOverflow is set.
            //
            // Structure: an outer slow loop that handles every special case, and (under FASTLOOP) an
            // inner fast loop that copies ASCII several chars at a time and drops back to the slow
            // loop whenever it meets something it cannot handle.
824         internal override unsafe int GetBytes(char* chars, int charCount,
825                                                 byte* bytes, int byteCount, EncoderNLS baseEncoder)
826         {
827             Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
828             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
829             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
830             Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
831
832             UTF8Encoder encoder = null;
833
834             // For fallback we may need a fallback buffer.
835             // We wait to initialize it though in case we don't have any broken input unicode
836             EncoderFallbackBuffer fallbackBuffer = null;
837             char* pSrcForFallback;
838
839             char* pSrc = chars;
840             byte* pTarget = bytes;
841
842             char* pEnd = pSrc + charCount;
843             byte* pAllocatedBufferEnd = pTarget + byteCount;
844
845             int ch = 0;
846
847             // assume that JIT will enregister pSrc, pTarget and ch
848
849             if (baseEncoder != null)
850             {
851                 encoder = (UTF8Encoder)baseEncoder;
852                 ch = encoder.surrogateChar;
853
854                 // We mustn't have leftover fallback data on entry (only enforced when m_throwOnOverflow is set)
855                 if (encoder.InternalHasFallbackBuffer)
856                 {
857                     // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
858                     fallbackBuffer = encoder.FallbackBuffer;
859                     if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
860                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
861
862                     // Set our internal fallback interesting things.
863                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
864                 }
865             }
866
867             for (;;)
868             {
869                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
870
871                 if (pSrc >= pEnd)
872                 {
873                     if (ch == 0)
874                     {
875                         // Check if there's anything left to get out of the fallback buffer
876                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
877                         if (ch > 0)
878                         {
879                             goto ProcessChar;
880                         }
881                     }
882                     else
883                     {
884                         // Case of leftover surrogates in the fallback buffer
885                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
886                         {
887                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
888                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
889
890                             int cha = ch;
891
892                             ch = fallbackBuffer.InternalGetNextChar();
893
894                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
895                             {
896                                 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
897                                 goto EncodeChar;
898                             }
899                             else if (ch > 0)
900                             {
901                                 goto ProcessChar;
902                             }
903                             else
904                             {
905                                 break;
906                             }
907                         }
908                     }
909
910                     // attempt to encode the partial surrogate (will fail or ignore)
911                     if (ch > 0 && (encoder == null || encoder.MustFlush))
912                         goto EncodeChar;
913
914                     // We're done
915                     break;
916                 }
917
918                 if (ch > 0)
919                 {
920                     // We have a high surrogate left over from a previous loop.
921                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
922                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
923
924                     // use separate helper variables for local contexts so that the jit optimizations
925                     // won't get confused about the variable lifetimes
926                     int cha = *pSrc;
927
928                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
929                     // if (IsLowSurrogate(cha)) {
930                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
931                     {
932                         ch = cha + (ch << 10) +
933                             (0x10000
934                             - CharUnicodeInfo.LOW_SURROGATE_START
935                             - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
936
937                         pSrc++;
938                     }
939                     // else ch is still high surrogate and encoding will fail
940
941                     // attempt to encode the surrogate or partial surrogate
942                     goto EncodeChar;
943                 }
944
945                 // If we've used a fallback, then we have to check for it
946                 if (fallbackBuffer != null)
947                 {
948                     ch = fallbackBuffer.InternalGetNextChar();
949                     if (ch > 0) goto ProcessChar;
950                 }
951
952                 // read next char. The JIT optimization seems to be getting confused when
953                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
954                 ch = *pSrc;
955                 pSrc++;
956
957             ProcessChar:
958                 // if (IsHighSurrogate(ch)) {
959                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
960                 {
961                     continue;
962                 }
963             // either good char or partial surrogate
964
965             EncodeChar:
966                 // throw exception on partial surrogate if necessary
967                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
968                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
969                 {
970                     // Lone surrogates aren't allowed, we have to do fallback for them
971                     // Have to make a fallback buffer if we don't have one
972                     if (fallbackBuffer == null)
973                     {
974                         // wait on fallbacks if we can
975                         // For fallback we may need a fallback buffer
976                         if (baseEncoder == null)
977                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
978                         else
979                             fallbackBuffer = baseEncoder.FallbackBuffer;
980
981                         // Set our internal fallback interesting things.
982                         fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
983                     }
984
985                     // Do our fallback.  Actually we already know its a mixed up surrogate,
986                     // so the ref pSrc isn't gonna do anything.
987                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
988                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
989                     pSrc = pSrcForFallback;
990
991                     // Ignore it if we don't throw
992                     ch = 0;
993                     continue;
994                 }
995
996                 // Count bytes needed
997                 int bytesNeeded = 1;
998                 if (ch > 0x7F)
999                 {
1000                     if (ch > 0x7FF)
1001                     {
1002                         if (ch > 0xFFFF)
1003                         {
1004                             bytesNeeded++;  // 4 bytes (surrogate pair)
1005                         }
1006                         bytesNeeded++;      // 3 bytes (800-FFFF)
1007                     }
1008                     bytesNeeded++;          // 2 bytes (80-7FF)
1009                 }
1010
1011                 if (pTarget > pAllocatedBufferEnd - bytesNeeded)  // not enough room left for this character
1012                 {
1013                     // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1014                     if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1015                     {
1016                         fallbackBuffer.MovePrevious();              // Didn't use this fallback char
1017                         if (ch > 0xFFFF)
1018                             fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
1019                     }
1020                     else
1021                     {
1022                         pSrc--;                                     // Didn't use this char
1023                         if (ch > 0xFFFF)
1024                             pSrc--;                                 // Was surrogate, didn't use 2nd part either
1025                     }
1026                     Debug.Assert(pSrc >= chars || pTarget == bytes,
1027                         "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1028                     ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
1029                     ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
1030                     break;
1031                 }
1032
1033                 if (ch <= 0x7F)
1034                 {
1035                     *pTarget = (byte)ch;
1036                 }
1037                 else
1038                 {
1039                     // use separate helper variables for local contexts so that the jit optimizations
1040                     // won't get confused about the variable lifetimes
1041                     int chb;
1042                     if (ch <= 0x7FF)
1043                     {
1044                         // 2 byte encoding
1045                         chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1046                     }
1047                     else
1048                     {
1049                         if (ch <= 0xFFFF)
1050                         {
1051                             chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1052                         }
1053                         else
1054                         {
1055                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1056                             pTarget++;
1057
1058                             chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;  // & binds tighter than |: 0x80 | ((ch >> 12) & 0x3F)
1059                         }
1060                         *pTarget = (byte)chb;
1061                         pTarget++;
1062
1063                         chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1064                     }
1065                     *pTarget = (byte)chb;
1066                     pTarget++;
1067
1068                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1069                 }
1070                 pTarget++;
1071
1072
1073 #if FASTLOOP
1074                 // If still have fallback don't do fast loop
1075                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1076                     goto ProcessChar;
1077
1078                 int availableChars = PtrDiff(pEnd, pSrc);
1079                 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1080
1081                 // don't fall into the fast decoding loop if we don't have enough characters
1082                 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1083                 if (availableChars <= 13)
1084                 {
1085                     // we are hoping for 1 byte per char
1086                     if (availableBytes < availableChars)
1087                     {
1088                         // not enough output room.  no pending bits at this point
1089                         ch = 0;
1090                         continue;
1091                     }
1092
1093                     // try to get over the remainder of the ascii characters fast though
1094                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1095                     while (pSrc < pLocalEnd)
1096                     {
1097                         ch = *pSrc;
1098                         pSrc++;
1099
1100                         // Not ASCII, need more than 1 byte per char
1101                         if (ch > 0x7F)
1102                             goto ProcessChar;
1103
1104                         *pTarget = (byte)ch;
1105                         pTarget++;
1106                     }
1107                     // we are done, let ch be 0 to clear encoder
1108                     ch = 0;
1109                     break;
1110                 }
1111
1112                 // we need at least 1 byte per character, but Convert might allow us to convert
1113                 // only part of the input, so try as much as we can.  Reduce charCount if necessary
1114                 if (availableBytes < availableChars)
1115                 {
1116                     availableChars = availableBytes;
1117                 }
1118
1119                 // FASTLOOP:
1120                 // - optimistic range checks
1121                 // - falls back to the slow loop for all special cases, exception throwing, etc.
1122
1123                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1124                 //  the boundary will be decreased for every non-ASCII character we encounter
1125                 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
1126                 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1127                 char* pStop = pSrc + availableChars - 5;
1128
1129                 while (pSrc < pStop)
1130                 {
1131                     ch = *pSrc;
1132                     pSrc++;
1133
1134                     if (ch > 0x7F)
1135                     {
1136                         goto LongCode;
1137                     }
1138                     *pTarget = (byte)ch;
1139                     pTarget++;
1140
1141                     // get pSrc aligned: if bit 1 of the address is set, consume one more char so the int reads below start on a 4-byte boundary (assuming chars is 2-byte aligned)
1142                     if ((unchecked((int)pSrc) & 0x2) != 0)
1143                     {
1144                         ch = *pSrc;
1145                         pSrc++;
1146                         if (ch > 0x7F)
1147                         {
1148                             goto LongCode;
1149                         }
1150                         *pTarget = (byte)ch;
1151                         pTarget++;
1152                     }
1153
1154                     // Run 4 characters at a time!
1155                     while (pSrc < pStop)
1156                     {
1157                         ch = *(int*)pSrc;
1158                         int chc = *(int*)(pSrc + 2);
1159                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
1160                         {
1161                             goto LongCodeWithMask;
1162                         }
1163
1164                         // Unfortunately, this is endianness-sensitive
1165 #if BIGENDIAN
1166                         *pTarget = (byte)(ch>>16);
1167                         *(pTarget+1) = (byte)ch;
1168                         pSrc += 4;
1169                         *(pTarget+2) = (byte)(chc>>16);
1170                         *(pTarget+3) = (byte)chc;
1171                         pTarget += 4;
1172 #else // BIGENDIAN
1173                         *pTarget = (byte)ch;
1174                         *(pTarget + 1) = (byte)(ch >> 16);
1175                         pSrc += 4;
1176                         *(pTarget + 2) = (byte)chc;
1177                         *(pTarget + 3) = (byte)(chc >> 16);
1178                         pTarget += 4;
1179 #endif // BIGENDIAN
1180                     }
1181                     continue;
1182
1183                 LongCodeWithMask:
1184 #if BIGENDIAN
1185                     // be careful about the sign extension
1186                     ch = (int)(((uint)ch) >> 16);
1187 #else // BIGENDIAN
1188                     ch = (char)ch;  // keep only the low 16 bits of the packed int read
1189 #endif // BIGENDIAN
1190                     pSrc++;
1191
1192                     if (ch > 0x7F)
1193                     {
1194                         goto LongCode;
1195                     }
1196                     *pTarget = (byte)ch;
1197                     pTarget++;
1198                     continue;
1199
1200                 LongCode:
1201                     // use separate helper variables for slow and fast loop so that the jit optimizations
1202                     // won't get confused about the variable lifetimes
1203                     int chd;
1204                     if (ch <= 0x7FF)
1205                     {
1206                         // 2 byte encoding
1207                         chd = unchecked((sbyte)0xC0) | (ch >> 6);
1208                     }
1209                     else
1210                     {
1211                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1212                         if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1213                         {
1214                             // 3 byte encoding
1215                             chd = unchecked((sbyte)0xE0) | (ch >> 12);
1216                         }
1217                         else
1218                         {
1219                             // 4 byte encoding - high surrogate + low surrogate
1220                             // if (!IsHighSurrogate(ch))
1221                             if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
1222                             {
1223                                 // low without high -> bad, try again in slow loop
1224                                 pSrc -= 1;
1225                                 break;
1226                             }
1227
1228                             chd = *pSrc;
1229                             pSrc++;
1230
1231                             // if (!IsLowSurrogate(chd)) {
1232                             if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1233                             {
1234                                 // high not followed by low -> bad, try again in slow loop
1235                                 pSrc -= 2;
1236                                 break;
1237                             }
1238
1239                             ch = chd + (ch << 10) +
1240                                 (0x10000
1241                                 - CharUnicodeInfo.LOW_SURROGATE_START
1242                                 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
1243
1244                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1245                             // pStop - this byte is compensated by the second surrogate character
1246                             // 2 input chars require 4 output bytes.  2 have been anticipated already
1247                             // and 2 more will be accounted for by the 2 pStop-- calls below.
1248                             pTarget++;
1249
1250                             chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1251                         }
1252                         *pTarget = (byte)chd;
1253                         pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1254                         pTarget++;
1255
1256                         chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1257                     }
1258                     *pTarget = (byte)chd;
1259                     pStop--;                        // 2 byte sequence for 1 char so need pStop--.
1260                     pTarget++;
1261
1262                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1263                     // pStop - this byte is already included
1264                     pTarget++;
1265                 }
1266
1267                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1268
1269 #endif // FASTLOOP
1270
1271                 // no pending char at this point
1272                 ch = 0;
1273             }
1274
1275             // Do we have to set the encoder bytes?
1276             if (encoder != null)
1277             {
1278                 Debug.Assert(!encoder.MustFlush || ch == 0,
1279                     "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1280
1281                 encoder.surrogateChar = ch;
1282                 encoder.m_charsUsed = (int)(pSrc - chars);
1283             }
1284
1285             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1286                 baseEncoder == null || !baseEncoder.m_throwOnOverflow,
1287                 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1288
1289             return (int)(pTarget - bytes);
1290         }
1291
1292
1293         // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
1294         // while the actual character is being built in the lower bits. They are shifted together
1295         // with the actual bits of the character.
1296
1297         // bits 30 & 31 are used for pending bits fixup (GetCharCount adjusts its count by ch >> 30)
1298         private const int FinalByte = 1 << 29;         // while still clear after a trailing byte is folded in, GetCharCount treats the sequence as unfinished
1299         private const int SupplimentarySeq = 1 << 28;  // tested by GetCharCount to recognise a 4-byte (supplementary) sequence in progress
1300         private const int ThreeByteSeq = 1 << 27;      // presumably marks a 3-byte sequence in progress; where it is set is outside this excerpt - confirm
1301
1302         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1303         //        If exceptions aren't turned on, then we drop all non-shortest forms and individual surrogates.
1304         //
1305         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1306         // kept the same as much as possible
1307         internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1308         {
                 // Counts the UTF-16 chars that decoding `count` bytes starting at `bytes` would produce,
                 // without writing any output.
                 //
                 // bytes       - start of the input; must be non-null (asserted).
                 // count       - number of input bytes; must be >= 0 (asserted).
                 // baseDecoder - optional decoder; when non-null it is cast to UTF8Decoder, its pending
                 //               `bits` seed the decoding state, and its FallbackBuffer / MustFlush are used.
                 //               This method reads decoder.bits but never writes it back.
                 //
                 // Returns charCount, which starts at `count` (one char per byte) and is decremented as
                 // multi-byte sequences are recognized; for invalid input it is adjusted by whatever
                 // FallbackInvalidByteSequence returns.
1309             Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1310             Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
1311
1312             // Initialize stuff
1313             byte* pSrc = bytes;
1314             byte* pEnd = pSrc + count;
1315
1316             // Start by assuming we have as many as count, charCount always includes the adjustment
1317             // for the character being decoded
1318             int charCount = count;
1319             int ch = 0; // pending partial-sequence state (payload bits plus flag bits); 0 means nothing is pending
1320             DecoderFallbackBuffer fallback = null;
1321
1322             if (baseDecoder != null)
1323             {
1324                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1325                 ch = decoder.bits;
1326                 charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
1327
1328                 // Shouldn't have anything in fallback buffer for GetCharCount
1329                 // (don't have to check m_throwOnOverflow for count)
1330                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1331                     "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1332             }
1333
1334             for (;;)
1335             {
1336                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1337
1338                 if (pSrc >= pEnd)
1339                 {
1340                     break;
1341                 }
1342
1343                 if (ch == 0)
1344                 {
1345                     // no pending bits
1346                     goto ReadChar;
1347                 }
1348
1349                 // read next byte. The JIT optimization seems to be getting confused when
1350                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1351                 int cha = *pSrc;
1352                 pSrc++;
1353
1354                 // we are expecting to see trailing bytes like 10vvvvvv
1355                 if ((cha & unchecked((sbyte)0xC0)) != 0x80) // (sbyte)0xC0 sign-extends to 0xFFFFFFC0; cha is a byte value, so this tests its top two bits
1356                 {
1357                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1358                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1359                     pSrc--;
1360                     charCount += (ch >> 30); // undo the adjustment that was applied for the sequence being abandoned
1361                     goto InvalidByteSequence;
1362                 }
1363
1364                 // fold in the new byte
1365                 ch = (ch << 6) | (cha & 0x3F);
1366
1367                 if ((ch & FinalByte) == 0) // not at the last byte of the sequence yet
1368                 {
1369                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1370                         "[UTF8Encoding.GetCharCount]Invariant violation");
1371
1372                     if ((ch & SupplimentarySeq) != 0)
1373                     {
1374                         if ((ch & (FinalByte >> 6)) != 0)
1375                         {
1376                             // this is 3rd byte (of 4 byte supplimentary) - nothing to do
1377                             continue;
1378                         }
1379
1380                         // 2nd byte, check for non-shortest form of supplimentary char and the valid
1381                         // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
1382                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1383                         {
1384                             goto InvalidByteSequence;
1385                         }
1386                     }
1387                     else
1388                     {
1389                         // Must be 2nd byte of a 3-byte sequence
1390                         // check for non-shortest form of 3 byte seq
1391                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1392                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1393                         {
1394                             goto InvalidByteSequence;
1395                         }
1396                     }
1397                     continue;
1398                 }
1399
1400                 // ready to punch
1401
1402                 // adjust for surrogates in non-shortest form
1403                 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
1404                 {
1405                     charCount--;
1406                 }
1407                 goto EncodeChar;
1408
1409             InvalidByteSequence:
1410                 // this code fragment should be close to the gotos referencing it
1411                 // Have to do fallback for invalid bytes
1412                 if (fallback == null)
1413                 {
1414                     // Lazily obtain the fallback buffer: a fresh one from this encoding's decoderFallback when
1415                     // no decoder was supplied, otherwise the decoder's own buffer.
1415                     if (baseDecoder == null)
1415                         fallback = this.decoderFallback.CreateFallbackBuffer();
1416                     else
1417                         fallback = baseDecoder.FallbackBuffer;
1418                     fallback.InternalInitialize(bytes, null);
1419                 }
1420                 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); // presumably the number of chars the fallback yields - confirm against the helper
1421
1422                 ch = 0;
1423                 continue;
1424
1425             ReadChar:
1426                 ch = *pSrc;
1427                 pSrc++;
1428
1429             ProcessChar:
1430                 if (ch > 0x7F)
1431                 {
1432                     // If its > 0x7F, its start of a new multi-byte sequence
1433
1434                     // Long sequence, so unreserve our char.
1435                     charCount--;
1436
1437                     // bit 6 has to be non-zero for start of multibyte chars.
1438                     if ((ch & 0x40) == 0)
1439                     {
1440                         // Unexpected trail byte
1441                         goto InvalidByteSequence;
1442                     }
1443
1444                     // start a new long code
1445                     if ((ch & 0x20) != 0)
1446                     {
1447                         if ((ch & 0x10) != 0)
1448                         {
1449                             // 4 byte encoding - supplimentary character (2 surrogates)
1450
1451                             ch &= 0x0F;
1452
1453                             // check that bit 4 is zero and the valid supplimentary character
1454                             // range 0x000000 - 0x10FFFF at the same time
1455                             if (ch > 0x04)
1456                             {
1457                                 ch |= 0xf0; // restore the lead-byte prefix before handing the byte to the fallback
1458                                 goto InvalidByteSequence;
1459                             }
1460
1461                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1462                             // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
1463                             ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
1464                                   (1 << 30) |           // If it dies on next byte we'll need an extra char
1465                                   (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
1466                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1467                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1468
1469                             // Our character count will be 2 characters for these 4 bytes, so subtract another char
1470                             charCount--;
1471                         }
1472                         else
1473                         {
1474                             // 3 byte encoding
1475                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1476                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1477                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1478
1479                             // We'll expect 1 character for these 3 bytes, so subtract another char.
1480                             charCount--;
1481                         }
1482                     }
1483                     else
1484                     {
1485                         // 2 byte encoding
1486
1487                         ch &= 0x1F;
1488
1489                         // check for non-shortest form
1490                         if (ch <= 1)
1491                         {
1492                             ch |= 0xc0; // restore the lead-byte prefix before handing the byte to the fallback
1493                             goto InvalidByteSequence;
1494                         }
1495
1496                         // Add bit flags so we'll be flagged correctly
1497                         ch |= (FinalByte >> 6);
1498                     }
1499                     continue;
1500                 }
1501
1502             EncodeChar:
1503
1504 #if FASTLOOP
1505                 int availableBytes = PtrDiff(pEnd, pSrc);
1506
1507                 // don't fall into the fast decoding loop if we don't have enough bytes
1508                 if (availableBytes <= 13)
1509                 {
1510                     // try to get over the remainder of the ascii characters fast though
1511                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1512                     while (pSrc < pLocalEnd)
1513                     {
1514                         ch = *pSrc;
1515                         pSrc++;
1516
1517                         if (ch > 0x7F)
1518                             goto ProcessChar;
1519                     }
1520                     // we are done
1521                     ch = 0;
1522                     break;
1523                 }
1524
1525                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1526                 //  the boundary will be decreased for every non-ASCII character we encounter
1527                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
                     // NOTE(review): in this method pStop is not modified after it is computed below.
1528                 byte* pStop = pSrc + availableBytes - 7;
1529
1530                 while (pSrc < pStop)
1531                 {
1532                     ch = *pSrc;
1533                     pSrc++;
1534
1535                     if (ch > 0x7F)
1536                     {
1537                         goto LongCode;
1538                     }
1539
1540                     // get pSrc 2-byte aligned
1541                     if ((unchecked((int)pSrc) & 0x1) != 0)
1542                     {
1543                         ch = *pSrc;
1544                         pSrc++;
1545                         if (ch > 0x7F)
1546                         {
1547                             goto LongCode;
1548                         }
1549                     }
1550
1551                     // get pSrc 4-byte aligned
1552                     if ((unchecked((int)pSrc) & 0x2) != 0)
1553                     {
1554                         ch = *(ushort*)pSrc;
1555                         if ((ch & 0x8080) != 0)
1556                         {
1557                             goto LongCodeWithMask16;
1558                         }
1559                         pSrc += 2;
1560                     }
1561
1562                     // Run 8 + 8 characters at a time!
1563                     while (pSrc < pStop)
1564                     {
1565                         ch = *(int*)pSrc;
1566                         int chb = *(int*)(pSrc + 4);
1567                         if (((ch | chb) & unchecked((int)0x80808080)) != 0) // any byte with its high bit set means non-ASCII
1568                         {
1569                             goto LongCodeWithMask32;
1570                         }
1571                         pSrc += 8;
1572
1573                         // This is a really small loop - unroll it
1574                         if (pSrc >= pStop)
1575                             break;
1576
1577                         ch = *(int*)pSrc;
1578                         chb = *(int*)(pSrc + 4);
1579                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1580                         {
1581                             goto LongCodeWithMask32;
1582                         }
1583                         pSrc += 8;
1584                     }
1585                     break;
1586
                     // The labels below isolate the first byte of the word that was just read (pSrc has not
                     // been advanced past that word yet) and then consume that single byte.
1587 #if BIGENDIAN
1588                 LongCodeWithMask32:
1589                     // be careful about the sign extension
1590                     ch = (int)(((uint)ch) >> 16);
1591                 LongCodeWithMask16:
1592                     ch = (int)(((uint)ch) >> 8);
1593 #else // BIGENDIAN
1594                 LongCodeWithMask32:
1595                 LongCodeWithMask16:
1596                     ch &= 0xFF;
1597 #endif // BIGENDIAN
1598                     pSrc++;
1599                     if (ch <= 0x7F)
1600                     {
1601                         continue;
1602                     }
1603
1604                 LongCode:
                     // Here ch is a lead byte (> 0x7F) and pSrc points just past it; chc receives the first trail byte.
1605                     int chc = *pSrc;
1606                     pSrc++;
1607
1608                     if (
1609                         // bit 6 has to be zero
1610                         (ch & 0x40) == 0 ||
1611                         // we are expecting to see trailing bytes like 10vvvvvv
1612                         (chc & unchecked((sbyte)0xC0)) != 0x80)
1613                     {
1614                         goto BadLongCode;
1615                     }
1616
1617                     chc &= 0x3F;
1618
1619                     // start a new long code
1620                     if ((ch & 0x20) != 0)
1621                     {
1622                         // fold the first two bytes together
1623                         chc |= (ch & 0x0F) << 6;
1624
1625                         if ((ch & 0x10) != 0)
1626                         {
1627                             // 4 byte encoding - surrogate
1628                             ch = *pSrc;
1629                             if (
1630                                 // check that bit 4 is zero, the non-shortest form of surrogate
1631                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1632                                 !InRange(chc >> 4, 0x01, 0x10) ||
1633                                 // we are expecting to see trailing bytes like 10vvvvvv
1634                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1635                             {
1636                                 goto BadLongCode;
1637                             }
1638
1639                             chc = (chc << 6) | (ch & 0x3F);
1640
1641                             ch = *(pSrc + 1);
1642                             // we are expecting to see trailing bytes like 10vvvvvv
1643                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
1644                             {
1645                                 goto BadLongCode;
1646                             }
1647                             pSrc += 2;
1648
1649                             // extra byte
1650                             charCount--;
1651                         }
1652                         else
1653                         {
1654                             // 3 byte encoding
1655                             ch = *pSrc;
1656                             if (
1657                                 // check for non-shortest form of 3 byte seq
1658                                 (chc & (0x1F << 5)) == 0 ||
1659                                 // Can't have surrogates here.
1660                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1661                                 // we are expecting to see trailing bytes like 10vvvvvv
1662                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1663                             {
1664                                 goto BadLongCode;
1665                             }
1666                             pSrc++;
1667
1668                             // extra byte
1669                             charCount--;
1670                         }
1671                     }
1672                     else
1673                     {
1674                         // 2 byte encoding
1675
1676                         // check for non-shortest form
1677                         if ((ch & 0x1E) == 0)
1678                         {
1679                             goto BadLongCode;
1680                         }
1681                     }
1682
1683                     // extra byte
1684                     charCount--;
1685                 }
1686 #endif // FASTLOOP
1687
1688                 // no pending bits at this point
1689                 ch = 0;
1690                 continue;
1691
1692             BadLongCode:
                 // Every jump here happens with pSrc exactly two bytes past the lead byte (the lead byte and
                 // the first trail byte were consumed), and before charCount was adjusted for the sequence.
                 // Back up to the lead byte and clear ch so the slow loop re-reads the sequence and handles it.
1693                 pSrc -= 2;
1694                 ch = 0;
1695                 continue;
1696             }
1697
1698             // May have a problem if we have to flush
1699             if (ch != 0)
1700             {
1701                 // We were already adjusting for these, so need to unadjust
1702                 charCount += (ch >> 30);
1703                 if (baseDecoder == null || baseDecoder.MustFlush)
1704                 {
                         // No decoder, or the decoder must flush: the leftover partial sequence goes through the fallback.
1705                     // Have to do fallback for invalid bytes
1706                     if (fallback == null)
1707                     {
1708                         if (baseDecoder == null)
1709                             fallback = this.decoderFallback.CreateFallbackBuffer();
1710                         else
1711                             fallback = baseDecoder.FallbackBuffer;
1712                         fallback.InternalInitialize(bytes, null);
1713                     }
1714                     charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1715                 }
1716             }
1717
1718             // Shouldn't have anything in fallback buffer for GetCharCount
1719             // (don't have to check m_throwOnOverflow for count)
1720             Debug.Assert(fallback == null || fallback.Remaining == 0,
1721                 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1722
1723             return charCount;
1724         }
1725
1726         // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
1727         //           So if we're really broken, then that could also throw an error... recursively.
1728         //           So try to make sure GetChars can at least process all uses by
1729         //           System.Resources.ResourceReader!
1730         //
1731         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1732         //        If exceptions aren't turned on, then we drop all non-shortest forms and individual surrogates.
1733         //
1734         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1735         // kept the same as much as possible
1736         internal override unsafe int GetChars(byte* bytes, int byteCount,
1737                                                 char* chars, int charCount, DecoderNLS baseDecoder)
1738         {
1739             Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
1740             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
1741             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1742             Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
1743
1744             byte* pSrc = bytes;
1745             char* pTarget = chars;
1746
1747             byte* pEnd = pSrc + byteCount;
1748             char* pAllocatedBufferEnd = pTarget + charCount;
1749
1750             int ch = 0;
1751
1752             DecoderFallbackBuffer fallback = null;
1753             byte* pSrcForFallback;
1754             char* pTargetForFallback;
1755             if (baseDecoder != null)
1756             {
1757                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1758                 ch = decoder.bits;
1759
1760                 // Shouldn't have anything in fallback buffer for GetChars
1761                 // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
1762                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1763                     "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1764             }
1765
1766             for (;;)
1767             {
1768                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1769
1770                 if (pSrc >= pEnd)
1771                 {
1772                     break;
1773                 }
1774
1775                 if (ch == 0)
1776                 {
1777                     // no pending bits
1778                     goto ReadChar;
1779                 }
1780
1781                 // read next byte. The JIT optimization seems to be getting confused when
1782                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1783                 int cha = *pSrc;
1784                 pSrc++;
1785
1786                 // we are expecting to see trailing bytes like 10vvvvvv
1787                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1788                 {
1789                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1790                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1791                     pSrc--;
1792                     goto InvalidByteSequence;
1793                 }
1794
1795                 // fold in the new byte
1796                 ch = (ch << 6) | (cha & 0x3F);
1797
1798                 if ((ch & FinalByte) == 0)
1799                 {
1800                     // Not at last byte yet
1801                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1802                         "[UTF8Encoding.GetChars]Invariant volation");
1803
1804                     if ((ch & SupplimentarySeq) != 0)
1805                     {
1806                         // Its a 4-byte supplimentary sequence
1807                         if ((ch & (FinalByte >> 6)) != 0)
1808                         {
1809                             // this is 3rd byte of 4 byte sequence - nothing to do
1810                             continue;
1811                         }
1812
1813                         // 2nd byte of 4 bytes
1814                         // check for non-shortest form of surrogate and the valid surrogate
1815                         // range 0x000000 - 0x10FFFF at the same time
1816                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1817                         {
1818                             goto InvalidByteSequence;
1819                         }
1820                     }
1821                     else
1822                     {
1823                         // Must be 2nd byte of a 3-byte sequence
1824                         // check for non-shortest form of 3 byte seq
1825                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1826                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1827                         {
1828                             goto InvalidByteSequence;
1829                         }
1830                     }
1831                     continue;
1832                 }
1833
1834                 // ready to punch
1835
1836                 // surrogate in shortest form?
1837                 // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1838                 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
1839                 {
1840                     // let the range check for the second char throw the exception
1841                     if (pTarget < pAllocatedBufferEnd)
1842                     {
1843                         *pTarget = (char)(((ch >> 10) & 0x7FF) +
1844                             unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
1845                         pTarget++;
1846
1847                         ch = (ch & 0x3FF) +
1848                             unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1849                     }
1850                 }
1851
1852                 goto EncodeChar;
1853
1854             InvalidByteSequence:
1855                 // this code fragment should be close to the gotos referencing it
1856                 // Have to do fallback for invalid bytes
1857                 if (fallback == null)
1858                 {
1859                     if (baseDecoder == null)
1860                         fallback = this.decoderFallback.CreateFallbackBuffer();
1861                     else
1862                         fallback = baseDecoder.FallbackBuffer;
1863                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1864                 }
1865                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
1866                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
1867                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
1868                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
1869                 pSrc = pSrcForFallback;
1870                 pTarget = pTargetForFallback;
1871
1872                 if (!fallbackResult)
1873                 {
1874                     // Ran out of buffer space
1875                     // Need to throw an exception?
1876                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1877                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1878                     fallback.InternalReset();
1879                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1880                     ch = 0;
1881                     break;
1882                 }
1883                 Debug.Assert(pSrc >= bytes,
1884                     "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1885                 ch = 0;
1886                 continue;
1887
1888             ReadChar:
1889                 ch = *pSrc;
1890                 pSrc++;
1891
1892             ProcessChar:
1893                 if (ch > 0x7F)
1894                 {
1895                     // If its > 0x7F, its start of a new multi-byte sequence
1896
1897                     // bit 6 has to be non-zero
1898                     if ((ch & 0x40) == 0)
1899                     {
1900                         goto InvalidByteSequence;
1901                     }
1902
1903                     // start a new long code
1904                     if ((ch & 0x20) != 0)
1905                     {
1906                         if ((ch & 0x10) != 0)
1907                         {
1908                             // 4 byte encoding - supplimentary character (2 surrogates)
1909
1910                             ch &= 0x0F;
1911
1912                             // check that bit 4 is zero and the valid supplimentary character
1913                             // range 0x000000 - 0x10FFFF at the same time
1914                             if (ch > 0x04)
1915                             {
1916                                 ch |= 0xf0;
1917                                 goto InvalidByteSequence;
1918                             }
1919
1920                             ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1921                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1922                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1923                         }
1924                         else
1925                         {
1926                             // 3 byte encoding
1927                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1928                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1929                         }
1930                     }
1931                     else
1932                     {
1933                         // 2 byte encoding
1934
1935                         ch &= 0x1F;
1936
1937                         // check for non-shortest form
1938                         if (ch <= 1)
1939                         {
1940                             ch |= 0xc0;
1941                             goto InvalidByteSequence;
1942                         }
1943
1944                         ch |= (FinalByte >> 6);
1945                     }
1946                     continue;
1947                 }
1948
1949             EncodeChar:
1950                 // write the pending character
1951                 if (pTarget >= pAllocatedBufferEnd)
1952                 {
1953                     // Fix chars so we make sure to throw if we didn't output anything
1954                     ch &= 0x1fffff;
1955                     if (ch > 0x7f)
1956                     {
1957                         if (ch > 0x7ff)
1958                         {
1959                             if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1960                                 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1961                             {
1962                                 pSrc--;     // It was 4 bytes
1963                                 pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
1964                             }
1965                             else if (ch > 0xffff)
1966                             {
1967                                 pSrc--;     // It was 4 bytes, nothing was stored
1968                             }
1969                             pSrc--;         // It was at least 3 bytes
1970                         }
1971                         pSrc--;             // It was at least 2 bytes
1972                     }
1973                     pSrc--;
1974
1975                     // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1976                     // a 4 byte sequence alredy)
1977                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1978                         "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1979                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1980
1981                     // Don't store ch in decoder, we already backed up to its start
1982                     ch = 0;
1983
1984                     // Didn't throw, just use this buffer size.
1985                     break;
1986                 }
1987                 *pTarget = (char)ch;
1988                 pTarget++;
1989
1990 #if FASTLOOP
1991                 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1992                 int availableBytes = PtrDiff(pEnd, pSrc);
1993
1994                 // don't fall into the fast decoding loop if we don't have enough bytes
1995                 // Test for availableChars is done because pStop would be <= pTarget.
1996                 if (availableBytes <= 13)
1997                 {
1998                     // we may need as many as 1 character per byte
1999                     if (availableChars < availableBytes)
2000                     {
2001                         // not enough output room.  no pending bits at this point
2002                         ch = 0;
2003                         continue;
2004                     }
2005
2006                     // try to get over the remainder of the ascii characters fast though
2007                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2008                     while (pSrc < pLocalEnd)
2009                     {
2010                         ch = *pSrc;
2011                         pSrc++;
2012
2013                         if (ch > 0x7F)
2014                             goto ProcessChar;
2015
2016                         *pTarget = (char)ch;
2017                         pTarget++;
2018                     }
2019                     // we are done
2020                     ch = 0;
2021                     break;
2022                 }
2023
2024                 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
2025                 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
2026                 if (availableChars < availableBytes)
2027                 {
2028                     availableBytes = availableChars;
2029                 }
2030
2031                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2032                 //  the boundary will be decreased for every non-ASCII character we encounter
2033                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
2034                 char* pStop = pTarget + availableBytes - 7;
2035
2036                 while (pTarget < pStop)
2037                 {
2038                     ch = *pSrc;
2039                     pSrc++;
2040
2041                     if (ch > 0x7F)
2042                     {
2043                         goto LongCode;
2044                     }
2045                     *pTarget = (char)ch;
2046                     pTarget++;
2047
2048                     // get pSrc to be 2-byte aligned
2049                     if ((unchecked((int)pSrc) & 0x1) != 0)
2050                     {
2051                         ch = *pSrc;
2052                         pSrc++;
2053                         if (ch > 0x7F)
2054                         {
2055                             goto LongCode;
2056                         }
2057                         *pTarget = (char)ch;
2058                         pTarget++;
2059                     }
2060
2061                     // get pSrc to be 4-byte aligned
2062                     if ((unchecked((int)pSrc) & 0x2) != 0)
2063                     {
2064                         ch = *(ushort*)pSrc;
2065                         if ((ch & 0x8080) != 0)
2066                         {
2067                             goto LongCodeWithMask16;
2068                         }
2069
2070                         // Unfortunately, this is endianess sensitive
2071 #if BIGENDIAN
2072                         *pTarget = (char)((ch >> 8) & 0x7F);
2073                         pSrc += 2;
2074                         *(pTarget+1) = (char)(ch & 0x7F);
2075                         pTarget += 2;
2076 #else // BIGENDIAN
2077                         *pTarget = (char)(ch & 0x7F);
2078                         pSrc += 2;
2079                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2080                         pTarget += 2;
2081 #endif // BIGENDIAN
2082                     }
2083
2084                     // Run 8 characters at a time!
2085                     while (pTarget < pStop)
2086                     {
2087                         ch = *(int*)pSrc;
2088                         int chb = *(int*)(pSrc + 4);
2089                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
2090                         {
2091                             goto LongCodeWithMask32;
2092                         }
2093
2094                         // Unfortunately, this is endianess sensitive
2095 #if BIGENDIAN
2096                         *pTarget = (char)((ch >> 24) & 0x7F);
2097                         *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2098                         *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2099                         *(pTarget+3) = (char)(ch & 0x7F);
2100                         pSrc += 8;
2101                         *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2102                         *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2103                         *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2104                         *(pTarget+7) = (char)(chb & 0x7F);
2105                         pTarget += 8;
2106 #else // BIGENDIAN
2107                         *pTarget = (char)(ch & 0x7F);
2108                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2109                         *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
2110                         *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
2111                         pSrc += 8;
2112                         *(pTarget + 4) = (char)(chb & 0x7F);
2113                         *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
2114                         *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
2115                         *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
2116                         pTarget += 8;
2117 #endif // BIGENDIAN
2118                     }
2119                     break;
2120
2121 #if BIGENDIAN
2122                 LongCodeWithMask32:
2123                     // be careful about the sign extension
2124                     ch = (int)(((uint)ch) >> 16);
2125                 LongCodeWithMask16:
2126                     ch = (int)(((uint)ch) >> 8);
2127 #else // BIGENDIAN
2128                 LongCodeWithMask32:
2129                 LongCodeWithMask16:
2130                     ch &= 0xFF;
2131 #endif // BIGENDIAN
2132                     pSrc++;
2133                     if (ch <= 0x7F)
2134                     {
2135                         *pTarget = (char)ch;
2136                         pTarget++;
2137                         continue;
2138                     }
2139
2140                 LongCode:
2141                     int chc = *pSrc;
2142                     pSrc++;
2143
2144                     if (
2145                         // bit 6 has to be zero
2146                         (ch & 0x40) == 0 ||
2147                         // we are expecting to see trailing bytes like 10vvvvvv
2148                         (chc & unchecked((sbyte)0xC0)) != 0x80)
2149                     {
2150                         goto BadLongCode;
2151                     }
2152
2153                     chc &= 0x3F;
2154
2155                     // start a new long code
2156                     if ((ch & 0x20) != 0)
2157                     {
2158                         // fold the first two bytes together
2159                         chc |= (ch & 0x0F) << 6;
2160
2161                         if ((ch & 0x10) != 0)
2162                         {
2163                             // 4 byte encoding - surrogate
2164                             ch = *pSrc;
2165                             if (
2166                                 // check that bit 4 is zero, the non-shortest form of surrogate
2167                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2168                                 !InRange(chc >> 4, 0x01, 0x10) ||
2169                                 // we are expecting to see trailing bytes like 10vvvvvv
2170                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2171                             {
2172                                 goto BadLongCode;
2173                             }
2174
2175                             chc = (chc << 6) | (ch & 0x3F);
2176
2177                             ch = *(pSrc + 1);
2178                             // we are expecting to see trailing bytes like 10vvvvvv
2179                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
2180                             {
2181                                 goto BadLongCode;
2182                             }
2183                             pSrc += 2;
2184
2185                             ch = (chc << 6) | (ch & 0x3F);
2186
2187                             *pTarget = (char)(((ch >> 10) & 0x7FF) +
2188                                 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
2189                             pTarget++;
2190
2191                             ch = (ch & 0x3FF) +
2192                                 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2193
2194                             // extra byte, we're already planning 2 chars for 2 of these bytes,
2195                             // but the big loop is testing the target against pStop, so we need
2196                             // to subtract 2 more or we risk overrunning the input.  Subtract 
2197                             // one here and one below.
2198                             pStop--;
2199                         }
2200                         else
2201                         {
2202                             // 3 byte encoding
2203                             ch = *pSrc;
2204                             if (
2205                                 // check for non-shortest form of 3 byte seq
2206                                 (chc & (0x1F << 5)) == 0 ||
2207                                 // Can't have surrogates here.
2208                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2209                                 // we are expecting to see trailing bytes like 10vvvvvv
2210                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2211                             {
2212                                 goto BadLongCode;
2213                             }
2214                             pSrc++;
2215
2216                             ch = (chc << 6) | (ch & 0x3F);
2217
2218                             // extra byte, we're only expecting 1 char for each of these 3 bytes,
2219                             // but the loop is testing the target (not source) against pStop, so
2220                             // we need to subtract 2 more or we risk overrunning the input.
2221                             // Subtract 1 here and one more below
2222                             pStop--;
2223                         }
2224                     }
2225                     else
2226                     {
2227                         // 2 byte encoding
2228
2229                         ch &= 0x1F;
2230
2231                         // check for non-shortest form
2232                         if (ch <= 1)
2233                         {
2234                             goto BadLongCode;
2235                         }
2236                         ch = (ch << 6) | chc;
2237                     }
2238
2239                     *pTarget = (char)ch;
2240                     pTarget++;
2241
2242                     // extra byte, we're only expecting 1 char for each of these 2 bytes,
2243                     // but the loop is testing the target (not source) against pStop.
2244                     // subtract an extra count from pStop so that we don't overrun the input.
2245                     pStop--;
2246                 }
2247 #endif // FASTLOOP
2248
2249                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2250
2251                 // no pending bits at this point
2252                 ch = 0;
2253                 continue;
2254
2255             BadLongCode:
2256                 pSrc -= 2;
2257                 ch = 0;
2258                 continue;
2259             }
2260
2261             if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2262             {
2263                 // Have to do fallback for invalid bytes
2264                 if (fallback == null)
2265                 {
2266                     if (baseDecoder == null)
2267                         fallback = this.decoderFallback.CreateFallbackBuffer();
2268                     else
2269                         fallback = baseDecoder.FallbackBuffer;
2270                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2271                 }
2272
2273                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2274                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
2275                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
2276                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
2277                 pSrc = pSrcForFallback;
2278                 pTarget = pTargetForFallback;
2279
2280                 if (!fallbackResult)
2281                 {
2282                     Debug.Assert(pSrc >= bytes || pTarget == chars,
2283                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2284
2285                     // Ran out of buffer space
2286                     // Need to throw an exception?
2287                     fallback.InternalReset();
2288                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
2289                 }
2290                 Debug.Assert(pSrc >= bytes,
2291                     "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2292                 ch = 0;
2293             }
2294
2295             if (baseDecoder != null)
2296             {
2297                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2298
2299                 // If we're storing flush data we expect all bits to be used or else
2300                 // we're stuck in the middle of a conversion
2301                 Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
2302                     "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2303
2304                 // Remember our leftover bits.
2305                 decoder.bits = ch;
2306
2307                 baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
2308             }
2309
2310             // Shouldn't have anything in fallback buffer for GetChars
2311             // (don't have to check m_throwOnOverflow for chars)
2312             Debug.Assert(fallback == null || fallback.Remaining == 0,
2313                 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2314
2315             return PtrDiff(pTarget, chars);
2316         }
2317
        // During GetChars we had an invalid byte sequence.
        // pSrc is backed up to the start of the bad sequence if we didn't have room to
        // fall it back.  Otherwise pSrc remains where it is.
2321         private unsafe bool FallbackInvalidByteSequence(
2322             ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2323         {
2324             // Get our byte[]
2325             byte* pStart = pSrc;
2326             byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2327
2328             // Do the actual fallback
2329             if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2330             {
2331                 // Oops, it failed, back up to pStart
2332                 pSrc = pStart;
2333                 return false;
2334             }
2335
2336             // It worked
2337             return true;
2338         }
2339
2340         // During GetCharCount we had an invalid byte sequence
2341         // pSrc is used to find the index that points to the invalid bytes,
2342         // however the byte[] contains the fallback bytes (in case the index is -1)
2343         private unsafe int FallbackInvalidByteSequence(
2344             byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2345         {
2346             // Get our byte[]
2347             byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2348
2349             // Do the actual fallback
2350             int count = fallback.InternalFallback(bytesUnknown, pSrc);
2351
2352             // # of fallback chars expected.
2353             // Note that we only get here for "long" sequences, and have already unreserved
2354             // the count that we prereserved for the input bytes
2355             return count;
2356         }
2357
2358         // Note that some of these bytes may have come from a previous fallback, so we cannot
2359         // just decrement the pointer and use the values we read.  In those cases we have 
2360         // to regenerate the original values.
2361         private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2362         {
2363             // Get our byte[]
2364             byte[] bytesUnknown = null;
2365
2366             // See if it was a plain char
2367             // (have to check >= 0 because we have all sorts of wierd bit flags)
2368             if (ch < 0x100 && ch >= 0)
2369             {
2370                 pSrc--;
2371                 bytesUnknown = new byte[] { unchecked((byte)ch) };
2372             }
2373             // See if its an unfinished 2 byte sequence
2374             else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
2375             {
2376                 pSrc--;
2377                 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2378             }
2379             // So now we're either 2nd byte of 3 or 4 byte sequence or
2380             // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2381             // 1st check if its a 4 byte sequence
2382             else if ((ch & SupplimentarySeq) != 0)
2383             {
2384                 //  3rd byte of 4 byte sequence?
2385                 if ((ch & (FinalByte >> 6)) != 0)
2386                 {
2387                     // 3rd byte of 4 byte sequence
2388                     pSrc -= 3;
2389                     bytesUnknown = new byte[] {
2390                         unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2391                         unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2392                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2393                 }
2394                 else if ((ch & (FinalByte >> 12)) != 0)
2395                 {
2396                     // 2nd byte of a 4 byte sequence
2397                     pSrc -= 2;
2398                     bytesUnknown = new byte[] {
2399                         unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2400                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2401                 }
2402                 else
2403                 {
2404                     // 4th byte of a 4 byte sequence
2405                     pSrc--;
2406                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
2407                 }
2408             }
2409             else
2410             {
2411                 // 2nd byte of 3 byte sequence?
2412                 if ((ch & (FinalByte >> 6)) != 0)
2413                 {
2414                     // So its 2nd byte of a 3 byte sequence
2415                     pSrc -= 2;
2416                     bytesUnknown = new byte[] {
2417                         unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2418                 }
2419                 else
2420                 {
2421                     // 1st byte of a 3 byte sequence
2422                     pSrc--;
2423                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
2424                 }
2425             }
2426
2427             return bytesUnknown;
2428         }
2429
2430
2431         public override Decoder GetDecoder()
2432         {
2433             return new UTF8Decoder(this);
2434         }
2435
2436
2437         public override Encoder GetEncoder()
2438         {
2439             return new UTF8Encoder(this);
2440         }
2441
2442
2443         public override int GetMaxByteCount(int charCount)
2444         {
2445             if (charCount < 0)
2446                 throw new ArgumentOutOfRangeException(nameof(charCount),
2447                      SR.ArgumentOutOfRange_NeedNonNegNum);
2448             Contract.EndContractBlock();
2449
2450             // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
2451             long byteCount = (long)charCount + 1;
2452
2453             if (EncoderFallback.MaxCharCount > 1)
2454                 byteCount *= EncoderFallback.MaxCharCount;
2455
2456             // Max 3 bytes per char.  (4 bytes per 2 chars for surrogates)
2457             byteCount *= 3;
2458
2459             if (byteCount > 0x7fffffff)
2460                 throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
2461
2462             return (int)byteCount;
2463         }
2464
2465
2466         public override int GetMaxCharCount(int byteCount)
2467         {
2468             if (byteCount < 0)
2469                 throw new ArgumentOutOfRangeException(nameof(byteCount),
2470                      SR.ArgumentOutOfRange_NeedNonNegNum);
2471             Contract.EndContractBlock();
2472
2473             // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
2474             long charCount = ((long)byteCount + 1);
2475
2476             // Non-shortest form would fall back, so get max count from fallback.
2477             // So would 11... followed by 11..., so you could fall back every byte
2478             if (DecoderFallback.MaxCharCount > 1)
2479             {
2480                 charCount *= DecoderFallback.MaxCharCount;
2481             }
2482
2483             if (charCount > 0x7fffffff)
2484                 throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
2485
2486             return (int)charCount;
2487         }
2488
2489
2490         public override byte[] GetPreamble()
2491         {
2492             if (_emitUTF8Identifier)
2493             {
2494                 // Allocate new array to prevent users from modifying it.
2495                 return new byte[3] { 0xEF, 0xBB, 0xBF };
2496             }
2497             else
2498                 return Array.Empty<byte>();
2499         }
2500
2501
2502         public override bool Equals(Object value)
2503         {
2504             UTF8Encoding that = value as UTF8Encoding;
2505             if (that != null)
2506             {
2507                 return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
2508                        (EncoderFallback.Equals(that.EncoderFallback)) &&
2509                        (DecoderFallback.Equals(that.DecoderFallback));
2510             }
2511             return (false);
2512         }
2513
2514
2515         public override int GetHashCode()
2516         {
2517             //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
2518             return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
2519                    UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
2520         }
2521
2522         private sealed class UTF8Encoder : EncoderNLS, ISerializable
2523         {
2524             // We must save a high surrogate value until the next call, looking
2525             // for a low surrogate value.
2526             internal int surrogateChar;
2527
2528             public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2529             {
2530                 // base calls reset
2531             }
2532
2533             // ISerializable implementation
2534             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2535             {
2536                 throw new PlatformNotSupportedException();
2537             }
2538
2539             public override void Reset()
2540
2541             {
2542                 this.surrogateChar = 0;
2543                 if (m_fallbackBuffer != null)
2544                     m_fallbackBuffer.Reset();
2545             }
2546
2547             // Anything left in our encoder?
2548             internal override bool HasState
2549             {
2550                 get
2551                 {
2552                     return (this.surrogateChar != 0);
2553                 }
2554             }
2555         }
2556
2557         private sealed class UTF8Decoder : DecoderNLS, ISerializable
2558         {
2559             // We'll need to remember the previous information. See the comments around definition
2560             // of FinalByte for details.
2561             internal int bits;
2562
2563             public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2564             {
2565                 // base calls reset
2566             }
2567
2568             // Constructor called by serialization, have to handle deserializing from Everett
2569             internal UTF8Decoder(SerializationInfo info, StreamingContext context)
2570             {
2571                 throw new PlatformNotSupportedException();
2572             }
2573
2574             // ISerializable implementation, get data for this object
2575             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2576             {
2577                 throw new PlatformNotSupportedException();
2578             }
2579
2580             public override void Reset()
2581             {
2582                 this.bits = 0;
2583                 if (m_fallbackBuffer != null)
2584                     m_fallbackBuffer.Reset();
2585             }
2586
2587             // Anything left in our decoder?
2588             internal override bool HasState
2589             {
2590                 get
2591                 {
2592                     return (this.bits != 0);
2593                 }
2594             }
2595         }
2596     }
2597 }