src/mscorlib/shared/System/Text/UTF8Encoding.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 // The worker functions in this file were optimized for performance. If you make changes
   6 // you should use care to consider all of the interesting cases.
   7
   8 // The code of all worker functions in this file is written twice: once as a slow loop, and the
   9 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  10 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  11 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  12
  13 // This define can be used to turn off the fast loops. Useful for finding whether
  14 // the problem is fastloop-specific.
  15 #define FASTLOOP
  16
  17 using System;
  18 using System.Runtime.Serialization;
  19 using System.Diagnostics;
  20 using System.Diagnostics.Contracts;
  21 using System.Globalization;
  22
  23 namespace System.Text
  24 {
  25     // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
  26     // Unicode characters with variable numbers of bytes per character,
  27     // optimized for the lower 127 ASCII characters.  It's an efficient way
  28     // of encoding US English in an internationalizable way.
  29     //
  30     // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  31     //
  32     // The UTF-8 byte order mark is simply the Unicode byte order mark
  33     // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
  34     // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  35     // switch the byte orderings.
  36
  37     public class UTF8Encoding : Encoding
  38     {
  39         /*
  40             bytes   bits    UTF-8 representation
  41             -----   ----    -----------------------------------
  42             1        7      0vvvvvvv
  43             2       11      110vvvvv 10vvvvvv
  44             3       16      1110vvvv 10vvvvvv 10vvvvvv
  45             4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  46             -----   ----    -----------------------------------
  47
  48             Surrogate:
  49             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  50         */
  51
  52         private const int UTF8_CODEPAGE = 65001; // code page number passed to the base Encoding constructor
  53
  54         // Sealed subclass that allows for devirtualization (see https://github.com/dotnet/coreclr/pull/9230)
  55         internal sealed class UTF8EncodingSealed : UTF8Encoding
  56         {
  57             public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { } // only forwards to UTF8Encoding(bool)
  58         }
  59
  60         // Shared instance used by Encoding.UTF8 for lazy initialization; it is built with encoderShouldEmitUTF8Identifier: true.
  61         // The initialization code will not be run until a static member of the class is referenced
  62         internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
  63
  64         // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
  65         // the standard.
  66         private bool _emitUTF8Identifier = false; // set from the constructor's encoderShouldEmitUTF8Identifier argument
  67         // When true, SetDefaultFallbacks installs the exception fallbacks instead of the "\xFFFD" replacement fallbacks.
  68         private bool _isThrowException = false;
  69
  70
  71         // Default constructor: forwards to UTF8Encoding(bool) with encoderShouldEmitUTF8Identifier set to false.
  72         public UTF8Encoding()
  73             : this(encoderShouldEmitUTF8Identifier: false) { }
  74
  75
  76         // Forwards encoderShouldEmitUTF8Identifier to the two-argument constructor with throwOnInvalidBytes set to false.
  77         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
  78             : this(encoderShouldEmitUTF8Identifier, throwOnInvalidBytes: false)
  79         { }
  80
  81
  82         // Runs the base Encoding constructor with the UTF-8 code page, then records both options.
  83         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
  84             : base(UTF8_CODEPAGE)
  85         {
  86             this._isThrowException = throwOnInvalidBytes;
  87             this._emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  88             // Encoding's constructor already chose fallbacks, but that choice is wrong when exceptions are wanted; redo it.
  89             if (throwOnInvalidBytes)
  90                 SetDefaultFallbacks();
  91         }
  92
  93         // Picks the encoder and decoder fallbacks that match _isThrowException.
  94         internal override void SetDefaultFallbacks()
  95         {
  96             if (!_isThrowException)
  97             {
  98                 // Not throwing: both directions use a replacement fallback whose string is "\xFFFD".
  99                 this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
 100                 this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
 101                 return;
 102             }
 103             // Throwing: both directions use the exception fallback.
 104             this.encoderFallback = EncoderFallback.ExceptionFallback;
 105             this.decoderFallback = DecoderFallback.ExceptionFallback;
 106         }
 107
 108
 109         // WARNING: GetByteCount(string chars)
 110         // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
 111         // WARNING: otherwise it'll break VB's way of declaring these.
 112         //
 113         // The following methods are copied from EncodingNLS.cs.
 114         // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
 115         // These should be kept in sync for the following classes:
 116         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 117
 118         // Returns the number of bytes required to encode a range of characters in
 119         // a character array.
 120         //
 121         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 122         // So if you fix this, fix the others.  Currently those include:
 123         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 124         // parent method is safe
 125
 126         // Returns the number of bytes needed to encode count characters of chars, starting at index.
 127         public override unsafe int GetByteCount(char[] chars, int index, int count)
 128         {
 129             // Checks run in a fixed order: null array, negative index, negative count, range past the end.
 130             if (chars == null)
 131                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 132             if (index < 0)
 133                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
 134             if (count < 0)
 135                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 136             if (count > chars.Length - index)
 137                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
 138             Contract.EndContractBlock();
 139
 140             // No characters means no bytes; returning here also avoids pinning an empty array.
 141             if (count == 0)
 142                 return 0;
 143             // Pin the array and hand the selected range to the pointer-based overload, with no encoder.
 144             fixed (char* pinned = chars)
 145                 return GetByteCount(pinned + index, count, null);
 146         }
 147
 148         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 149         // So if you fix this, fix the others.  Currently those include:
 150         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 151         // parent method is safe
 152
 153         // Returns the number of bytes needed to encode every character of chars; throws ArgumentNullException when chars is null.
 154         public override unsafe int GetByteCount(String chars)
 155         {
 156             // The exception must name this method's actual parameter, "chars" (there is no parameter called "s").
 157             if (chars == null)
 158                 throw new ArgumentNullException("chars");
 159             Contract.EndContractBlock();
 160             fixed (char* pChars = chars) // pin the string for the pointer-based overload, called with no encoder
 161                 return GetByteCount(pChars, chars.Length, null);
 162         }
 163
 164         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 165         // So if you fix this, fix the others.  Currently those include:
 166         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 167
 168         // Returns the number of bytes needed to encode count characters read through the chars pointer.
 169         [CLSCompliant(false)]
 170         public override unsafe int GetByteCount(char* chars, int count)
 171         {
 172             // The pointer is checked before the count, so a null pointer is reported when both are bad.
 173             if (chars == null)
 174                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 175             if (count < 0)
 176                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 177             Contract.EndContractBlock();
 178             // A null encoder is passed to the three-argument overload.
 179             int needed = GetByteCount(chars, count, null);
 180             return needed;
 181         }
 182
 183         // Parent method is safe.
 184         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 185         // So if you fix this, fix the others.  Currently those include:
 186         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 187
 188         // Encodes charCount characters of s, starting at charIndex, into bytes beginning at byteIndex.
 189         public override unsafe int GetBytes(String s, int charIndex, int charCount, byte[] bytes, int byteIndex)
 190         {
 191             // Null checks come first; s is reported before bytes, and charIndex before charCount.
 192             if (s == null)
 193                 throw new ArgumentNullException("s", SR.ArgumentNull_Array);
 194             if (bytes == null)
 195                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 196             if (charIndex < 0)
 197                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
 198             if (charCount < 0)
 199                 throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
 200             if (charCount > s.Length - charIndex)
 201                 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
 202             if (byteIndex < 0 || byteIndex > bytes.Length)
 203                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
 204             Contract.EndContractBlock();
 205             // Output room is measured now, because bytes may be swapped for a dummy array below.
 206             int room = bytes.Length - byteIndex;
 207             // &bytes[0] needs at least one element, so an empty array is replaced by a one-byte dummy.
 208             if (bytes.Length == 0)
 209                 bytes = new byte[1];
 210             fixed (char* src = s) fixed (byte* dst = &bytes[0])
 211                 return GetBytes(src + charIndex, charCount, dst + byteIndex, room, null);
 212         }
 213
 214         // Encodes a range of characters in a character array into a range of bytes
 215         // in a byte array. An exception occurs if the byte array is not large
 216         // enough to hold the complete encoding of the characters. The
 217         // GetByteCount method can be used to determine the exact number of
 218         // bytes that will be produced for a given range of characters.
 219         // Alternatively, the GetMaxByteCount method can be used to
 220         // determine the maximum number of bytes that will be produced for a given
 221         // number of characters, regardless of the actual character values.
 222         //
 223         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 224         // So if you fix this, fix the others.  Currently those include:
 225         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 226         // parent method is safe
 227
 228         // Encodes charCount characters of chars, starting at charIndex, into bytes beginning at byteIndex.
 229         // Returns 0 when charCount is 0; otherwise returns the result of the pointer-based overload.
 230         public override unsafe int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
 231         {
 232             // Null checks come first; chars is reported before bytes, and charIndex before charCount.
 233             if (chars == null)
 234                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 235             if (bytes == null)
 236                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 237             if (charIndex < 0)
 238                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
 239             if (charCount < 0)
 240                 throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
 241             if (charCount > chars.Length - charIndex)
 242                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
 243             if (byteIndex < 0 || byteIndex > bytes.Length)
 244                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
 245             Contract.EndContractBlock();
 246
 247             // Nothing to encode; returning here also avoids pinning an empty chars array.
 248             if (charCount == 0)
 249                 return 0;
 250
 251             // Output room is measured now, because bytes may be swapped for a dummy array below.
 252             int room = bytes.Length - byteIndex;
 253             // &bytes[0] needs at least one element, so an empty array is replaced by a one-byte dummy.
 254             if (bytes.Length == 0)
 255                 bytes = new byte[1];
 256             // room is the space available from byteIndex onward, not the size of the whole array.
 257             fixed (char* src = chars) fixed (byte* dst = &bytes[0])
 258                 return GetBytes(src + charIndex, charCount, dst + byteIndex, room, null);
 259         }
 260
 261         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 262         // So if you fix this, fix the others.  Currently those include:
 263         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 264
 265         // Validates the raw pointers and counts, then calls the five-argument GetBytes overload with a null encoder.
 266         [CLSCompliant(false)]
 267         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
 268         {
 269             if (bytes == null) // bytes is reported before chars
 270                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 271             if (chars == null)
 272                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 273             if (charCount < 0 || byteCount < 0)
 274                 throw new ArgumentOutOfRangeException(charCount < 0 ? "charCount" : "byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
 275             Contract.EndContractBlock();
 276             return GetBytes(chars, charCount, bytes, byteCount, null);
 277         }
 278
 279         // Returns the number of characters produced by decoding a range of bytes
 280         // in a byte array.
 281         //
 282         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 283         // So if you fix this, fix the others.  Currently those include:
 284         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 285         // parent method is safe
 286
 287         // Returns the number of characters produced by decoding count bytes of bytes, starting at index.
 288         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
 289         {
 290             // Checks run in a fixed order: null array, negative index, negative count, range past the end.
 291             if (bytes == null)
 292                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 293             if (index < 0)
 294                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
 295             if (count < 0)
 296                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 297             if (count > bytes.Length - index)
 298                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 299             Contract.EndContractBlock();
 300
 301             // No bytes means no characters; returning here also avoids pinning an empty array.
 302             if (count == 0)
 303                 return 0;
 304             // Pin the array and hand the selected range to the pointer-based overload, with no decoder.
 305             fixed (byte* pinned = bytes)
 306                 return GetCharCount(pinned + index, count, null);
 307         }
 308
 309         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 310         // So if you fix this, fix the others.  Currently those include:
 311         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 312
 313         // Returns the number of characters produced by decoding count bytes read through the bytes pointer.
 314         [CLSCompliant(false)]
 315         public override unsafe int GetCharCount(byte* bytes, int count)
 316         {
 317             // The pointer is checked before the count, so a null pointer is reported when both are bad.
 318             if (bytes == null)
 319                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 320             if (count < 0)
 321                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 322             Contract.EndContractBlock();
 323             int decoded = GetCharCount(bytes, count, null);
 324             return decoded;
 325         }
 326
 327         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 328         // So if you fix this, fix the others.  Currently those include:
 329         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 330         // parent method is safe
 331
 332         // Decodes byteCount bytes of bytes, starting at byteIndex, into chars beginning at charIndex.
 333         // Returns 0 when byteCount is 0; otherwise returns the result of the pointer-based overload.
 334         public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
 335         {
 336             // Null checks come first; bytes is reported before chars, and byteIndex before byteCount.
 337             if (bytes == null)
 338                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 339             if (chars == null)
 340                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 341             if (byteIndex < 0)
 342                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
 343             if (byteCount < 0)
 344                 throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
 345             if (byteCount > bytes.Length - byteIndex)
 346                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 347             if (charIndex < 0 || charIndex > chars.Length)
 348                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
 349             Contract.EndContractBlock();
 350
 351             // Nothing to decode; returning here also avoids pinning an empty bytes array.
 352             if (byteCount == 0)
 353                 return 0;
 354
 355             // Output room is measured now, because chars may be swapped for a dummy array below.
 356             int room = chars.Length - charIndex;
 357             // &chars[0] needs at least one element, so an empty array is replaced by a one-char dummy.
 358             if (chars.Length == 0)
 359                 chars = new char[1];
 360             // room is the space available from charIndex onward, not the size of the whole array.
 361             fixed (byte* src = bytes) fixed (char* dst = &chars[0])
 362                 return GetChars(src + byteIndex, byteCount, dst + charIndex, room, null);
 363         }
 364
 365         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 366         // So if you fix this, fix the others.  Currently those include:
 367         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 368
 369         // Validates the raw pointers and counts, then calls the five-argument GetChars overload with a null decoder.
 370         [CLSCompliant(false)]
 371         public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
 372         {
 373             if (bytes == null) // bytes is reported before chars
 374                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 375             if (chars == null)
 376                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 377             if (charCount < 0 || byteCount < 0)
 378                 throw new ArgumentOutOfRangeException(charCount < 0 ? "charCount" : "byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
 379             Contract.EndContractBlock();
 380             return GetChars(bytes, byteCount, chars, charCount, null);
 381         }
 382
 383         // Returns a string containing the decoded representation of a range of
 384         // bytes in a byte array.
 385         //
 386         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 387         // So if you fix this, fix the others.  Currently those include:
 388         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 389         // parent method is safe
 390
 391         // Decodes count bytes of bytes, starting at index, into a string; an empty range yields String.Empty.
 392         public override unsafe String GetString(byte[] bytes, int index, int count)
 393         {
 394             // Checks run in a fixed order: null array, negative index, negative count, range past the end.
 395             if (bytes == null)
 396                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 397             if (index < 0)
 398                 throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
 399             if (count < 0)
 400                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 401             if (count > bytes.Length - index)
 402                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 403             Contract.EndContractBlock();
 404
 405             // Short-circuit the empty case so an empty input buffer is never pinned.
 406             if (count == 0)
 407                 return String.Empty;
 408             fixed (byte* pinned = bytes)
 409                 return String.CreateStringFromEncoding(pinned + index, count, this);
 410         }
 411
 412         //
 413         // End of standard methods copied from EncodingNLS.cs
 414         //
 415
 416         // To simplify maintenance, the structure of GetByteCount and GetBytes should be
 417         // kept the same as much as possible
 418         internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
 419         {
 420             // For fallback we may need a fallback buffer.
 421             // We wait to initialize it though in case we don't have any broken input unicode
 422             EncoderFallbackBuffer fallbackBuffer = null;
 423             char* pSrcForFallback;
 424
 425             char* pSrc = chars;
 426             char* pEnd = pSrc + count;
 427
 428             // Start by assuming we have as many as count
 429             int byteCount = count;
 430
 431             int ch = 0;
 432
 433             if (baseEncoder != null)
 434             {
 435                 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
 436                 ch = encoder.surrogateChar;
 437
 438                 // We mustn't have left over fallback data when counting
 439                 if (encoder.InternalHasFallbackBuffer)
 440                 {
 441                     fallbackBuffer = encoder.FallbackBuffer;
 442                     if (fallbackBuffer.Remaining > 0)
 443                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
 444
 445                     // Set our internal fallback interesting things.
 446                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
 447                 }
 448             }
 449
 450             for (;;)
 451             {
 452                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 453                 if (pSrc >= pEnd)
 454                 {
 455                     if (ch == 0)
 456                     {
 457                         // Unroll any fallback that happens at the end
 458                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 459                         if (ch > 0)
 460                         {
 461                             byteCount++;
 462                             goto ProcessChar;
 463                         }
 464                     }
 465                     else
 466                     {
 467                         // Case of surrogates in the fallback.
 468                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
 469                         {
 470                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 471                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 472
 473                             ch = fallbackBuffer.InternalGetNextChar();
 474                             byteCount++;
 475
 476                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 477                             {
 478                                 ch = 0xfffd;
 479                                 byteCount++;
 480                                 goto EncodeChar;
 481                             }
 482                             else if (ch > 0)
 483                             {
 484                                 goto ProcessChar;
 485                             }
 486                             else
 487                             {
 488                                 byteCount--; // ignore last one.
 489                                 break;
 490                             }
 491                         }
 492                     }
 493
 494                     if (ch <= 0)
 495                     {
 496                         break;
 497                     }
 498                     if (baseEncoder != null && !baseEncoder.MustFlush)
 499                     {
 500                         break;
 501                     }
 502
 503                     // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
 504                     byteCount++;
 505                     goto EncodeChar;
 506                 }
 507
 508                 if (ch > 0)
 509                 {
 510                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 511                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 512
 513                     // use separate helper variables for local contexts so that the jit optimizations
 514                     // won't get confused about the variable lifetimes
 515                     int cha = *pSrc;
 516
 517                     // count the pending surrogate
 518                     byteCount++;
 519
 520                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
 521                     // if (IsLowSurrogate(cha)) {
 522                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 523                     {
 524                         // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
 525                         ch = 0xfffd;
 526                         //                        ch = cha + (ch << 10) +
 527                         //                            (0x10000
 528                         //                            - CharUnicodeInfo.LOW_SURROGATE_START
 529                         //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
 530
 531                         // Use this next char
 532                         pSrc++;
 533                     }
 534                     // else ch is still high surrogate and encoding will fail (so don't add count)
 535
 536                     // attempt to encode the surrogate or partial surrogate
 537                     goto EncodeChar;
 538                 }
 539
 540                 // If we've used a fallback, then we have to check for it
 541                 if (fallbackBuffer != null)
 542                 {
 543                     ch = fallbackBuffer.InternalGetNextChar();
 544                     if (ch > 0)
 545                     {
 546                         // We have an extra byte we weren't expecting.
 547                         byteCount++;
 548                         goto ProcessChar;
 549                     }
 550                 }
 551
 552                 // read next char. The JIT optimization seems to be getting confused when
 553                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 554                 ch = *pSrc;
 555                 pSrc++;
 556
 557             ProcessChar:
 558                 // if (IsHighSurrogate(ch)) {
 559                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
 560                 {
 561                     // we will count this surrogate next time around
 562                     byteCount--;
 563                     continue;
 564                 }
 565             // either good char or partial surrogate
 566
 567             EncodeChar:
 568                 // throw exception on partial surrogate if necessary
 569                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 570                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 571                 {
 572                     // Lone surrogates aren't allowed
 573                     // Have to make a fallback buffer if we don't have one
 574                     if (fallbackBuffer == null)
 575                     {
 576                         // wait on fallbacks if we can
 577                         // For fallback we may need a fallback buffer
 578                         if (baseEncoder == null)
 579                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 580                         else
 581                             fallbackBuffer = baseEncoder.FallbackBuffer;
 582
 583                         // Set our internal fallback interesting things.
 584                         fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
 585                     }
 586
 587                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 588                     // so the ref pSrc isn't gonna do anything.
 589                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
 590                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
 591                     pSrc = pSrcForFallback;
 592
 593                     // Ignore it if we don't throw (we had preallocated this ch)
 594                     byteCount--;
 595                     ch = 0;
 596                     continue;
 597                 }
 598
 599                 // Count them
 600                 if (ch > 0x7F)
 601                 {
 602                     if (ch > 0x7FF)
 603                     {
 604                         // the extra surrogate byte was compensated by the second surrogate character
 605                         // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
 606                         byteCount++;
 607                     }
 608                     byteCount++;
 609                 }
 610
 611 #if BIT64
 612                 // check for overflow
 613                 if (byteCount < 0)
 614                 {
 615                     break;
 616                 }
 617 #endif
 618
 619 #if FASTLOOP
 620                 // If still have fallback don't do fast loop
 621                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
 622                 {
 623                     // We're reserving 1 byte for each char by default
 624                     byteCount++;
 625                     goto ProcessChar;
 626                 }
 627
 628                 int availableChars = PtrDiff(pEnd, pSrc);
 629
 630                 // don't fall into the fast decoding loop if we don't have enough characters
 631                 if (availableChars <= 13)
 632                 {
 633                     // try to get over the remainder of the ascii characters fast though
 634                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
 635                     while (pSrc < pLocalEnd)
 636                     {
 637                         ch = *pSrc;
 638                         pSrc++;
 639                         if (ch > 0x7F)
 640                             goto ProcessChar;
 641                     }
 642
 643                     // we are done
 644                     break;
 645                 }
 646
 647 #if BIT64
 648                 // make sure that we won't get a silent overflow inside the fast loop
 649                 // (Fall out to slow loop if we have this many characters)
 650                 availableChars &= 0x0FFFFFFF;
 651 #endif
 652
 653                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
 654                 //  the boundary will be decreased for every non-ASCII character we encounter
 655                 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
 656                 char* pStop = pSrc + availableChars - (3 + 4);
 657
 658                 while (pSrc < pStop)
 659                 {
 660                     ch = *pSrc;
 661                     pSrc++;
 662
 663                     if (ch > 0x7F)                                                  // Not ASCII
 664                     {
 665                         if (ch > 0x7FF)                                             // Not 2 Byte
 666                         {
 667                             if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
 668                                 goto LongCode;
 669                             byteCount++;
 670                         }
 671                         byteCount++;
 672                     }
 673
 674                     // get pSrc aligned
 675                     if ((unchecked((int)pSrc) & 0x2) != 0)
 676                     {
 677                         ch = *pSrc;
 678                         pSrc++;
 679                         if (ch > 0x7F)                                              // Not ASCII
 680                         {
 681                             if (ch > 0x7FF)                                         // Not 2 Byte
 682                             {
 683                                 if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
 684                                     goto LongCode;
 685                                 byteCount++;
 686                             }
 687                             byteCount++;
 688                         }
 689                     }
 690
 691                     // Run 2 * 4 characters at a time!
 692                     while (pSrc < pStop)
 693                     {
 694                         ch = *(int*)pSrc;
 695                         int chc = *(int*)(pSrc + 2);
 696                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 697                         {
 698                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 699                             {
 700                                 goto LongCodeWithMask;
 701                             }
 702
 703
 704                             if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
 705                                 byteCount++;
 706                             if ((ch & unchecked((int)0xFF80)) != 0)
 707                                 byteCount++;
 708                             if ((chc & unchecked((int)0xFF800000)) != 0)
 709                                 byteCount++;
 710                             if ((chc & unchecked((int)0xFF80)) != 0)
 711                                 byteCount++;
 712                         }
 713                         pSrc += 4;
 714
 715                         ch = *(int*)pSrc;
 716                         chc = *(int*)(pSrc + 2);
 717                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 718                         {
 719                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 720                             {
 721                                 goto LongCodeWithMask;
 722                             }
 723
 724                             if ((ch & unchecked((int)0xFF800000)) != 0)
 725                                 byteCount++;
 726                             if ((ch & unchecked((int)0xFF80)) != 0)
 727                                 byteCount++;
 728                             if ((chc & unchecked((int)0xFF800000)) != 0)
 729                                 byteCount++;
 730                             if ((chc & unchecked((int)0xFF80)) != 0)
 731                                 byteCount++;
 732                         }
 733                         pSrc += 4;
 734                     }
 735                     break;
 736
 737                 LongCodeWithMask:
 738 #if BIGENDIAN
 739                     // be careful about the sign extension
 740                     ch = (int)(((uint)ch) >> 16);
 741 #else // BIGENDIAN
 742                     ch = (char)ch;
 743 #endif // BIGENDIAN
 744                     pSrc++;
 745
 746                     if (ch <= 0x7F)
 747                     {
 748                         continue;
 749                     }
 750
 751                 LongCode:
 752                     // use separate helper variables for slow and fast loop so that the jit optimizations
 753                     // won't get confused about the variable lifetimes
 754                     if (ch > 0x7FF)
 755                     {
 756                         // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 757                         if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 758                         {
 759                             // 4 byte encoding - high surrogate + low surrogate
 760
 761                             int chd = *pSrc;
 762                             if (
 763                                 // !IsHighSurrogate(ch) // low without high -> bad
 764                                 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
 765                                 // !IsLowSurrogate(chd) // high not followed by low -> bad
 766                                 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 767                             {
 768                                 // Back up and drop out to slow loop to figure out error
 769                                 pSrc--;
 770                                 break;
 771                             }
 772                             pSrc++;
 773
 774                             // byteCount - this byte is compensated by the second surrogate character
 775                         }
 776                         byteCount++;
 777                     }
 778                     byteCount++;
 779
 780                     // byteCount - the last byte is already included
 781                 }
 782 #endif // FASTLOOP
 783
 784                 // no pending char at this point
 785                 ch = 0;
 786             }
 787
 788 #if BIT64
 789             // check for overflow
 790             if (byteCount < 0)
 791             {
 792                 throw new ArgumentException(
 793                         SR.Argument_ConversionOverflow);
 794             }
 795 #endif
 796
 797             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
 798                 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
 799
 800             return byteCount;
 801         }
 802
 803         // Distance a - b between two char pointers, in chars. Done with unsigned arithmetic, which
 804         // is good enough for us and tends to generate better code than the default signed form.
 805         private static unsafe int PtrDiff(char* a, char* b)
 806         {
 807             uint byteDistance = (uint)((byte*)a - (byte*)b);
 808             return (int)(byteDistance >> 1);
 809         }
 810
 811         // byte* counterpart of the char* overload, kept just for parity
 812         private static unsafe int PtrDiff(byte* a, byte* b)
 813         {
 814             return (int)(a - b);
 815         }
 816
 817         private static bool InRange(int ch, int start, int end)
 818         {
                 // One unsigned comparison covers both bounds: with the default unchecked arithmetic,
                 // ch < start makes (ch - start) wrap to a huge unsigned value that fails the test.
 819             return (uint)(end - start) >= (uint)(ch - start);
 820         }
 821
 822         // Our workhorse
 823         // Note:  We ignore mismatched surrogates, unless the exception flag is set in which case we throw
             //
             // Encodes the charCount UTF-16 chars starting at chars into UTF-8, writing no further than
             // bytes + byteCount, and returns the number of bytes written.
             //
             // baseEncoder may be null. When it is supplied it is cast to UTF8Encoder: a high surrogate saved
             // in encoder.surrogateChar by an earlier call is resumed, and on exit surrogateChar and
             // m_charsUsed (chars consumed from the input) are written back to it.
             //
             // A surrogate that cannot be paired is handed to the EncoderFallbackBuffer, and whatever chars
             // the fallback produces are then encoded in its place. When the next char does not fit in the
             // remaining output, the source/fallback position is backed up and ThrowBytesOverflow is called;
             // if that call returns, encoding stops with the bytes produced so far.
             //
             // Like the other workers in this file the body is written twice: a slow loop that does every
             // range check and handles every special case, and (under FASTLOOP) a fast loop that copies ASCII
             // four chars at a time, encodes multi-byte chars inline, and drops back to the slow loop for
             // unpaired surrogates and when it gets close to the end of either buffer.
 824         internal override unsafe int GetBytes(char* chars, int charCount,
 825                                                 byte* bytes, int byteCount, EncoderNLS baseEncoder)
 826         {
 827             Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
 828             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
 829             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
 830             Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
 831
 832             UTF8Encoder encoder = null;
 833
 834             // For fallback we may need a fallback buffer.
 835             // We wait to initialize it though in case we don't have any broken input unicode
 836             EncoderFallbackBuffer fallbackBuffer = null;
 837             char* pSrcForFallback;
 838
 839             char* pSrc = chars;
 840             byte* pTarget = bytes;
 841
 842             char* pEnd = pSrc + charCount;
 843             byte* pAllocatedBufferEnd = pTarget + byteCount;
 844
                 // ch is the char (or combined surrogate pair) currently being processed. At the top of the
                 // slow loop a nonzero ch means a high surrogate is pending and waiting for its low half.
 845             int ch = 0;
 846
 847             // assume that JIT will enregister pSrc, pTarget and ch
 848
 849             if (baseEncoder != null)
 850             {
 851                 encoder = (UTF8Encoder)baseEncoder;
 852                 ch = encoder.surrogateChar;
 853
 854                 // Leftover fallback data is an error when the encoder is set to throw on overflow
 855                 if (encoder.InternalHasFallbackBuffer)
 856                 {
 857                     // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
 858                     fallbackBuffer = encoder.FallbackBuffer;
 859                     if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
 860                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
 861
 862                     // Set our internal fallback interesting things.
 863                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
 864                 }
 865             }
 866
 867             for (;;)
 868             {
 869                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 870
 871                 if (pSrc >= pEnd)
 872                 {
                         // Input is exhausted; what remains is draining the fallback buffer and dealing
                         // with a pending high surrogate, if there is one.
 873                     if (ch == 0)
 874                     {
 875                         // Check if there's anything left to get out of the fallback buffer
 876                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 877                         if (ch > 0)
 878                         {
 879                             goto ProcessChar;
 880                         }
 881                     }
 882                     else
 883                     {
 884                         // Case of leftover surrogates in the fallback buffer
 885                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
 886                         {
 887                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 888                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 889
 890                             int cha = ch;
 891
 892                             ch = fallbackBuffer.InternalGetNextChar();
 893
 894                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 895                             {
                                     // Combine the pair (cha = high, ch = low):
                                     // 0x10000 + ((high - HIGH_SURROGATE_START) << 10) + (low - LOW_SURROGATE_START)
 896                                 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
 897                                 goto EncodeChar;
 898                             }
 899                             else if (ch > 0)
 900                             {
 901                                 goto ProcessChar;
 902                             }
 903                             else
 904                             {
 905                                 break;
 906                             }
 907                         }
 908                     }
 909
 910                     // attempt to encode the partial surrogate (will fail or ignore)
 911                     if (ch > 0 && (encoder == null || encoder.MustFlush))
 912                         goto EncodeChar;
 913
 914                     // We're done
 915                     break;
 916                 }
 917
 918                 if (ch > 0)
 919                 {
 920                     // We have a high surrogate left over from a previous loop.
 921                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 922                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 923
 924                     // use separate helper variables for local contexts so that the jit optimizations
 925                     // won't get confused about the variable lifetimes
 926                     int cha = *pSrc;
 927
 928                     // In the previous char, we encountered a high surrogate, so we are expecting a low surrogate here.
 929                     // if (IsLowSurrogate(cha)) {
 930                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 931                     {
                             // Combine the pair (ch = high, cha = low):
                             // 0x10000 + ((high - HIGH_SURROGATE_START) << 10) + (low - LOW_SURROGATE_START)
 932                         ch = cha + (ch << 10) +
 933                             (0x10000
 934                             - CharUnicodeInfo.LOW_SURROGATE_START
 935                             - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
 936
 937                         pSrc++;
 938                     }
 939                     // else ch is still high surrogate and encoding will fail
 940
 941                     // attempt to encode the surrogate or partial surrogate
 942                     goto EncodeChar;
 943                 }
 944
 945                 // If we've used a fallback, then we have to check for it
 946                 if (fallbackBuffer != null)
 947                 {
 948                     ch = fallbackBuffer.InternalGetNextChar();
 949                     if (ch > 0) goto ProcessChar;
 950                 }
 951
 952                 // read next char. The JIT optimization seems to be getting confused when
 953                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 954                 ch = *pSrc;
 955                 pSrc++;
 956
 957             ProcessChar:
                     // A high surrogate is not encoded yet: loop around with ch still set so that the next
                     // iteration can try to pair it with the char that follows.
 958                 // if (IsHighSurrogate(ch)) {
 959                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
 960                 {
 961                     continue;
 962                 }
 963             // either good char or partial surrogate
 964
 965             EncodeChar:
 966                 // throw exception on partial surrogate if necessary
 967                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 968                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 969                 {
 970                     // Lone surrogates aren't allowed, we have to do fallback for them
 971                     // Have to make a fallback buffer if we don't have one
 972                     if (fallbackBuffer == null)
 973                     {
 974                         // wait on fallbacks if we can
 975                         // For fallback we may need a fallback buffer
 976                         if (baseEncoder == null)
 977                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 978                         else
 979                             fallbackBuffer = baseEncoder.FallbackBuffer;
 980
 981                         // Set our internal fallback interesting things.
 982                         fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
 983                     }
 984
 985                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 986                     // so the ref pSrc isn't gonna do anything.
 987                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
 988                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
 989                     pSrc = pSrcForFallback;
 990
 991                     // Ignore it if we don't throw
 992                     ch = 0;
 993                     continue;
 994                 }
 995
 996                 // Count bytes needed
 997                 int bytesNeeded = 1;
 998                 if (ch > 0x7F)
 999                 {
1000                     if (ch > 0x7FF)
1001                     {
1002                         if (ch > 0xFFFF)
1003                         {
1004                             bytesNeeded++;  // 4 bytes (surrogate pair)
1005                         }
1006                         bytesNeeded++;      // 3 bytes (800-FFFF)
1007                     }
1008                     bytesNeeded++;          // 2 bytes (80-7FF)
1009                 }
1010
                     // Not enough room left in the output for this char: undo reading it and stop
1011                 if (pTarget > pAllocatedBufferEnd - bytesNeeded)
1012                 {
1013                     // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1014                     if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1015                     {
1016                         fallbackBuffer.MovePrevious();              // Didn't use this fallback char
1017                         if (ch > 0xFFFF)
1018                             fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
1019                     }
1020                     else
1021                     {
1022                         pSrc--;                                     // Didn't use this char
1023                         if (ch > 0xFFFF)
1024                             pSrc--;                                 // Was surrogate, didn't use 2nd part either
1025                     }
1026                     Debug.Assert(pSrc >= chars || pTarget == bytes,
1027                         "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1028                     ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
1029                     ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
1030                     break;
1031                 }
1032
                     // Emit the bytes. The lead byte carries the top bits of ch under a 0xC0 / 0xE0 / 0xF0
                     // prefix; every following byte is 0x80 plus six bits of ch. Note that & binds tighter
                     // than |, so "0x80 | x & 0x3F" is "0x80 | (x & 0x3F)". The sign-extended sbyte prefix
                     // only matters in its low eight bits because each value is cast to byte when stored.
1033                 if (ch <= 0x7F)
1034                 {
1035                     *pTarget = (byte)ch;
1036                 }
1037                 else
1038                 {
1039                     // use separate helper variables for local contexts so that the jit optimizations
1040                     // won't get confused about the variable lifetimes
1041                     int chb;
1042                     if (ch <= 0x7FF)
1043                     {
1044                         // 2 byte encoding
1045                         chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1046                     }
1047                     else
1048                     {
1049                         if (ch <= 0xFFFF)
1050                         {
1051                             chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1052                         }
1053                         else
1054                         {
1055                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1056                             pTarget++;
1057
1058                             chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1059                         }
1060                         *pTarget = (byte)chb;
1061                         pTarget++;
1062
1063                         chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1064                     }
1065                     *pTarget = (byte)chb;
1066                     pTarget++;
1067
1068                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1069                 }
1070                 pTarget++;
1071
1072
1073 #if FASTLOOP
1074                 // If still have fallback don't do fast loop
1075                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1076                     goto ProcessChar;
1077
1078                 int availableChars = PtrDiff(pEnd, pSrc);
1079                 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1080
1081                 // don't fall into the fast encoding loop if we don't have enough characters
1082                 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1083                 if (availableChars <= 13)
1084                 {
1085                     // we are hoping for 1 byte per char
1086                     if (availableBytes < availableChars)
1087                     {
1088                         // not enough output room.  no pending bits at this point
1089                         ch = 0;
1090                         continue;
1091                     }
1092
1093                     // try to get over the remainder of the ascii characters fast though
1094                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1095                     while (pSrc < pLocalEnd)
1096                     {
1097                         ch = *pSrc;
1098                         pSrc++;
1099
1100                         // Not ASCII, need more than 1 byte per char
1101                         if (ch > 0x7F)
1102                             goto ProcessChar;
1103
1104                         *pTarget = (byte)ch;
1105                         pTarget++;
1106                     }
1107                     // we are done, let ch be 0 to clear encoder
1108                     ch = 0;
1109                     break;
1110                 }
1111
1112                 // we need at least 1 byte per character, but Convert might allow us to convert
1113                 // only part of the input, so try as much as we can.  Reduce charCount if necessary
1114                 if (availableBytes < availableChars)
1115                 {
1116                     availableChars = availableBytes;
1117                 }
1118
1119                 // FASTLOOP:
1120                 // - optimistic range checks
1121                 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1122
1123                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1124                 //  the boundary will be decreased for every non-ASCII character we encounter
1125                 // Also, we need 5 chars reserve for the unrolled ASCII encoding loop and for encoding of surrogates
1126                 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1127                 char* pStop = pSrc + availableChars - 5;
1128
1129                 while (pSrc < pStop)
1130                 {
1131                     ch = *pSrc;
1132                     pSrc++;
1133
1134                     if (ch > 0x7F)
1135                     {
1136                         goto LongCode;
1137                     }
1138                     *pTarget = (byte)ch;
1139                     pTarget++;
1140
1141                     // get pSrc aligned
1142                     if ((unchecked((int)pSrc) & 0x2) != 0)
1143                     {
1144                         ch = *pSrc;
1145                         pSrc++;
1146                         if (ch > 0x7F)
1147                         {
1148                             goto LongCode;
1149                         }
1150                         *pTarget = (byte)ch;
1151                         pTarget++;
1152                     }
1153
1154                     // Run 4 characters at a time!
1155                     while (pSrc < pStop)
1156                     {
1157                         ch = *(int*)pSrc;
1158                         int chc = *(int*)(pSrc + 2);
                             // Each int holds two chars; the mask checks all four at once for any bit
                             // above 0x7F, i.e. for anything that is not ASCII.
1159                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
1160                         {
1161                             goto LongCodeWithMask;
1162                         }
1163
1164                         // Unfortunately, this is endianness sensitive
1165 #if BIGENDIAN
1166                         *pTarget = (byte)(ch>>16);
1167                         *(pTarget+1) = (byte)ch;
1168                         pSrc += 4;
1169                         *(pTarget+2) = (byte)(chc>>16);
1170                         *(pTarget+3) = (byte)chc;
1171                         pTarget += 4;
1172 #else // BIGENDIAN
1173                         *pTarget = (byte)ch;
1174                         *(pTarget + 1) = (byte)(ch >> 16);
1175                         pSrc += 4;
1176                         *(pTarget + 2) = (byte)chc;
1177                         *(pTarget + 3) = (byte)(chc >> 16);
1178                         pTarget += 4;
1179 #endif // BIGENDIAN
1180                     }
1181                     continue;
1182
1183                 LongCodeWithMask:
                         // Keep only the first of the chars that were read as a group and advance past
                         // just that one; the others are read again by the loop.
1184 #if BIGENDIAN
1185                     // be careful about the sign extension
1186                     ch = (int)(((uint)ch) >> 16);
1187 #else // BIGENDIAN
1188                     ch = (char)ch;
1189 #endif // BIGENDIAN
1190                     pSrc++;
1191
1192                     if (ch > 0x7F)
1193                     {
1194                         goto LongCode;
1195                     }
1196                     *pTarget = (byte)ch;
1197                     pTarget++;
1198                     continue;
1199
1200                 LongCode:
1201                     // use separate helper variables for slow and fast loop so that the jit optimizations
1202                     // won't get confused about the variable lifetimes
1203                     int chd;
1204                     if (ch <= 0x7FF)
1205                     {
1206                         // 2 byte encoding
1207                         chd = unchecked((sbyte)0xC0) | (ch >> 6);
1208                     }
1209                     else
1210                     {
1211                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1212                         if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1213                         {
1214                             // 3 byte encoding
1215                             chd = unchecked((sbyte)0xE0) | (ch >> 12);
1216                         }
1217                         else
1218                         {
1219                             // 4 byte encoding - high surrogate + low surrogate
1220                             // if (!IsHighSurrogate(ch))
1221                             if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
1222                             {
1223                                 // low without high -> bad, try again in slow loop
1224                                 pSrc -= 1;
1225                                 break;
1226                             }
1227
1228                             chd = *pSrc;
1229                             pSrc++;
1230
1231                             // if (!IsLowSurrogate(chd)) {
1232                             if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1233                             {
1234                                 // high not followed by low -> bad, try again in slow loop
1235                                 pSrc -= 2;
1236                                 break;
1237                             }
1238
                                 // Combine the pair (ch = high, chd = low):
                                 // 0x10000 + ((high - HIGH_SURROGATE_START) << 10) + (low - LOW_SURROGATE_START)
1239                             ch = chd + (ch << 10) +
1240                                 (0x10000
1241                                 - CharUnicodeInfo.LOW_SURROGATE_START
1242                                 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
1243
1244                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1245                             // pStop - this byte is compensated by the second surrogate character
1246                             // 2 input chars require 4 output bytes.  2 have been anticipated already
1247                             // and 2 more will be accounted for by the 2 pStop-- calls below.
1248                             pTarget++;
1249
1250                             chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1251                         }
1252                         *pTarget = (byte)chd;
1253                         pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1254                         pTarget++;
1255
1256                         chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1257                     }
1258                     *pTarget = (byte)chd;
1259                     pStop--;                        // 2 byte sequence for 1 char so need pStop--.
1260                     pTarget++;
1261
1262                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1263                     // pStop - this byte is already included
1264                     pTarget++;
1265                 }
1266
1267                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1268
1269 #endif // FASTLOOP
1270
1271                 // no pending char at this point
1272                 ch = 0;
1273             }
1274
1275             // Do we have to set the encoder bytes?
                 // (saves the pending surrogate, if any, and the number of input chars consumed)
1276             if (encoder != null)
1277             {
1278                 Debug.Assert(!encoder.MustFlush || ch == 0,
1279                     "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1280
1281                 encoder.surrogateChar = ch;
1282                 encoder.m_charsUsed = (int)(pSrc - chars);
1283             }
1284
1285             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1286                 baseEncoder == null || !baseEncoder.m_throwOnOverflow,
1287                 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1288
                 // Number of bytes written to the output buffer
1289             return (int)(pTarget - bytes);
1290         }
1291
1292
1293         // Decoder state flags. While a multi-byte sequence is being decoded they sit in the higher
1294         // bits of the same int whose lower bits accumulate the character, and they are shifted
1295         // together with the character bits.
1296
1297         // bits 30 & 31 are used for pending bits fixup
1298         private const int FinalByte = 0x20000000;           // bit 29
1299         private const int SupplimentarySeq = 0x10000000;    // bit 28
1300         private const int ThreeByteSeq = 0x08000000;        // bit 27
1301
1302         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1303         //        If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
1304         //
1305         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1306         // kept the same as much as possible
1307         internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1308         {
                 // Counts the chars that decoding `count` bytes starting at `bytes` would produce; no output is written.
                 //
                 // bytes:       start of the input; must not be null.
                 // count:       number of input bytes; must be >= 0.
                 // baseDecoder: optional stateful decoder. When non-null it is cast to UTF8Decoder, its `bits` seed the
                 //              pending-sequence state (they are only read here, never written back), its FallbackBuffer
                 //              handles invalid bytes, and a sequence left unfinished at the end of the input is sent
                 //              to the fallback only if MustFlush is set. When null, a fallback buffer is created from
                 //              this.decoderFallback and an unfinished trailing sequence always goes to the fallback.
                 // Returns:     the char count, including whatever FallbackInvalidByteSequence reports for invalid input.
                 //
                 // Bookkeeping: charCount starts at one char per byte and is decremented as multi-byte sequences are
                 // recognized. `ch` holds the partially decoded sequence plus the state flags; (ch >> 30) is the signed
                 // correction that has to be given back to charCount if the sequence is abandoned at the current position.
1309             Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1310             Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
1311
1312             // Set up the read cursor and the end-of-input sentinel
1313             byte* pSrc = bytes;
1314             byte* pEnd = pSrc + count;
1315
1316             // Start by assuming we have as many as count, charCount always includes the adjustment
1317             // for the character being decoded
1318             int charCount = count;
1319             int ch = 0;
1320             DecoderFallbackBuffer fallback = null;
1321
1322             if (baseDecoder != null)
1323             {
1324                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1325                 ch = decoder.bits;
1326                 charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
1327
1328                 // Shouldn't have anything in fallback buffer for GetCharCount
1329                 // (don't have to check m_throwOnOverflow for count)
1330                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1331                     "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1332             }
1333
1334             for (;;)
1335             {
1336                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1337
1338                 if (pSrc >= pEnd)
1339                 {
1340                     break;
1341                 }
1342
1343                 if (ch == 0)
1344                 {
1345                     // no pending bits
1346                     goto ReadChar;
1347                 }
1348
1349                 // read next byte. The JIT optimization seems to be getting confused when
1350                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1351                 int cha = *pSrc;
1352                 pSrc++;
1353
1354                 // we are expecting to see trailing bytes like 10vvvvvv
1355                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1356                 {
1357                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1358                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1359                     pSrc--;
                         // undo the count adjustment that was made in anticipation of the rest of this sequence
1360                     charCount += (ch >> 30);
1361                     goto InvalidByteSequence;
1362                 }
1363
1364                 // fold in the new byte
1365                 ch = (ch << 6) | (cha & 0x3F);
1366
1367                 if ((ch & FinalByte) == 0)
1368                 {
                         // Not at the last byte yet; only 3- and 4-byte sequences can get here
1369                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1370                         "[UTF8Encoding.GetCharCount]Invariant violation");
1371
1372                     if ((ch & SupplimentarySeq) != 0)
1373                     {
1374                         if ((ch & (FinalByte >> 6)) != 0)
1375                         {
1376                             // this is 3rd byte (of 4 byte supplimentary) - nothing to do
1377                             continue;
1378                         }
1379
1380                         // 2nd byte, check for non-shortest form of supplimentary char and the valid
1381                         // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
1382                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1383                         {
1384                             goto InvalidByteSequence;
1385                         }
1386                     }
1387                     else
1388                     {
1389                         // Must be 2nd byte of a 3-byte sequence
1390                         // check for non-shortest form of 3 byte seq
1391                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1392                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1393                         {
1394                             goto InvalidByteSequence;
1395                         }
1396                     }
1397                     continue;
1398                 }
1399
1400                 // ready to punch
1401
1402                 // adjust for surrogates in non-shortest form: a 4-byte sequence whose code point bits 16-20 are
                     // all zero is counted as a single char, so give back one of the two chars reserved for it
1403                 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
1404                 {
1405                     charCount--;
1406                 }
1407                 goto EncodeChar;
1408
1409             InvalidByteSequence:
1410                 // this code fragment should be close to the gotos referencing it
1411                 // Have to do fallback for invalid bytes
1412                 if (fallback == null)
1413                 {
1414                     if (baseDecoder == null)
1415                         fallback = this.decoderFallback.CreateFallbackBuffer();
1416                     else
1417                         fallback = baseDecoder.FallbackBuffer;
1418                     fallback.InternalInitialize(bytes, null);
1419                 }
                     // add however many chars the fallback reports for the rejected bytes
1420                 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1421
1422                 ch = 0;
1423                 continue;
1424
1425             ReadChar:
1426                 ch = *pSrc;
1427                 pSrc++;
1428
1429             ProcessChar:
1430                 if (ch > 0x7F)
1431                 {
1432                     // If its > 0x7F, its start of a new multi-byte sequence
1433
1434                     // Long sequence, so unreserve our char.
1435                     charCount--;
1436
1437                     // bit 6 has to be non-zero for start of multibyte chars.
1438                     if ((ch & 0x40) == 0)
1439                     {
1440                         // Unexpected trail byte
1441                         goto InvalidByteSequence;
1442                     }
1443
1444                     // start a new long code
1445                     if ((ch & 0x20) != 0)
1446                     {
1447                         if ((ch & 0x10) != 0)
1448                         {
1449                             // 4 byte encoding - supplimentary character (2 surrogates)
1450
1451                             ch &= 0x0F;
1452
1453                             // check that bit 4 is zero and the valid supplimentary character
1454                             // range 0x000000 - 0x10FFFF at the same time
                                 // (i.e. lead bytes 0xF5 - 0xFF are rejected)
1455                             if (ch > 0x04)
1456                             {
                                     // restore the high nibble so the fallback is handed the original lead byte value
1457                                 ch |= 0xf0;
1458                                 goto InvalidByteSequence;
1459                             }
1460
1461                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1462                             // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
1463                             ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
1464                                   (1 << 30) |           // If it dies on next byte we'll need an extra char
1465                                   (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
1466                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1467                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1468
1469                             // Our character count will be 2 characters for these 4 bytes, so subtract another char
1470                             charCount--;
1471                         }
1472                         else
1473                         {
1474                             // 3 byte encoding
1475                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1476                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1477                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1478
1479                             // We'll expect 1 character for these 3 bytes, so subtract another char.
1480                             charCount--;
1481                         }
1482                     }
1483                     else
1484                     {
1485                         // 2 byte encoding
1486
1487                         ch &= 0x1F;
1488
1489                         // check for non-shortest form
1490                         if (ch <= 1)
1491                         {
                                 // restore the 110xxxxx prefix so the fallback is handed the original lead byte value
1492                             ch |= 0xc0;
1493                             goto InvalidByteSequence;
1494                         }
1495
1496                         // Add bit flags so we'll be flagged correctly
1497                         ch |= (FinalByte >> 6);
1498                     }
1499                     continue;
1500                 }
1501
1502             EncodeChar:
                     // Nothing is written in GetCharCount: the char was already reserved in charCount, so this label
                     // only marks the hand-over to the fast loop.
1503
1504 #if FASTLOOP
1505                 int availableBytes = PtrDiff(pEnd, pSrc);
1506
1507                 // don't fall into the fast decoding loop if we don't have enough bytes
1508                 if (availableBytes <= 13)
1509                 {
1510                     // try to get over the remainder of the ascii characters fast though
1511                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1512                     while (pSrc < pLocalEnd)
1513                     {
1514                         ch = *pSrc;
1515                         pSrc++;
1516
1517                         if (ch > 0x7F)
1518                             goto ProcessChar;
1519                     }
1520                     // we are done
1521                     ch = 0;
1522                     break;
1523                 }
1524
1525                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1526                 //  the boundary will be decreased for every non-ASCII character we encounter
1527                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1528                 byte* pStop = pSrc + availableBytes - 7;
1529
1530                 while (pSrc < pStop)
1531                 {
1532                     ch = *pSrc;
1533                     pSrc++;
1534
1535                     if (ch > 0x7F)
1536                     {
1537                         goto LongCode;
1538                     }
1539
1540                     // get pSrc 2-byte aligned
1541                     if ((unchecked((int)pSrc) & 0x1) != 0)
1542                     {
1543                         ch = *pSrc;
1544                         pSrc++;
1545                         if (ch > 0x7F)
1546                         {
1547                             goto LongCode;
1548                         }
1549                     }
1550
1551                     // get pSrc 4-byte aligned
1552                     if ((unchecked((int)pSrc) & 0x2) != 0)
1553                     {
1554                         ch = *(ushort*)pSrc;
1555                         if ((ch & 0x8080) != 0)
1556                         {
1557                             goto LongCodeWithMask16;
1558                         }
1559                         pSrc += 2;
1560                     }
1561
1562                     // Run 8 + 8 characters at a time!
1563                     while (pSrc < pStop)
1564                     {
1565                         ch = *(int*)pSrc;
1566                         int chb = *(int*)(pSrc + 4);
1567                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1568                         {
1569                             goto LongCodeWithMask32;
1570                         }
1571                         pSrc += 8;
1572
1573                         // This is a really small loop - unroll it
1574                         if (pSrc >= pStop)
1575                             break;
1576
1577                         ch = *(int*)pSrc;
1578                         chb = *(int*)(pSrc + 4);
1579                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1580                         {
1581                             goto LongCodeWithMask32;
1582                         }
1583                         pSrc += 8;
1584                     }
1585                     break;
1586
                         // A multi-byte read saw a non-ASCII byte somewhere in the word: isolate the first byte in
                         // memory, consume it, and if it is still ASCII keep stepping through the outer fast loop.
1587 #if BIGENDIAN
1588                 LongCodeWithMask32:
1589                     // be careful about the sign extension
1590                     ch = (int)(((uint)ch) >> 16);
1591                 LongCodeWithMask16:
1592                     ch = (int)(((uint)ch) >> 8);
1593 #else // BIGENDIAN
1594                 LongCodeWithMask32:
1595                 LongCodeWithMask16:
1596                     ch &= 0xFF;
1597 #endif // BIGENDIAN
1598                     pSrc++;
1599                     if (ch <= 0x7F)
1600                     {
1601                         continue;
1602                     }
1603
1604                 LongCode:
                         // ch is the lead byte (already consumed); chc is the byte that follows it
1605                     int chc = *pSrc;
1606                     pSrc++;
1607
1608                     if (
1609                         // bit 6 has to be non-zero for a lead byte (zero means a stray trail byte)
1610                         (ch & 0x40) == 0 ||
1611                         // we are expecting to see trailing bytes like 10vvvvvv
1612                         (chc & unchecked((sbyte)0xC0)) != 0x80)
1613                     {
1614                         goto BadLongCode;
1615                     }
1616
1617                     chc &= 0x3F;
1618
1619                     // start a new long code
1620                     if ((ch & 0x20) != 0)
1621                     {
1622                         // fold the first two bytes together
1623                         chc |= (ch & 0x0F) << 6;
1624
1625                         if ((ch & 0x10) != 0)
1626                         {
1627                             // 4 byte encoding - surrogate
                                 // (3rd and 4th bytes are only peeked at; pSrc moves past them once both are validated)
1628                             ch = *pSrc;
1629                             if (
1630                                 // check that bit 4 is zero, the non-shortest form of surrogate
1631                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1632                                 !InRange(chc >> 4, 0x01, 0x10) ||
1633                                 // we are expecting to see trailing bytes like 10vvvvvv
1634                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1635                             {
1636                                 goto BadLongCode;
1637                             }
1638
1639                             chc = (chc << 6) | (ch & 0x3F);
1640
1641                             ch = *(pSrc + 1);
1642                             // we are expecting to see trailing bytes like 10vvvvvv
1643                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
1644                             {
1645                                 goto BadLongCode;
1646                             }
1647                             pSrc += 2;
1648
1649                             // extra byte
1650                             charCount--;
1651                         }
1652                         else
1653                         {
1654                             // 3 byte encoding
1655                             ch = *pSrc;
1656                             if (
1657                                 // check for non-shortest form of 3 byte seq
1658                                 (chc & (0x1F << 5)) == 0 ||
1659                                 // Can't have surrogates here.
1660                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1661                                 // we are expecting to see trailing bytes like 10vvvvvv
1662                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1663                             {
1664                                 goto BadLongCode;
1665                             }
1666                             pSrc++;
1667
1668                             // extra byte
1669                             charCount--;
1670                         }
1671                     }
1672                     else
1673                     {
1674                         // 2 byte encoding
1675
1676                         // check for non-shortest form
1677                         if ((ch & 0x1E) == 0)
1678                         {
1679                             goto BadLongCode;
1680                         }
1681                     }
1682
1683                     // extra byte
1684                     charCount--;
1685                 }
1686 #endif // FASTLOOP
1687
1688                 // no pending bits at this point
1689                 ch = 0;
1690                 continue;
1691
1692             BadLongCode:
                     // The fast loop hit something it cannot validate. At every jump here pSrc is exactly two past
                     // the lead byte and charCount has not been adjusted yet, so back up to the lead byte and let
                     // the slow loop decode the sequence again with full checks.
1693                 pSrc -= 2;
1694                 ch = 0;
1695                 continue;
1696             }
1697
1698             // May have a problem if we have to flush
1699             if (ch != 0)
1700             {
1701                 // We were already adjusting for these, so need to unadjust
1702                 charCount += (ch >> 30);
1703                 if (baseDecoder == null || baseDecoder.MustFlush)
1704                 {
1705                     // Have to do fallback for invalid bytes
1706                     if (fallback == null)
1707                     {
1708                         if (baseDecoder == null)
1709                             fallback = this.decoderFallback.CreateFallbackBuffer();
1710                         else
1711                             fallback = baseDecoder.FallbackBuffer;
1712                         fallback.InternalInitialize(bytes, null);
1713                     }
1714                     charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1715                 }
1716             }
1717
1718             // Shouldn't have anything in fallback buffer for GetCharCount
1719             // (don't have to check m_throwOnOverflow for count)
1720             Debug.Assert(fallback == null || fallback.Remaining == 0,
1721                 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1722
1723             return charCount;
1724         }
1725
1726         // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
1727         //           So if we're really broken, then that could also throw an error... recursively.
1728         //           So try to make sure GetChars can at least process all uses by
1729         //           System.Resources.ResourceReader!
1730         //
1731         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1732         //        If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1733         //
1734         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1735         // kept the same as much as possible
1736         internal override unsafe int GetChars(byte* bytes, int byteCount,
1737                                                 char* chars, int charCount, DecoderNLS baseDecoder)
1738         {
1739             Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
1740             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
1741             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1742             Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
1743
1744             byte* pSrc = bytes;
1745             char* pTarget = chars;
1746
1747             byte* pEnd = pSrc + byteCount;
1748             char* pAllocatedBufferEnd = pTarget + charCount;
1749
1750             int ch = 0;
1751
1752             DecoderFallbackBuffer fallback = null;
1753             byte* pSrcForFallback;
1754             char* pTargetForFallback;
1755             if (baseDecoder != null)
1756             {
1757                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1758                 ch = decoder.bits;
1759
1760                 // Shouldn't have anything in fallback buffer for GetChars
1761                 // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
1762                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1763                     "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1764             }
1765
1766             for (;;)
1767             {
1768                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1769
1770                 if (pSrc >= pEnd)
1771                 {
1772                     break;
1773                 }
1774
1775                 if (ch == 0)
1776                 {
1777                     // no pending bits
1778                     goto ReadChar;
1779                 }
1780
1781                 // read next byte. The JIT optimization seems to be getting confused when
1782                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1783                 int cha = *pSrc;
1784                 pSrc++;
1785
1786                 // we are expecting to see trailing bytes like 10vvvvvv
1787                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1788                 {
1789                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1790                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1791                     pSrc--;
1792                     goto InvalidByteSequence;
1793                 }
1794
1795                 // fold in the new byte
1796                 ch = (ch << 6) | (cha & 0x3F);
1797
1798                 if ((ch & FinalByte) == 0)
1799                 {
1800                     // Not at last byte yet
1801                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1802                         "[UTF8Encoding.GetChars]Invariant volation");
1803
1804                     if ((ch & SupplimentarySeq) != 0)
1805                     {
1806                         // Its a 4-byte supplimentary sequence
1807                         if ((ch & (FinalByte >> 6)) != 0)
1808                         {
1809                             // this is 3rd byte of 4 byte sequence - nothing to do
1810                             continue;
1811                         }
1812
1813                         // 2nd byte of 4 bytes
1814                         // check for non-shortest form of surrogate and the valid surrogate
1815                         // range 0x000000 - 0x10FFFF at the same time
1816                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1817                         {
1818                             goto InvalidByteSequence;
1819                         }
1820                     }
1821                     else
1822                     {
1823                         // Must be 2nd byte of a 3-byte sequence
1824                         // check for non-shortest form of 3 byte seq
1825                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1826                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1827                         {
1828                             goto InvalidByteSequence;
1829                         }
1830                     }
1831                     continue;
1832                 }
1833
1834                 // ready to punch
1835
1836                 // surrogate in shortest form?
1837                 // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1838                 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
1839                 {
1840                     // let the range check for the second char throw the exception
1841                     if (pTarget < pAllocatedBufferEnd)
1842                     {
1843                         *pTarget = (char)(((ch >> 10) & 0x7FF) +
1844                             unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
1845                         pTarget++;
1846
1847                         ch = (ch & 0x3FF) +
1848                             unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1849                     }
1850                 }
1851
1852                 goto EncodeChar;
1853
1854             InvalidByteSequence:
1855                 // this code fragment should be close to the gotos referencing it
1856                 // Have to do fallback for invalid bytes
1857                 if (fallback == null)
1858                 {
1859                     if (baseDecoder == null)
1860                         fallback = this.decoderFallback.CreateFallbackBuffer();
1861                     else
1862                         fallback = baseDecoder.FallbackBuffer;
1863                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1864                 }
1865                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
1866                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
1867                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
1868                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
1869                 pSrc = pSrcForFallback;
1870                 pTarget = pTargetForFallback;
1871
1872                 if (!fallbackResult)
1873                 {
1874                     // Ran out of buffer space
1875                     // Need to throw an exception?
1876                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1877                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1878                     fallback.InternalReset();
1879                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1880                     ch = 0;
1881                     break;
1882                 }
1883                 Debug.Assert(pSrc >= bytes,
1884                     "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1885                 ch = 0;
1886                 continue;
1887
1888             ReadChar:
1889                 ch = *pSrc;
1890                 pSrc++;
1891
1892             ProcessChar:
1893                 if (ch > 0x7F)
1894                 {
1895                     // If its > 0x7F, its start of a new multi-byte sequence
1896
1897                     // bit 6 has to be non-zero
1898                     if ((ch & 0x40) == 0)
1899                     {
1900                         goto InvalidByteSequence;
1901                     }
1902
1903                     // start a new long code
1904                     if ((ch & 0x20) != 0)
1905                     {
1906                         if ((ch & 0x10) != 0)
1907                         {
1908                             // 4 byte encoding - supplimentary character (2 surrogates)
1909
1910                             ch &= 0x0F;
1911
1912                             // check that bit 4 is zero and the valid supplimentary character
1913                             // range 0x000000 - 0x10FFFF at the same time
1914                             if (ch > 0x04)
1915                             {
1916                                 ch |= 0xf0;
1917                                 goto InvalidByteSequence;
1918                             }
1919
1920                             ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1921                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1922                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1923                         }
1924                         else
1925                         {
1926                             // 3 byte encoding
1927                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1928                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1929                         }
1930                     }
1931                     else
1932                     {
1933                         // 2 byte encoding
1934
1935                         ch &= 0x1F;
1936
1937                         // check for non-shortest form
1938                         if (ch <= 1)
1939                         {
1940                             ch |= 0xc0;
1941                             goto InvalidByteSequence;
1942                         }
1943
1944                         ch |= (FinalByte >> 6);
1945                     }
1946                     continue;
1947                 }
1948
1949             EncodeChar:
1950                 // write the pending character
1951                 if (pTarget >= pAllocatedBufferEnd)
1952                 {
1953                     // Fix chars so we make sure to throw if we didn't output anything
1954                     ch &= 0x1fffff;
1955                     if (ch > 0x7f)
1956                     {
1957                         if (ch > 0x7ff)
1958                         {
1959                             if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1960                                 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1961                             {
1962                                 pSrc--;     // It was 4 bytes
1963                                 pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
1964                             }
1965                             else if (ch > 0xffff)
1966                             {
1967                                 pSrc--;     // It was 4 bytes, nothing was stored
1968                             }
1969                             pSrc--;         // It was at least 3 bytes
1970                         }
1971                         pSrc--;             // It was at least 2 bytes
1972                     }
1973                     pSrc--;
1974
1975                     // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1976                     // a 4 byte sequence alredy)
1977                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1978                         "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1979                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1980
1981                     // Don't store ch in decoder, we already backed up to its start
1982                     ch = 0;
1983
1984                     // Didn't throw, just use this buffer size.
1985                     break;
1986                 }
1987                 *pTarget = (char)ch;
1988                 pTarget++;
1989
1990 #if FASTLOOP
1991                 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1992                 int availableBytes = PtrDiff(pEnd, pSrc);
1993
1994                 // don't fall into the fast decoding loop if we don't have enough bytes
1995                 // Test for availableChars is done because pStop would be <= pTarget.
1996                 if (availableBytes <= 13)
1997                 {
1998                     // we may need as many as 1 character per byte
1999                     if (availableChars < availableBytes)
2000                     {
2001                         // not enough output room.  no pending bits at this point
2002                         ch = 0;
2003                         continue;
2004                     }
2005
2006                     // try to get over the remainder of the ascii characters fast though
2007                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2008                     while (pSrc < pLocalEnd)
2009                     {
2010                         ch = *pSrc;
2011                         pSrc++;
2012
2013                         if (ch > 0x7F)
2014                             goto ProcessChar;
2015
2016                         *pTarget = (char)ch;
2017                         pTarget++;
2018                     }
2019                     // we are done
2020                     ch = 0;
2021                     break;
2022                 }
2023
2024                 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
2025                 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
2026                 if (availableChars < availableBytes)
2027                 {
2028                     availableBytes = availableChars;
2029                 }
2030
2031                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2032                 //  the boundary will be decreased for every non-ASCII character we encounter
2033                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
2034                 char* pStop = pTarget + availableBytes - 7;
2035
2036                 while (pTarget < pStop)
2037                 {
2038                     ch = *pSrc;
2039                     pSrc++;
2040
2041                     if (ch > 0x7F)
2042                     {
2043                         goto LongCode;
2044                     }
2045                     *pTarget = (char)ch;
2046                     pTarget++;
2047
2048                     // get pSrc to be 2-byte aligned
2049                     if ((unchecked((int)pSrc) & 0x1) != 0)
2050                     {
2051                         ch = *pSrc;
2052                         pSrc++;
2053                         if (ch > 0x7F)
2054                         {
2055                             goto LongCode;
2056                         }
2057                         *pTarget = (char)ch;
2058                         pTarget++;
2059                     }
2060
2061                     // get pSrc to be 4-byte aligned
2062                     if ((unchecked((int)pSrc) & 0x2) != 0)
2063                     {
2064                         ch = *(ushort*)pSrc;
2065                         if ((ch & 0x8080) != 0)
2066                         {
2067                             goto LongCodeWithMask16;
2068                         }
2069
2070                         // Unfortunately, this is endianness sensitive
2071 #if BIGENDIAN
2072                         *pTarget = (char)((ch >> 8) & 0x7F);
2073                         pSrc += 2;
2074                         *(pTarget+1) = (char)(ch & 0x7F);
2075                         pTarget += 2;
2076 #else // BIGENDIAN
2077                         *pTarget = (char)(ch & 0x7F);
2078                         pSrc += 2;
2079                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2080                         pTarget += 2;
2081 #endif // BIGENDIAN
2082                     }
2083
2084                     // Run 8 characters at a time!
2085                     while (pTarget < pStop)
2086                     {
2087                         ch = *(int*)pSrc;
2088                         int chb = *(int*)(pSrc + 4);
2089                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
2090                         {
2091                             goto LongCodeWithMask32;
2092                         }
2093
2094                         // Unfortunately, this is endianness sensitive
2095 #if BIGENDIAN
2096                         *pTarget = (char)((ch >> 24) & 0x7F);
2097                         *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2098                         *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2099                         *(pTarget+3) = (char)(ch & 0x7F);
2100                         pSrc += 8;
2101                         *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2102                         *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2103                         *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2104                         *(pTarget+7) = (char)(chb & 0x7F);
2105                         pTarget += 8;
2106 #else // BIGENDIAN
2107                         *pTarget = (char)(ch & 0x7F);
2108                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2109                         *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
2110                         *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
2111                         pSrc += 8;
2112                         *(pTarget + 4) = (char)(chb & 0x7F);
2113                         *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
2114                         *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
2115                         *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
2116                         pTarget += 8;
2117 #endif // BIGENDIAN
2118                     }
2119                     break;
2120
2121 #if BIGENDIAN
2122                 LongCodeWithMask32:
2123                     // be careful about the sign extension
2124                     ch = (int)(((uint)ch) >> 16);
2125                 LongCodeWithMask16:
2126                     ch = (int)(((uint)ch) >> 8);
2127 #else // BIGENDIAN
2128                 LongCodeWithMask32:
2129                 LongCodeWithMask16:
2130                     ch &= 0xFF;
2131 #endif // BIGENDIAN
2132                     pSrc++;
2133                     if (ch <= 0x7F)
2134                     {
2135                         *pTarget = (char)ch;
2136                         pTarget++;
2137                         continue;
2138                     }
2139
2140                 LongCode:
2141                     int chc = *pSrc;
2142                     pSrc++;
2143
2144                     if (
2145                         // bit 6 has to be zero
2146                         (ch & 0x40) == 0 ||
2147                         // we are expecting to see trailing bytes like 10vvvvvv
2148                         (chc & unchecked((sbyte)0xC0)) != 0x80)
2149                     {
2150                         goto BadLongCode;
2151                     }
2152
2153                     chc &= 0x3F;
2154
2155                     // start a new long code
2156                     if ((ch & 0x20) != 0)
2157                     {
2158                         // fold the first two bytes together
2159                         chc |= (ch & 0x0F) << 6;
2160
2161                         if ((ch & 0x10) != 0)
2162                         {
2163                             // 4 byte encoding - surrogate
2164                             ch = *pSrc;
2165                             if (
2166                                 // check that bit 4 is zero, the non-shortest form of surrogate
2167                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2168                                 !InRange(chc >> 4, 0x01, 0x10) ||
2169                                 // we are expecting to see trailing bytes like 10vvvvvv
2170                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2171                             {
2172                                 goto BadLongCode;
2173                             }
2174
2175                             chc = (chc << 6) | (ch & 0x3F);
2176
2177                             ch = *(pSrc + 1);
2178                             // we are expecting to see trailing bytes like 10vvvvvv
2179                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
2180                             {
2181                                 goto BadLongCode;
2182                             }
2183                             pSrc += 2;
2184
2185                             ch = (chc << 6) | (ch & 0x3F);
2186
2187                             *pTarget = (char)(((ch >> 10) & 0x7FF) +
2188                                 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
2189                             pTarget++;
2190
2191                             ch = (ch & 0x3FF) +
2192                                 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2193
2194                             // extra byte, we're already planning 2 chars for 2 of these bytes,
2195                             // but the big loop is testing the target against pStop, so we need
2196                             // to subtract 2 more or we risk overrunning the input.  Subtract
2197                             // one here and one below.
2198                             pStop--;
2199                         }
2200                         else
2201                         {
2202                             // 3 byte encoding
2203                             ch = *pSrc;
2204                             if (
2205                                 // check for non-shortest form of 3 byte seq
2206                                 (chc & (0x1F << 5)) == 0 ||
2207                                 // Can't have surrogates here.
2208                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2209                                 // we are expecting to see trailing bytes like 10vvvvvv
2210                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2211                             {
2212                                 goto BadLongCode;
2213                             }
2214                             pSrc++;
2215
2216                             ch = (chc << 6) | (ch & 0x3F);
2217
2218                             // extra byte, we're only expecting 1 char for each of these 3 bytes,
2219                             // but the loop is testing the target (not source) against pStop, so
2220                             // we need to subtract 2 more or we risk overrunning the input.
2221                             // Subtract 1 here and one more below
2222                             pStop--;
2223                         }
2224                     }
2225                     else
2226                     {
2227                         // 2 byte encoding
2228
2229                         ch &= 0x1F;
2230
2231                         // check for non-shortest form
2232                         if (ch <= 1)
2233                         {
2234                             goto BadLongCode;
2235                         }
2236                         ch = (ch << 6) | chc;
2237                     }
2238
2239                     *pTarget = (char)ch;
2240                     pTarget++;
2241
2242                     // extra byte, we're only expecting 1 char for each of these 2 bytes,
2243                     // but the loop is testing the target (not source) against pStop.
2244                     // subtract an extra count from pStop so that we don't overrun the input.
2245                     pStop--;
2246                 }
2247 #endif // FASTLOOP
2248
2249                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2250
2251                 // no pending bits at this point
2252                 ch = 0;
2253                 continue;
2254
2255             BadLongCode:
2256                 pSrc -= 2;
2257                 ch = 0;
2258                 continue;
2259             }
2260
2261             if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2262             {
2263                 // Have to do fallback for invalid bytes
2264                 if (fallback == null)
2265                 {
2266                     if (baseDecoder == null)
2267                         fallback = this.decoderFallback.CreateFallbackBuffer();
2268                     else
2269                         fallback = baseDecoder.FallbackBuffer;
2270                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2271                 }
2272
2273                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2274                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
2275                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
2276                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
2277                 pSrc = pSrcForFallback;
2278                 pTarget = pTargetForFallback;
2279
2280                 if (!fallbackResult)
2281                 {
2282                     Debug.Assert(pSrc >= bytes || pTarget == chars,
2283                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2284
2285                     // Ran out of buffer space
2286                     // Need to throw an exception?
2287                     fallback.InternalReset();
2288                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
2289                 }
2290                 Debug.Assert(pSrc >= bytes,
2291                     "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2292                 ch = 0;
2293             }
2294
2295             if (baseDecoder != null)
2296             {
2297                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2298
2299                 // If we're storing flush data we expect all bits to be used or else
2300                 // we're stuck in the middle of a conversion
2301                 Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
2302                     "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2303
2304                 // Remember our leftover bits.
2305                 decoder.bits = ch;
2306
2307                 baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
2308             }
2309
2310             // Shouldn't have anything in fallback buffer for GetChars
2311             // (don't have to check m_throwOnOverflow for chars)
2312             Debug.Assert(fallback == null || fallback.Remaining == 0,
2313                 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2314
2315             return PtrDiff(pTarget, chars);
2316         }
2317
2318         // During GetChars we had an invalid byte sequence.
2319         // pSrc is backed up to the start of the bad sequence if we didn't have room to
2320         // fall it back.  Otherwise pSrc remains where it is.
2321         private unsafe bool FallbackInvalidByteSequence(
2322             ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2323         {
2324             // Get our byte[]
2325             byte* pStart = pSrc;
2326             byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2327
2328             // Do the actual fallback
2329             if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2330             {
2331                 // Oops, it failed, back up to pStart
2332                 pSrc = pStart;
2333                 return false;
2334             }
2335
2336             // It worked
2337             return true;
2338         }
2339
2340         // During GetCharCount we had an invalid byte sequence
2341         // pSrc is used to find the index that points to the invalid bytes,
2342         // however the byte[] contains the fallback bytes (in case the index is -1)
2343         private unsafe int FallbackInvalidByteSequence(
2344             byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2345         {
2346             // Get our byte[]
2347             byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2348
2349             // Do the actual fallback
2350             int count = fallback.InternalFallback(bytesUnknown, pSrc);
2351
2352             // # of fallback chars expected.
2353             // Note that we only get here for "long" sequences, and have already unreserved
2354             // the count that we prereserved for the input bytes
2355             return count;
2356         }
2357
2358         // Note that some of these bytes may have come from a previous fallback, so we cannot
2359         // just decrement the pointer and use the values we read.  In those cases we have
2360         // to regenerate the original values.
2361         private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2362         {
2363             // Get our byte[]
2364             byte[] bytesUnknown = null;
2365
2366             // See if it was a plain char
2367             // (have to check >= 0 because we have all sorts of wierd bit flags)
2368             if (ch < 0x100 && ch >= 0)
2369             {
2370                 pSrc--;
2371                 bytesUnknown = new byte[] { unchecked((byte)ch) };
2372             }
2373             // See if its an unfinished 2 byte sequence
2374             else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
2375             {
2376                 pSrc--;
2377                 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2378             }
2379             // So now we're either 2nd byte of 3 or 4 byte sequence or
2380             // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2381             // 1st check if its a 4 byte sequence
2382             else if ((ch & SupplimentarySeq) != 0)
2383             {
2384                 //  3rd byte of 4 byte sequence?
2385                 if ((ch & (FinalByte >> 6)) != 0)
2386                 {
2387                     // 3rd byte of 4 byte sequence
2388                     pSrc -= 3;
2389                     bytesUnknown = new byte[] {
2390                         unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2391                         unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2392                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2393                 }
2394                 else if ((ch & (FinalByte >> 12)) != 0)
2395                 {
2396                     // 2nd byte of a 4 byte sequence
2397                     pSrc -= 2;
2398                     bytesUnknown = new byte[] {
2399                         unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2400                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2401                 }
2402                 else
2403                 {
2404                     // 4th byte of a 4 byte sequence
2405                     pSrc--;
2406                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
2407                 }
2408             }
2409             else
2410             {
2411                 // 2nd byte of 3 byte sequence?
2412                 if ((ch & (FinalByte >> 6)) != 0)
2413                 {
2414                     // So its 2nd byte of a 3 byte sequence
2415                     pSrc -= 2;
2416                     bytesUnknown = new byte[] {
2417                         unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2418                 }
2419                 else
2420                 {
2421                     // 1st byte of a 3 byte sequence
2422                     pSrc--;
2423                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
2424                 }
2425             }
2426
2427             return bytesUnknown;
2428         }
2429
2430
2431         public override Decoder GetDecoder()
2432         {
2433             return new UTF8Decoder(this);
2434         }
2435
2436
2437         public override Encoder GetEncoder()
2438         {
2439             return new UTF8Encoder(this);
2440         }
2441
2442
2443         public override int GetMaxByteCount(int charCount)
2444         {
2445             if (charCount < 0)
2446                 throw new ArgumentOutOfRangeException(nameof(charCount),
2447                      SR.ArgumentOutOfRange_NeedNonNegNum);
2448             Contract.EndContractBlock();
2449
2450             // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
2451             long byteCount = (long)charCount + 1;
2452
2453             if (EncoderFallback.MaxCharCount > 1)
2454                 byteCount *= EncoderFallback.MaxCharCount;
2455
2456             // Max 3 bytes per char.  (4 bytes per 2 chars for surrogates)
2457             byteCount *= 3;
2458
2459             if (byteCount > 0x7fffffff)
2460                 throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
2461
2462             return (int)byteCount;
2463         }
2464
2465
2466         public override int GetMaxCharCount(int byteCount)
2467         {
2468             if (byteCount < 0)
2469                 throw new ArgumentOutOfRangeException(nameof(byteCount),
2470                      SR.ArgumentOutOfRange_NeedNonNegNum);
2471             Contract.EndContractBlock();
2472
2473             // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
2474             long charCount = ((long)byteCount + 1);
2475
2476             // Non-shortest form would fall back, so get max count from fallback.
2477             // So would 11... followed by 11..., so you could fall back every byte
2478             if (DecoderFallback.MaxCharCount > 1)
2479             {
2480                 charCount *= DecoderFallback.MaxCharCount;
2481             }
2482
2483             if (charCount > 0x7fffffff)
2484                 throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
2485
2486             return (int)charCount;
2487         }
2488
2489
2490         public override byte[] GetPreamble()
2491         {
2492             if (_emitUTF8Identifier)
2493             {
2494                 // Allocate new array to prevent users from modifying it.
2495                 return new byte[3] { 0xEF, 0xBB, 0xBF };
2496             }
2497             else
2498                 return Array.Empty<byte>();
2499         }
2500
2501
2502         public override bool Equals(Object value)
2503         {
2504             UTF8Encoding that = value as UTF8Encoding;
2505             if (that != null)
2506             {
2507                 return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
2508                        (EncoderFallback.Equals(that.EncoderFallback)) &&
2509                        (DecoderFallback.Equals(that.DecoderFallback));
2510             }
2511             return (false);
2512         }
2513
2514
2515         public override int GetHashCode()
2516         {
2517             //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
2518             return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
2519                    UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
2520         }
2521
2522         private sealed class UTF8Encoder : EncoderNLS, ISerializable
2523         {
2524             // We must save a high surrogate value until the next call, looking
2525             // for a low surrogate value.
2526             internal int surrogateChar;
2527
2528             public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2529             {
2530                 // base calls reset
2531             }
2532
2533             // ISerializable implementation
2534             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2535             {
2536                 throw new PlatformNotSupportedException();
2537             }
2538
2539             public override void Reset()
2540
2541             {
2542                 this.surrogateChar = 0;
2543                 if (m_fallbackBuffer != null)
2544                     m_fallbackBuffer.Reset();
2545             }
2546
2547             // Anything left in our encoder?
2548             internal override bool HasState
2549             {
2550                 get
2551                 {
2552                     return (this.surrogateChar != 0);
2553                 }
2554             }
2555         }
2556
2557         private sealed class UTF8Decoder : DecoderNLS, ISerializable
2558         {
2559             // We'll need to remember the previous information. See the comments around definition
2560             // of FinalByte for details.
2561             internal int bits;
2562
2563             public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2564             {
2565                 // base calls reset
2566             }
2567
2568             // Constructor called by serialization, have to handle deserializing from Everett
2569             internal UTF8Decoder(SerializationInfo info, StreamingContext context)
2570             {
2571                 throw new PlatformNotSupportedException();
2572             }
2573
2574             // ISerializable implementation, get data for this object
2575             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2576             {
2577                 throw new PlatformNotSupportedException();
2578             }
2579
2580             public override void Reset()
2581             {
2582                 this.bits = 0;
2583                 if (m_fallbackBuffer != null)
2584                     m_fallbackBuffer.Reset();
2585             }
2586
2587             // Anything left in our decoder?
2588             internal override bool HasState
2589             {
2590                 get
2591                 {
2592                     return (this.bits != 0);
2593                 }
2594             }
2595         }
2596     }
2597 }