src/mscorlib/shared/System/Text/UTF8Encoding.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 // The worker functions in this file were optimized for performance. If you make changes
   6 // you should take care to consider all of the interesting cases.
   7
   8 // The code of all worker functions in this file is written twice: once as a slow loop, and a
   9 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  10 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  11 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  12
  13 // This define can be used to turn off the fast loops. Useful for finding whether
  14 // the problem is fastloop-specific.
  15 #define FASTLOOP
  16
  17 using System;
  18 using System.Diagnostics;
  19 using System.Diagnostics.Contracts;
  20 using System.Globalization;
  21
  22 namespace System.Text
  23 {
  24     // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
  25     // Unicode characters with variable numbers of bytes per character,
  26     // optimized for the 128 ASCII characters (U+0000..U+007F).  It's an efficient way
  27     // of encoding US English in an internationalizable way.
  28     //
  29     // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  30     //
  31     // The UTF-8 byte order mark is simply the Unicode byte order mark
  32     // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
  33     // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  34     // switch the byte orderings.
  35
  36     public class UTF8Encoding : Encoding
  37     {
  38         /*
  39             bytes   bits    UTF-8 representation
  40             -----   ----    -----------------------------------
  41             1        7      0vvvvvvv
  42             2       11      110vvvvv 10vvvvvv
  43             3       16      1110vvvv 10vvvvvv 10vvvvvv
  44             4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  45             -----   ----    -----------------------------------
  46
  47             Surrogate:
  48             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  49         */
  50
  51         private const int UTF8_CODEPAGE = 65001;
  52
  53         // Allow for de-virtualization (see https://github.com/dotnet/coreclr/pull/9230)
  54         internal sealed class UTF8EncodingSealed : UTF8Encoding
  55         {
  56             public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { }
  57         }
  58
  59         // Used by Encoding.UTF8 for lazy initialization
  60         // The initialization code will not be run until a static member of the class is referenced
  61         internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
  62
  63         // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
  64         // the standard.
  65         private bool _emitUTF8Identifier = false;
  66
  67         private bool _isThrowException = false;
  68
  69
  70         public UTF8Encoding() : this(false)
  71         {
  72         }
  73
  74
  75         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) :
  76             this(encoderShouldEmitUTF8Identifier, false)
  77         {
  78         }
  79
  80
  81         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) :
  82             base(UTF8_CODEPAGE)
  83         {
  84             _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  85             _isThrowException = throwOnInvalidBytes;
  86
  87             // Encoding's constructor already did this, but it'll be wrong if we're throwing exceptions
  88             if (_isThrowException)
  89                 SetDefaultFallbacks();
  90         }
  91
  92         internal override void SetDefaultFallbacks()
  93         {
  94             // For UTF-X encodings, we use a replacement fallback with an empty string
  95             if (_isThrowException)
  96             {
  97                 this.encoderFallback = EncoderFallback.ExceptionFallback;
  98                 this.decoderFallback = DecoderFallback.ExceptionFallback;
  99             }
 100             else
 101             {
 102                 this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
 103                 this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
 104             }
 105         }
 106
 107
 108         // WARNING: GetByteCount(string chars)
 109         // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
 110         // WARNING: otherwise it'll break VB's way of declaring these.
 111         //
 112         // The following methods are copied from EncodingNLS.cs.
 113         // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
 114         // These should be kept in sync for the following classes:
 115         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 116
 117         // Returns the number of bytes required to encode a range of characters in
 118         // a character array.
 119         //
 120         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 121         // So if you fix this, fix the others.  Currently those include:
 122         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 123         // parent method is safe
 124
 125         public override unsafe int GetByteCount(char[] chars, int index, int count)
 126         {
 127             // Validate input parameters
 128             if (chars == null)
 129                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 130
 131             if (index < 0 || count < 0)
 132                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
 133
 134             if (chars.Length - index < count)
 135                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
 136             Contract.EndContractBlock();
 137
 138             // If no input, return 0, avoid fixed empty array problem
 139             if (count == 0)
 140                 return 0;
 141
 142             // Just call the pointer version
 143             fixed (char* pChars = chars)
 144                 return GetByteCount(pChars + index, count, null);
 145         }
 146
 147         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 148         // So if you fix this, fix the others.  Currently those include:
 149         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 150         // parent method is safe
 151
 152         public override unsafe int GetByteCount(String chars)
 153         {
 154             // Validate input
 155             if (chars==null)
 156                 throw new ArgumentNullException("s");
 157             Contract.EndContractBlock();
 158
 159             fixed (char* pChars = chars)
 160                 return GetByteCount(pChars, chars.Length, null);
 161         }
 162
 163         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 164         // So if you fix this, fix the others.  Currently those include:
 165         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 166
 167         [CLSCompliant(false)]
 168         public override unsafe int GetByteCount(char* chars, int count)
 169         {
 170             // Validate Parameters
 171             if (chars == null)
 172                 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
 173
 174             if (count < 0)
 175                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 176             Contract.EndContractBlock();
 177
 178             // Call it with empty encoder
 179             return GetByteCount(chars, count, null);
 180         }
 181
 182         // Parent method is safe.
 183         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 184         // So if you fix this, fix the others.  Currently those include:
 185         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 186
 187         public override unsafe int GetBytes(String s, int charIndex, int charCount,
 188                                               byte[] bytes, int byteIndex)
 189         {
 190             if (s == null || bytes == null)
 191                 throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array);
 192
 193             if (charIndex < 0 || charCount < 0)
 194                 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
 195
 196             if (s.Length - charIndex < charCount)
 197                 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
 198
 199             if (byteIndex < 0 || byteIndex > bytes.Length)
 200                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
 201             Contract.EndContractBlock();
 202
 203             int byteCount = bytes.Length - byteIndex;
 204
 205             // Fixed doesn't like 0 length arrays.
 206             if (bytes.Length == 0)
 207                 bytes = new byte[1];
 208
 209             fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
 210                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
 211         }
 212
 213         // Encodes a range of characters in a character array into a range of bytes
 214         // in a byte array. An exception occurs if the byte array is not large
 215         // enough to hold the complete encoding of the characters. The
 216         // GetByteCount method can be used to determine the exact number of
 217         // bytes that will be produced for a given range of characters.
 218         // Alternatively, the GetMaxByteCount method can be used to
 219         // determine the maximum number of bytes that will be produced for a given
 220         // number of characters, regardless of the actual character values.
 221         //
 222         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 223         // So if you fix this, fix the others.  Currently those include:
 224         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 225         // parent method is safe
 226
 227         public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
 228                                                byte[] bytes, int byteIndex)
 229         {
 230             // Validate parameters
 231             if (chars == null || bytes == null)
 232                 throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array);
 233
 234             if (charIndex < 0 || charCount < 0)
 235                 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
 236
 237             if (chars.Length - charIndex < charCount)
 238                 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
 239
 240             if (byteIndex < 0 || byteIndex > bytes.Length)
 241                 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
 242             Contract.EndContractBlock();
 243
 244             // If nothing to encode return 0, avoid fixed problem
 245             if (charCount == 0)
 246                 return 0;
 247
 248             // Just call pointer version
 249             int byteCount = bytes.Length - byteIndex;
 250
 251             // Fixed doesn't like 0 length arrays.
 252             if (bytes.Length == 0)
 253                 bytes = new byte[1];
 254
 255             fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
 256                 // Remember that byteCount is # to decode, not size of array.
 257                 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
 258         }
 259
 260         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 261         // So if you fix this, fix the others.  Currently those include:
 262         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 263
 264         [CLSCompliant(false)]
 265         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
 266         {
 267             // Validate Parameters
 268             if (bytes == null || chars == null)
 269                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
 270
 271             if (charCount < 0 || byteCount < 0)
 272                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
 273             Contract.EndContractBlock();
 274
 275             return GetBytes(chars, charCount, bytes, byteCount, null);
 276         }
 277
 278         // Returns the number of characters produced by decoding a range of bytes
 279         // in a byte array.
 280         //
 281         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 282         // So if you fix this, fix the others.  Currently those include:
 283         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 284         // parent method is safe
 285
 286         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
 287         {
 288             // Validate Parameters
 289             if (bytes == null)
 290                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 291
 292             if (index < 0 || count < 0)
 293                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
 294
 295             if (bytes.Length - index < count)
 296                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 297             Contract.EndContractBlock();
 298
 299             // If no input just return 0, fixed doesn't like 0 length arrays.
 300             if (count == 0)
 301                 return 0;
 302
 303             // Just call pointer version
 304             fixed (byte* pBytes = bytes)
 305                 return GetCharCount(pBytes + index, count, null);
 306         }
 307
 308         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 309         // So if you fix this, fix the others.  Currently those include:
 310         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 311
 312         [CLSCompliant(false)]
 313         public override unsafe int GetCharCount(byte* bytes, int count)
 314         {
 315             // Validate Parameters
 316             if (bytes == null)
 317                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 318
 319             if (count < 0)
 320                 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
 321             Contract.EndContractBlock();
 322
 323             return GetCharCount(bytes, count, null);
 324         }
 325
 326         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 327         // So if you fix this, fix the others.  Currently those include:
 328         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 329         // parent method is safe
 330
 331         public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
 332                                               char[] chars, int charIndex)
 333         {
 334             // Validate Parameters
 335             if (bytes == null || chars == null)
 336                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
 337
 338             if (byteIndex < 0 || byteCount < 0)
 339                 throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
 340
 341             if ( bytes.Length - byteIndex < byteCount)
 342                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 343
 344             if (charIndex < 0 || charIndex > chars.Length)
 345                 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
 346             Contract.EndContractBlock();
 347
 348             // If no input, return 0 & avoid fixed problem
 349             if (byteCount == 0)
 350                 return 0;
 351
 352             // Just call pointer version
 353             int charCount = chars.Length - charIndex;
 354
 355             // Fixed doesn't like 0 length arrays.
 356             if (chars.Length == 0)
 357                 chars = new char[1];
 358
 359             fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
 360                 // Remember that charCount is # to decode, not size of array
 361                 return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
 362         }
 363
 364         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 365         // So if you fix this, fix the others.  Currently those include:
 366         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 367
 368         [CLSCompliant(false)]
 369         public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
 370         {
 371             // Validate Parameters
 372             if (bytes == null || chars == null)
 373                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
 374
 375             if (charCount < 0 || byteCount < 0)
 376                 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
 377             Contract.EndContractBlock();
 378
 379             return GetChars(bytes, byteCount, chars, charCount, null);
 380         }
 381
 382         // Returns a string containing the decoded representation of a range of
 383         // bytes in a byte array.
 384         //
 385         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 386         // So if you fix this, fix the others.  Currently those include:
 387         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 388         // parent method is safe
 389
 390         public override unsafe String GetString(byte[] bytes, int index, int count)
 391         {
 392             // Validate Parameters
 393             if (bytes == null)
 394                 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
 395
 396             if (index < 0 || count < 0)
 397                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
 398
 399             if (bytes.Length - index < count)
 400                 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
 401             Contract.EndContractBlock();
 402
 403             // Avoid problems with empty input buffer
 404             if (count == 0) return String.Empty;
 405
 406             fixed (byte* pBytes = bytes)
 407                 return String.CreateStringFromEncoding(
 408                     pBytes + index, count, this);
 409         }
 410
 411         //
 412         // End of standard methods copied from EncodingNLS.cs
 413         //
 414
 415         // To simplify maintenance, the structure of GetByteCount and GetBytes should be
 416         // kept the same as much as possible
 417         internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
 418         {
 419             // For fallback we may need a fallback buffer.
 420             // We wait to initialize it though in case we don't have any broken input unicode
 421             EncoderFallbackBuffer fallbackBuffer = null;
 422             char* pSrcForFallback;
 423
 424             char* pSrc = chars;
 425             char* pEnd = pSrc + count;
 426
 427             // Start by assuming we have as many as count
 428             int byteCount = count;
 429
 430             int ch = 0;
 431
 432             if (baseEncoder != null)
 433             {
 434                 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
 435                 ch = encoder.surrogateChar;
 436
 437                 // We mustn't have left over fallback data when counting
 438                 if (encoder.InternalHasFallbackBuffer)
 439                 {
 440                     fallbackBuffer = encoder.FallbackBuffer;
 441                     if (fallbackBuffer.Remaining > 0)
 442                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
 443
 444                     // Set our internal fallback interesting things.
 445                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
 446                 }
 447             }
 448
 449             for (;;)
 450             {
 451                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 452                 if (pSrc >= pEnd)
 453                 {
 454                     if (ch == 0)
 455                     {
 456                         // Unroll any fallback that happens at the end
 457                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 458                         if (ch > 0)
 459                         {
 460                             byteCount++;
 461                             goto ProcessChar;
 462                         }
 463                     }
 464                     else
 465                     {
 466                         // Case of surrogates in the fallback.
 467                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
 468                         {
 469                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 470                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 471
 472                             ch = fallbackBuffer.InternalGetNextChar();
 473                             byteCount++;
 474
 475                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 476                             {
 477                                 ch = 0xfffd;
 478                                 byteCount++;
 479                                 goto EncodeChar;
 480                             }
 481                             else if (ch > 0)
 482                             {
 483                                 goto ProcessChar;
 484                             }
 485                             else
 486                             {
 487                                 byteCount--; // ignore last one.
 488                                 break;
 489                             }
 490                         }
 491                     }
 492
 493                     if (ch <= 0)
 494                     {
 495                         break;
 496                     }
 497                     if (baseEncoder != null && !baseEncoder.MustFlush)
 498                     {
 499                         break;
 500                     }
 501
 502                     // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
 503                     byteCount++;
 504                     goto EncodeChar;
 505                 }
 506
 507                 if (ch > 0)
 508                 {
 509                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 510                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 511
 512                     // use separate helper variables for local contexts so that the jit optimizations
 513                     // won't get confused about the variable lifetimes
 514                     int cha = *pSrc;
 515
 516                     // count the pending surrogate
 517                     byteCount++;
 518
 519                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
 520                     // if (IsLowSurrogate(cha)) {
 521                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 522                     {
 523                         // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
 524                         ch = 0xfffd;
 525                         //                        ch = cha + (ch << 10) +
 526                         //                            (0x10000
 527                         //                            - CharUnicodeInfo.LOW_SURROGATE_START
 528                         //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
 529
 530                         // Use this next char
 531                         pSrc++;
 532                     }
 533                     // else ch is still high surrogate and encoding will fail (so don't add count)
 534
 535                     // attempt to encode the surrogate or partial surrogate
 536                     goto EncodeChar;
 537                 }
 538
 539                 // If we've used a fallback, then we have to check for it
 540                 if (fallbackBuffer != null)
 541                 {
 542                     ch = fallbackBuffer.InternalGetNextChar();
 543                     if (ch > 0)
 544                     {
 545                         // We have an extra byte we weren't expecting.
 546                         byteCount++;
 547                         goto ProcessChar;
 548                     }
 549                 }
 550
 551                 // read next char. The JIT optimization seems to be getting confused when
 552                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 553                 ch = *pSrc;
 554                 pSrc++;
 555
 556             ProcessChar:
 557                 // if (IsHighSurrogate(ch)) {
 558                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
 559                 {
 560                     // we will count this surrogate next time around
 561                     byteCount--;
 562                     continue;
 563                 }
 564             // either good char or partial surrogate
 565
 566             EncodeChar:
 567                 // throw exception on partial surrogate if necessary
 568                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 569                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 570                 {
 571                     // Lone surrogates aren't allowed
 572                     // Have to make a fallback buffer if we don't have one
 573                     if (fallbackBuffer == null)
 574                     {
 575                         // wait on fallbacks if we can
 576                         // For fallback we may need a fallback buffer
 577                         if (baseEncoder == null)
 578                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 579                         else
 580                             fallbackBuffer = baseEncoder.FallbackBuffer;
 581
 582                         // Set our internal fallback interesting things.
 583                         fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
 584                     }
 585
 586                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 587                     // so the ref pSrc isn't gonna do anything.
 588                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
 589                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
 590                     pSrc = pSrcForFallback;
 591
 592                     // Ignore it if we don't throw (we had preallocated this ch)
 593                     byteCount--;
 594                     ch = 0;
 595                     continue;
 596                 }
 597
 598                 // Count them
 599                 if (ch > 0x7F)
 600                 {
 601                     if (ch > 0x7FF)
 602                     {
 603                         // the extra surrogate byte was compensated by the second surrogate character
 604                         // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
 605                         byteCount++;
 606                     }
 607                     byteCount++;
 608                 }
 609
 610 #if BIT64
 611                 // check for overflow
 612                 if (byteCount < 0)
 613                 {
 614                     break;
 615                 }
 616 #endif
 617
 618 #if FASTLOOP
 619                 // If still have fallback don't do fast loop
 620                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
 621                 {
 622                     // We're reserving 1 byte for each char by default
 623                     byteCount++;
 624                     goto ProcessChar;
 625                 }
 626
 627                 int availableChars = PtrDiff(pEnd, pSrc);
 628
 629                 // don't fall into the fast decoding loop if we don't have enough characters
 630                 if (availableChars <= 13)
 631                 {
 632                     // try to get over the remainder of the ascii characters fast though
 633                     char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
 634                     while (pSrc < pLocalEnd)
 635                     {
 636                         ch = *pSrc;
 637                         pSrc++;
 638                         if (ch > 0x7F)
 639                             goto ProcessChar;
 640                     }
 641
 642                     // we are done
 643                     break;
 644                 }
 645
 646 #if BIT64
 647                 // make sure that we won't get a silent overflow inside the fast loop
 648                 // (Fall out to slow loop if we have this many characters)
 649                 availableChars &= 0x0FFFFFFF;
 650 #endif
 651
 652                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
 653                 //  the boundary will be decreased for every non-ASCII character we encounter
 654                 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
 655                 char* pStop = pSrc + availableChars - (3 + 4);
 656
 657                 while (pSrc < pStop)
 658                 {
 659                     ch = *pSrc;
 660                     pSrc++;
 661
 662                     if (ch > 0x7F)                                                  // Not ASCII
 663                     {
 664                         if (ch > 0x7FF)                                             // Not 2 Byte
 665                         {
 666                             if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
 667                                 goto LongCode;
 668                             byteCount++;
 669                         }
 670                         byteCount++;
 671                     }
 672
 673                     // get pSrc aligned
 674                     if ((unchecked((int)pSrc) & 0x2) != 0)
 675                     {
 676                         ch = *pSrc;
 677                         pSrc++;
 678                         if (ch > 0x7F)                                              // Not ASCII
 679                         {
 680                             if (ch > 0x7FF)                                         // Not 2 Byte
 681                             {
 682                                 if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
 683                                     goto LongCode;
 684                                 byteCount++;
 685                             }
 686                             byteCount++;
 687                         }
 688                     }
 689
 690                     // Run 2 * 4 characters at a time!
 691                     while (pSrc < pStop)
 692                     {
 693                         ch = *(int*)pSrc;
 694                         int chc = *(int*)(pSrc + 2);
 695                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 696                         {
 697                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 698                             {
 699                                 goto LongCodeWithMask;
 700                             }
 701
 702
 703                             if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
 704                                 byteCount++;
 705                             if ((ch & unchecked((int)0xFF80)) != 0)
 706                                 byteCount++;
 707                             if ((chc & unchecked((int)0xFF800000)) != 0)
 708                                 byteCount++;
 709                             if ((chc & unchecked((int)0xFF80)) != 0)
 710                                 byteCount++;
 711                         }
 712                         pSrc += 4;
 713
 714                         ch = *(int*)pSrc;
 715                         chc = *(int*)(pSrc + 2);
 716                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 717                         {
 718                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 719                             {
 720                                 goto LongCodeWithMask;
 721                             }
 722
 723                             if ((ch & unchecked((int)0xFF800000)) != 0)
 724                                 byteCount++;
 725                             if ((ch & unchecked((int)0xFF80)) != 0)
 726                                 byteCount++;
 727                             if ((chc & unchecked((int)0xFF800000)) != 0)
 728                                 byteCount++;
 729                             if ((chc & unchecked((int)0xFF80)) != 0)
 730                                 byteCount++;
 731                         }
 732                         pSrc += 4;
 733                     }
 734                     break;
 735
 736                 LongCodeWithMask:
 737 #if BIGENDIAN
 738                     // be careful about the sign extension
 739                     ch = (int)(((uint)ch) >> 16);
 740 #else // BIGENDIAN
 741                     ch = (char)ch;
 742 #endif // BIGENDIAN
 743                     pSrc++;
 744
 745                     if (ch <= 0x7F)
 746                     {
 747                         continue;
 748                     }
 749
 750                 LongCode:
 751                     // use separate helper variables for slow and fast loop so that the jit optimizations
 752                     // won't get confused about the variable lifetimes
 753                     if (ch > 0x7FF)
 754                     {
 755                         // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 756                         if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 757                         {
 758                             // 4 byte encoding - high surrogate + low surrogate
 759
 760                             int chd = *pSrc;
 761                             if (
 762                                 // !IsHighSurrogate(ch) // low without high -> bad
 763                                 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
 764                                 // !IsLowSurrogate(chd) // high not followed by low -> bad
 765                                 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 766                             {
 767                                 // Back up and drop out to slow loop to figure out error
 768                                 pSrc--;
 769                                 break;
 770                             }
 771                             pSrc++;
 772
 773                             // byteCount - this byte is compensated by the second surrogate character
 774                         }
 775                         byteCount++;
 776                     }
 777                     byteCount++;
 778
 779                     // byteCount - the last byte is already included
 780                 }
 781 #endif // FASTLOOP
 782
 783                 // no pending char at this point
 784                 ch = 0;
 785             }
 786
 787 #if BIT64
 788             // check for overflow
 789             if (byteCount < 0)
 790             {
 791                 throw new ArgumentException(
 792                         SR.Argument_ConversionOverflow);
 793             }
 794 #endif
 795
 796             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
 797                 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
 798
 799             return byteCount;
 800         }
 801
 802         // Number of chars from b up to a. The byte distance is narrowed to uint and then halved;
 803         // unsigned arithmetic is good enough for us and tends to generate better code than signed.
 804         unsafe private static int PtrDiff(char* a, char* b)
 805         {
 806             uint distanceInBytes = (uint)((byte*)a - (byte*)b);
 807             return (int)(distanceInBytes >> 1);
 808         }
 809
 810         // byte* flavor just for parity: the number of bytes from b up to a, narrowed to int
 811         unsafe private static int PtrDiff(byte* a, byte* b)
 812         {
                 long distanceInBytes = a - b;
 813             return (int)distanceInBytes;
 814         }
 815
 816         private static bool InRange(int ch, int start, int end)
 817         {
                 // One unsigned comparison tests both bounds (for start <= end): when ch is below start,
                 // the difference reinterpreted as uint is larger than the width of the range.
                 uint offsetFromStart = (uint)(ch - start);
                 uint rangeWidth = (uint)(end - start);
 818             return offsetFromStart <= rangeWidth;
 819         }
 820
 821         // Our workhorse: encodes the UTF-16 chars in [chars, chars + charCount) as UTF-8 into the
             // buffer [bytes, bytes + byteCount).
             //
             // Parameters:
             //   chars, charCount - input chars; asserted non-null / non-negative (Debug.Assert only).
             //   bytes, byteCount - output buffer and its capacity in bytes; asserted likewise.
             //   baseEncoder      - optional stateful encoder. When non-null it is cast to UTF8Encoder, any
             //                      pending char (expected to be a high surrogate) is picked up from
             //                      encoder.surrogateChar, and on normal exit encoder.surrogateChar and
             //                      encoder._charsUsed are written back.
             //
             // Returns the number of bytes written to 'bytes' (pTarget - bytes).
             //
             // Lone or mismatched surrogates are handed to the encoder fallback buffer; what comes back
             // (replacement chars, nothing, an exception) is up to that fallback. When the output buffer
             // cannot hold the next character, ThrowBytesOverflow is called and, if it returns, encoding
             // stops at that character. An ArgumentException is thrown up front when the encoder's
             // fallback buffer still has chars remaining and encoder._throwOnOverflow is set.
             //
             // Structure: an outer slow loop that handles every special case and, under FASTLOOP, an inner
             // fast loop for runs of ordinary input that drops back to the slow loop for anything unusual.
             //
 822         // Note:  We ignore mismatched surrogates, unless the exception flag is set in which case we throw
 823         internal override unsafe int GetBytes(char* chars, int charCount,
 824                                                 byte* bytes, int byteCount, EncoderNLS baseEncoder)
 825         {
 826             Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
 827             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
 828             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
 829             Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
 830
 831             UTF8Encoder encoder = null;
 832
 833             // For fallback we may need a fallback buffer.
 834             // We wait to initialize it though in case we don't have any broken input unicode
 835             EncoderFallbackBuffer fallbackBuffer = null;
 836             char* pSrcForFallback;
 837
 838             char* pSrc = chars;
 839             byte* pTarget = bytes;
 840
 841             char* pEnd = pSrc + charCount;
 842             byte* pAllocatedBufferEnd = pTarget + byteCount;
 843
                 // ch is the char (or combined surrogate pair) currently being processed; a nonzero value
                 // at the top of the slow loop is a high surrogate still waiting for its low half.
 844             int ch = 0;
 845
 846             // assume that JIT will en-register pSrc, pTarget and ch
 847
 848             if (baseEncoder != null)
 849             {
 850                 encoder = (UTF8Encoder)baseEncoder;
 851                 ch = encoder.surrogateChar;
 852
 853                 // Leftover fallback data is only an error when the encoder is set to throw on overflow
 854                 if (encoder.InternalHasFallbackBuffer)
 855                 {
 856                     // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
 857                     fallbackBuffer = encoder.FallbackBuffer;
 858                     if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
 859                         throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
 860
 861                     // Set our internal fallback interesting things.
 862                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
 863                 }
 864             }
 865
 866             for (;;)
 867             {
 868                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 869
 870                 if (pSrc >= pEnd)
 871                 {
 872                     if (ch == 0)
 873                     {
 874                         // Check if there's anything left to get out of the fallback buffer
 875                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 876                         if (ch > 0)
 877                         {
 878                             goto ProcessChar;
 879                         }
 880                     }
 881                     else
 882                     {
 883                         // Case of leftover surrogates in the fallback buffer
 884                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
 885                         {
 886                             Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 887                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 888
 889                             int cha = ch;
 890
 891                             ch = fallbackBuffer.InternalGetNextChar();
 892
 893                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 894                             {
 895                                 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
 896                                 goto EncodeChar;
 897                             }
 898                             else if (ch > 0)
 899                             {
 900                                 goto ProcessChar;
 901                             }
 902                             else
 903                             {
 904                                 break;
 905                             }
 906                         }
 907                     }
 908
 909                     // attempt to encode the partial surrogate (will fail or ignore)
 910                     if (ch > 0 && (encoder == null || encoder.MustFlush))
 911                         goto EncodeChar;
 912
 913                     // We're done
 914                     break;
 915                 }
 916
 917                 if (ch > 0)
 918                 {
 919                     // We have a high surrogate left over from a previous loop.
 920                     Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 921                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 922
 923                     // use separate helper variables for local contexts so that the jit optimizations
 924                     // won't get confused about the variable lifetimes
 925                     int cha = *pSrc;
 926
 927                     // On the previous char, we encountered a high surrogate, so we are expecting a low surrogate here.
 928                     // if (IsLowSurrogate(cha)) {
 929                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 930                     {
 931                         ch = cha + (ch << 10) +
 932                             (0x10000
 933                             - CharUnicodeInfo.LOW_SURROGATE_START
 934                             - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
 935
 936                         pSrc++;
 937                     }
 938                     // else ch is still high surrogate and encoding will fail
 939
 940                     // attempt to encode the surrogate or partial surrogate
 941                     goto EncodeChar;
 942                 }
 943
 944                 // If we've used a fallback, then we have to check for it
 945                 if (fallbackBuffer != null)
 946                 {
 947                     ch = fallbackBuffer.InternalGetNextChar();
 948                     if (ch > 0) goto ProcessChar;
 949                 }
 950
 951                 // read next char. The JIT optimization seems to be getting confused when
 952                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 953                 ch = *pSrc;
 954                 pSrc++;
 955
 956             ProcessChar:
 957                 // if (IsHighSurrogate(ch)) {
 958                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
 959                 {
 960                     continue;   // keep it in ch; the top of the loop pairs it with the next char (or handles end of input)
 961                 }
 962             // either good char or partial surrogate
 963
 964             EncodeChar:
 965                 // throw exception on partial surrogate if necessary
 966                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 967                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 968                 {
 969                     // Lone surrogates aren't allowed, we have to do fallback for them
 970                     // Have to make a fallback buffer if we don't have one
 971                     if (fallbackBuffer == null)
 972                     {
 973                         // wait on fallbacks if we can
 974                         // For fallback we may need a fallback buffer
 975                         if (baseEncoder == null)
 976                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 977                         else
 978                             fallbackBuffer = baseEncoder.FallbackBuffer;
 979
 980                         // Set our internal fallback interesting things.
 981                         fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
 982                     }
 983
 984                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 985                     // so the ref pSrc isn't gonna do anything.
 986                     pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
 987                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
 988                     pSrc = pSrcForFallback;
 989
 990                     // Ignore it if we don't throw
 991                     ch = 0;
 992                     continue;
 993                 }
 994
 995                 // Count bytes needed
 996                 int bytesNeeded = 1;
 997                 if (ch > 0x7F)
 998                 {
 999                     if (ch > 0x7FF)
1000                     {
1001                         if (ch > 0xFFFF)
1002                         {
1003                             bytesNeeded++;  // 4 bytes (surrogate pair)
1004                         }
1005                         bytesNeeded++;      // 3 bytes (800-FFFF)
1006                     }
1007                     bytesNeeded++;          // 2 bytes (80-7FF)
1008                 }
1009
                     // Not enough room left in the output buffer for this character: un-read it, then
                     // report the overflow and stop.
1010                 if (pTarget > pAllocatedBufferEnd - bytesNeeded)
1011                 {
1012                     // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1013                     if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1014                     {
1015                         fallbackBuffer.MovePrevious();              // Didn't use this fallback char
1016                         if (ch > 0xFFFF)
1017                             fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
1018                     }
1019                     else
1020                     {
1021                         pSrc--;                                     // Didn't use this char
1022                         if (ch > 0xFFFF)
1023                             pSrc--;                                 // Was surrogate, didn't use 2nd part either
1024                     }
1025                     Debug.Assert(pSrc >= chars || pTarget == bytes,
1026                         "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1027                     ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
1028                     ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
1029                     break;
1030                 }
1031
1032                 if (ch <= 0x7F)
1033                 {
1034                     *pTarget = (byte)ch;
1035                 }
1036                 else
1037                 {
1038                     // use separate helper variables for local contexts so that the jit optimizations
1039                     // won't get confused about the variable lifetimes
1040                     int chb;
1041                     if (ch <= 0x7FF)
1042                     {
1043                         // 2 byte encoding
1044                         chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1045                     }
1046                     else
1047                     {
1048                         if (ch <= 0xFFFF)
1049                         {
1050                             chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1051                         }
1052                         else
1053                         {
1054                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1055                             pTarget++;
1056
1057                             chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1058                         }
1059                         *pTarget = (byte)chb;
1060                         pTarget++;
1061
1062                         chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1063                     }
1064                     *pTarget = (byte)chb;
1065                     pTarget++;
1066
1067                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1068                 }
1069                 pTarget++;
1070
1071
1072 #if FASTLOOP
1073                 // If still have fallback don't do fast loop
1074                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1075                     goto ProcessChar;
1076
1077                 int availableChars = PtrDiff(pEnd, pSrc);
1078                 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1079
1080                 // don't fall into the fast decoding loop if we don't have enough characters
1081                 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1082                 if (availableChars <= 13)
1083                 {
1084                     // we are hoping for 1 byte per char
1085                     if (availableBytes < availableChars)
1086                     {
1087                         // not enough output room.  no pending bits at this point
1088                         ch = 0;
1089                         continue;
1090                     }
1091
1092                     // try to get over the remainder of the ascii characters fast though
1093                     char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
1094                     while (pSrc < pLocalEnd)
1095                     {
1096                         ch = *pSrc;
1097                         pSrc++;
1098
1099                         // Not ASCII, need more than 1 byte per char
1100                         if (ch > 0x7F)
1101                             goto ProcessChar;
1102
1103                         *pTarget = (byte)ch;
1104                         pTarget++;
1105                     }
1106                     // we are done, let ch be 0 to clear encoder
1107                     ch = 0;
1108                     break;
1109                 }
1110
1111                 // we need at least 1 byte per character, but Convert might allow us to convert
1112                 // only part of the input, so try as much as we can.  Reduce charCount if necessary
1113                 if (availableBytes < availableChars)
1114                 {
1115                     availableChars = availableBytes;
1116                 }
1117
1118                 // FASTLOOP:
1119                 // - optimistic range checks
1120                 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1121
1122                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1123                 //  the boundary will be decreased for every non-ASCII character we encounter
1124                 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
1125                 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1126                 char* pStop = pSrc + availableChars - 5;
1127
1128                 while (pSrc < pStop)
1129                 {
1130                     ch = *pSrc;
1131                     pSrc++;
1132
1133                     if (ch > 0x7F)
1134                     {
1135                         goto LongCode;
1136                     }
1137                     *pTarget = (byte)ch;
1138                     pTarget++;
1139
1140                     // get pSrc aligned: when bit 1 of the address is set, consume one more char so the int
                         // reads below start on a 4-byte boundary (assumes chars itself is 2-byte aligned)
1141                     if ((unchecked((int)pSrc) & 0x2) != 0)
1142                     {
1143                         ch = *pSrc;
1144                         pSrc++;
1145                         if (ch > 0x7F)
1146                         {
1147                             goto LongCode;
1148                         }
1149                         *pTarget = (byte)ch;
1150                         pTarget++;
1151                     }
1152
1153                     // Run 4 characters at a time!
                         // Two int reads pick up 4 chars; the 0xFF80FF80 mask is nonzero iff any of them is above 0x7F.
1154                     while (pSrc < pStop)
1155                     {
1156                         ch = *(int*)pSrc;
1157                         int chc = *(int*)(pSrc + 2);
1158                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
1159                         {
1160                             goto LongCodeWithMask;
1161                         }
1162
1163                         // Unfortunately, this is endianness sensitive
1164 #if BIGENDIAN
1165                         *pTarget = (byte)(ch>>16);
1166                         *(pTarget+1) = (byte)ch;
1167                         pSrc += 4;
1168                         *(pTarget+2) = (byte)(chc>>16);
1169                         *(pTarget+3) = (byte)chc;
1170                         pTarget += 4;
1171 #else // BIGENDIAN
1172                         *pTarget = (byte)ch;
1173                         *(pTarget + 1) = (byte)(ch >> 16);
1174                         pSrc += 4;
1175                         *(pTarget + 2) = (byte)chc;
1176                         *(pTarget + 3) = (byte)(chc >> 16);
1177                         pTarget += 4;
1178 #endif // BIGENDIAN
1179                     }
1180                     continue;
1181
1182                 LongCodeWithMask:
                         // ch holds two packed chars here; keep only the first one in memory order
                         // (upper half on big-endian, lower half otherwise) and step past it.
1183 #if BIGENDIAN
1184                     // be careful about the sign extension
1185                     ch = (int)(((uint)ch) >> 16);
1186 #else // BIGENDIAN
1187                     ch = (char)ch;
1188 #endif // BIGENDIAN
1189                     pSrc++;
1190
1191                     if (ch > 0x7F)
1192                     {
1193                         goto LongCode;
1194                     }
1195                     *pTarget = (byte)ch;
1196                     pTarget++;
1197                     continue;
1198
1199                 LongCode:
1200                     // use separate helper variables for slow and fast loop so that the jit optimizations
1201                     // won't get confused about the variable lifetimes
1202                     int chd;
1203                     if (ch <= 0x7FF)
1204                     {
1205                         // 2 byte encoding
1206                         chd = unchecked((sbyte)0xC0) | (ch >> 6);
1207                     }
1208                     else
1209                     {
1210                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1211                         if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1212                         {
1213                             // 3 byte encoding
1214                             chd = unchecked((sbyte)0xE0) | (ch >> 12);
1215                         }
1216                         else
1217                         {
1218                             // 4 byte encoding - high surrogate + low surrogate
1219                             // if (!IsHighSurrogate(ch))
1220                             if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
1221                             {
1222                                 // low without high -> bad, try again in slow loop
1223                                 pSrc -= 1;
1224                                 break;
1225                             }
1226
1227                             chd = *pSrc;
1228                             pSrc++;
1229
1230                             // if (!IsLowSurrogate(chd)) {
1231                             if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
1232                             {
1233                                 // high not followed by low -> bad, try again in slow loop
1234                                 pSrc -= 2;
1235                                 break;
1236                             }
1237
1238                             ch = chd + (ch << 10) +
1239                                 (0x10000
1240                                 - CharUnicodeInfo.LOW_SURROGATE_START
1241                                 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
1242
1243                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1244                             // pStop - this byte is compensated by the second surrogate character
1245                             // 2 input chars require 4 output bytes.  2 have been anticipated already
1246                             // and 2 more will be accounted for by the 2 pStop-- calls below.
1247                             pTarget++;
1248
1249                             chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1250                         }
1251                         *pTarget = (byte)chd;
1252                         pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1253                         pTarget++;
1254
1255                         chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1256                     }
1257                     *pTarget = (byte)chd;
1258                     pStop--;                        // 2 byte sequence for 1 char so need pStop--.
1259                     pTarget++;
1260
1261                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1262                     // pStop - this byte is already included
1263                     pTarget++;
1264                 }
1265
1266                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1267
1268 #endif // FASTLOOP
1269
1270                 // no pending char at this point
1271                 ch = 0;
1272             }
1273
1274             // Do we have to set the encoder bytes?
1275             if (encoder != null)
1276             {
1277                 Debug.Assert(!encoder.MustFlush || ch == 0,
1278                     "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1279
                     // Save any pending char and how many input chars were consumed
1280                 encoder.surrogateChar = ch;
1281                 encoder._charsUsed = (int)(pSrc - chars);
1282             }
1283
1284             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1285                 baseEncoder == null || !baseEncoder._throwOnOverflow,
1286                 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1287
1288             return (int)(pTarget - bytes);
1289         }
1290
1291
1292         // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
1293         // while the actual character is being built in the lower bits. They are shifted together
1294         // with the actual bits of the character.
1295
1296         // bits 30 & 31 are used for pending bits fixup (GetCharCount adjusts its count by "bits >> 30")
1297         private const int FinalByte = 1 << 29;          // bit 29
1298         private const int SupplimentarySeq = 1 << 28;   // bit 28 (sic: "Supplementary"; identifier left unchanged)
1299         private const int ThreeByteSeq = 1 << 27;       // bit 27
1300
1301         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1302         //        If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
1303         //
1304         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1305         // kept the same as much as possible.
             //
             // GetCharCount computes the number of chars that decoding 'count' bytes starting at 'bytes'
             // produces.
             //   bytes       - start of the input; asserted non-null.
             //   count       - number of input bytes; asserted non-negative.
             //   baseDecoder - optional decoder (cast to UTF8Decoder). When non-null, counting resumes from
             //                 the partial sequence in its 'bits' field and its FallbackBuffer / MustFlush
             //                 are used. This method only reads the decoder's state; it never stores to it.
             //   returns     - the char count, including whatever FallbackInvalidByteSequence reports for
             //                 every invalid sequence encountered.
             //
             // Strategy: start with one char per byte (charCount = count) and take one off for every byte
             // that turns out not to produce a char of its own.
1306         internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1307         {
1308             Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1309             Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
1310
1311             // Set up the read cursor and the end-of-input sentinel
1312             byte* pSrc = bytes;
1313             byte* pEnd = pSrc + count;
1314
1315             // Start by assuming we have as many as count, charCount always includes the adjustment
1316             // for the character being decoded
1317             int charCount = count;
                 // ch is the decoder accumulator: character bits in the low bits, state flags
                 // (FinalByte / SupplimentarySeq / ThreeByteSeq / bits 30-31) in the high bits. 0 means
                 // that no multibyte sequence is pending.
1318             int ch = 0;
                 // Created lazily, the first time an invalid sequence is seen
1319             DecoderFallbackBuffer fallback = null;
1320
1321             if (baseDecoder != null)
1322             {
1323                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1324                 ch = decoder.bits;
1325                 charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
1326
1327                 // Shouldn't have anything in fallback buffer for GetCharCount
1328                 // (don't have to check _throwOnOverflow for count)
1329                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1330                     "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1331             }
1332
1333             for (;;)
1334             {
1335                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1336
1337                 if (pSrc >= pEnd)
1338                 {
1339                     break;
1340                 }
1341
1342                 if (ch == 0)
1343                 {
1344                     // no pending bits
1345                     goto ReadChar;
1346                 }
1347
1348                 // A sequence is pending, so read its next byte. The JIT optimization seems to be getting
1349                 // confused when compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1350                 int cha = *pSrc;
1351                 pSrc++;
1352
1353                 // we are expecting to see trailing bytes like 10vvvvvv
                     // ((sbyte)0xC0 widens to an int whose low byte is 0xC0; cha is 0..255, so this tests
                     // just the top two bits of the byte)
1354                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1355                 {
1356                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1357                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1358                     pSrc--;
                         // The pending sequence is abandoned: undo the adjustment kept in bits 30-31
1359                     charCount += (ch >> 30);
1360                     goto InvalidByteSequence;
1361                 }
1362
1363                 // fold in the new byte
1364                 ch = (ch << 6) | (cha & 0x3F);
1365
1366                 if ((ch & FinalByte) == 0)
1367                 {
                         // Not at the last byte yet. Only 3- and 4-byte sequences can get here.
                         // NOTE(review): the assert message below names GetChars although this is
                         // GetCharCount; it is a runtime string and is left untouched here.
1368                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1369                         "[UTF8Encoding.GetChars]Invariant volation");
1370
1371                     if ((ch & SupplimentarySeq) != 0)
1372                     {
1373                         if ((ch & (FinalByte >> 6)) != 0)
1374                         {
1375                             // this is 3rd byte (of 4 byte supplementary) - nothing to do
1376                             continue;
1377                         }
1378
1379                         // 2nd byte, check for non-shortest form of supplementary char and the valid
1380                         // supplementary characters in range 0x010000 - 0x10FFFF at the same time
                             // (at this point bits 4-8 of ch are bits 16-20 of the code point)
1381                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1382                         {
1383                             goto InvalidByteSequence;
1384                         }
1385                     }
1386                     else
1387                     {
1388                         // Must be 2nd byte of a 3-byte sequence
1389                         // check for non-shortest form of 3 byte seq
1390                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1391                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1392                         {
1393                             goto InvalidByteSequence;
1394                         }
1395                     }
1396                     continue;
1397                 }
1398
1399                 // The final byte has been folded in; the sequence is complete
1400
1401                 // adjust for surrogates in non-shortest form
                     // (a completed 4-byte sequence whose code point bits 16-20 are all zero)
1402                 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
1403                 {
1404                     charCount--;
1405                 }
1406                 goto EncodeChar;
1407
1408             InvalidByteSequence:
1409                 // this code fragment should be close to the goto referencing it
1410                 // Have to do fallback for invalid bytes
1411                 if (fallback == null)
1412                 {
1413                     if (baseDecoder == null)
1414                         fallback = this.decoderFallback.CreateFallbackBuffer();
1415                     else
1416                         fallback = baseDecoder.FallbackBuffer;
1417                     fallback.InternalInitialize(bytes, null);
1418                 }
                     // Add however many chars the fallback reports for the bad sequence
1419                 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1420
                     // Nothing is pending any more; go around the slow loop again
1421                 ch = 0;
1422                 continue;
1423
1424             ReadChar:
1425                 ch = *pSrc;
1426                 pSrc++;
1427
1428             ProcessChar:
1429                 if (ch > 0x7F)
1430                 {
1431                     // If it's > 0x7F, it's the start of a new multi-byte sequence
1432
1433                     // Long sequence, so unreserve our char.
1434                     charCount--;
1435
1436                     // bit 6 has to be non-zero for start of multibyte chars.
1437                     if ((ch & 0x40) == 0)
1438                     {
1439                         // Unexpected trail byte
1440                         goto InvalidByteSequence;
1441                     }
1442
1443                     // start a new long code
1444                     if ((ch & 0x20) != 0)
1445                     {
1446                         if ((ch & 0x10) != 0)
1447                         {
1448                             // 4 byte encoding - supplementary character (2 surrogates)
1449
1450                             ch &= 0x0F;
1451
1452                             // check that bit 3 is zero (lead byte must look like 11110vvv) and that the
1453                             // value cannot exceed the supplementary range (up to 0x10FFFF) at the same time
1454                             if (ch > 0x04)
1455                             {
                                     // restore the high nibble of the lead byte for the fallback
1456                                 ch |= 0xf0;
1457                                 goto InvalidByteSequence;
1458                             }
1459
1460                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1461                             // Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
1462                             ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
1463                                   (1 << 30) |           // If it dies on next byte we'll need an extra char
1464                                   (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
1465                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1466                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1467
1468                             // Our character count will be 2 characters for these 4 bytes, so subtract another char
1469                             charCount--;
1470                         }
1471                         else
1472                         {
1473                             // 3 byte encoding
1474                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1475                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1476                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1477
1478                             // We'll expect 1 character for these 3 bytes, so subtract another char.
1479                             charCount--;
1480                         }
1481                     }
1482                     else
1483                     {
1484                         // 2 byte encoding
1485
1486                         ch &= 0x1F;
1487
1488                         // check for non-shortest form
1489                         if (ch <= 1)
1490                         {
                                 // restore the high bits of the lead byte for the fallback
1491                             ch |= 0xc0;
1492                             goto InvalidByteSequence;
1493                         }
1494
1495                         // Add bit flags so we'll be flagged correctly
1496                         ch |= (FinalByte >> 6);
1497                     }
                         // The trailing bytes are picked up by the pending-sequence path of the slow loop
1498                     continue;
1499                 }
1500
1501             EncodeChar:
                     // Reached for an ASCII byte or a completed multibyte sequence. Nothing is written
                     // when counting, so fall straight into the fast loop (when it is compiled in).
1502
1503 #if FASTLOOP
1504                 int availableBytes = PtrDiff(pEnd, pSrc);
1505
1506                 // don't fall into the fast decoding loop if we don't have enough bytes
1507                 if (availableBytes <= 13)
1508                 {
1509                     // try to get over the remainder of the ascii characters fast though
1510                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
1511                     while (pSrc < pLocalEnd)
1512                     {
1513                         ch = *pSrc;
1514                         pSrc++;
1515
1516                         if (ch > 0x7F)
1517                             goto ProcessChar;
1518                     }
1519                     // we are done
1520                     ch = 0;
1521                     break;
1522                 }
1523
1524                 // pStop is placed 7 bytes short of the available input: that reserve lets the unrolled
1525                 // ASCII loop and the multibyte decoding below read ahead of pSrc without a range check
1526                 // on every access. (Unlike in GetChars, the boundary is not moved once computed.)
1527                 byte* pStop = pSrc + availableBytes - 7;
1528
1529                 while (pSrc < pStop)
1530                 {
1531                     ch = *pSrc;
1532                     pSrc++;
1533
1534                     if (ch > 0x7F)
1535                     {
1536                         goto LongCode;
1537                     }
1538
1539                     // get pSrc 2-byte aligned
1540                     if ((unchecked((int)pSrc) & 0x1) != 0)
1541                     {
1542                         ch = *pSrc;
1543                         pSrc++;
1544                         if (ch > 0x7F)
1545                         {
1546                             goto LongCode;
1547                         }
1548                     }
1549
1550                     // get pSrc 4-byte aligned
1551                     if ((unchecked((int)pSrc) & 0x2) != 0)
1552                     {
1553                         ch = *(ushort*)pSrc;
1554                         if ((ch & 0x8080) != 0)
1555                         {
1556                             goto LongCodeWithMask16;
1557                         }
1558                         pSrc += 2;
1559                     }
1560
1561                     // Test 8 bytes at a time (two 4-byte reads) for any non-ASCII byte; unrolled twice
1562                     while (pSrc < pStop)
1563                     {
1564                         ch = *(int*)pSrc;
1565                         int chb = *(int*)(pSrc + 4);
1566                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1567                         {
1568                             goto LongCodeWithMask32;
1569                         }
1570                         pSrc += 8;
1571
1572                         // This is a really small loop - unroll it
1573                         if (pSrc >= pStop)
1574                             break;
1575
1576                         ch = *(int*)pSrc;
1577                         chb = *(int*)(pSrc + 4);
1578                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
1579                         {
1580                             goto LongCodeWithMask32;
1581                         }
1582                         pSrc += 8;
1583                     }
1584                     break;
1585
                         // A multi-byte read contained a non-ASCII byte somewhere: pull out the byte at
                         // pSrc (its position inside the wider value depends on endianness), consume it,
                         // and keep scanning if that particular byte was ASCII.
1586 #if BIGENDIAN
1587                 LongCodeWithMask32:
1588                     // be careful about the sign extension
1589                     ch = (int)(((uint)ch) >> 16);
1590                 LongCodeWithMask16:
1591                     ch = (int)(((uint)ch) >> 8);
1592 #else // BIGENDIAN
1593                 LongCodeWithMask32:
1594                 LongCodeWithMask16:
1595                     ch &= 0xFF;
1596 #endif // BIGENDIAN
1597                     pSrc++;
1598                     if (ch <= 0x7F)
1599                     {
1600                         continue;
1601                     }
1602
1603                 LongCode:
                         // ch holds a byte > 0x7F and pSrc points just past it; chc receives the next byte
1604                     int chc = *pSrc;
1605                     pSrc++;
1606
1607                     if (
1608                         // bit 6 has to be non-zero for a lead byte (otherwise it is a stray trail byte)
1609                         (ch & 0x40) == 0 ||
1610                         // we are expecting to see trailing bytes like 10vvvvvv
1611                         (chc & unchecked((sbyte)0xC0)) != 0x80)
1612                     {
1613                         goto BadLongCode;
1614                     }
1615
1616                     chc &= 0x3F;
1617
1618                     // start a new long code
1619                     if ((ch & 0x20) != 0)
1620                     {
1621                         // fold the first two bytes together
1622                         chc |= (ch & 0x0F) << 6;
1623
1624                         if ((ch & 0x10) != 0)
1625                         {
1626                             // 4 byte encoding - supplementary character (surrogate pair)
1627                             ch = *pSrc;
1628                             if (
1629                                 // check that bit 3 of the lead byte is zero, the non-shortest form and
1630                                 // the valid supplementary range (up to 0x10FFFF) at the same time
1631                                 !InRange(chc >> 4, 0x01, 0x10) ||
1632                                 // we are expecting to see trailing bytes like 10vvvvvv
1633                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1634                             {
1635                                 goto BadLongCode;
1636                             }
1637
1638                             chc = (chc << 6) | (ch & 0x3F);
1639
1640                             ch = *(pSrc + 1);
1641                             // we are expecting to see trailing bytes like 10vvvvvv
1642                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
1643                             {
1644                                 goto BadLongCode;
1645                             }
1646                             pSrc += 2;
1647
1648                             // extra byte
1649                             charCount--;
1650                         }
1651                         else
1652                         {
1653                             // 3 byte encoding
1654                             ch = *pSrc;
1655                             if (
1656                                 // check for non-shortest form of 3 byte seq
1657                                 (chc & (0x1F << 5)) == 0 ||
1658                                 // Can't have surrogates here.
1659                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1660                                 // we are expecting to see trailing bytes like 10vvvvvv
1661                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
1662                             {
1663                                 goto BadLongCode;
1664                             }
1665                             pSrc++;
1666
1667                             // extra byte
1668                             charCount--;
1669                         }
1670                     }
1671                     else
1672                     {
1673                         // 2 byte encoding
1674
1675                         // check for non-shortest form
1676                         if ((ch & 0x1E) == 0)
1677                         {
1678                             goto BadLongCode;
1679                         }
1680                     }
1681
1682                     // extra byte
1683                     charCount--;
1684                 }
1685 #endif // FASTLOOP
1686
1687                 // no pending bits at this point
1688                 ch = 0;
1689                 continue;
1690
1691             BadLongCode:
                     // Every jump here happens while pSrc is exactly two bytes past the lead byte and before
                     // charCount was touched for this sequence. Rewind to the lead byte so that the slow
                     // loop decodes the sequence again and handles the error.
1692                 pSrc -= 2;
1693                 ch = 0;
1694                 continue;
1695             }
1696
1697             // Input ended in the middle of a sequence: may have a problem if we have to flush
1698             if (ch != 0)
1699             {
1700                 // We were already adjusting for these, so need to un-adjust
1701                 charCount += (ch >> 30);
1702                 if (baseDecoder == null || baseDecoder.MustFlush)
1703                 {
1704                     // Have to do fallback for invalid bytes
1705                     if (fallback == null)
1706                     {
1707                         if (baseDecoder == null)
1708                             fallback = this.decoderFallback.CreateFallbackBuffer();
1709                         else
1710                             fallback = baseDecoder.FallbackBuffer;
1711                         fallback.InternalInitialize(bytes, null);
1712                     }
1713                     charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1714                 }
1715             }
1716
1717             // Shouldn't have anything in fallback buffer for GetCharCount
1718             // (don't have to check _throwOnOverflow for count)
1719             Debug.Assert(fallback == null || fallback.Remaining == 0,
1720                 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1721
1722             return charCount;
1723         }
1724
1725         // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
1726         //           So if we're really broken, then that could also throw an error... recursively.
1727         //           So try to make sure GetChars can at least process all uses by
1728         //           System.Resources.ResourceReader!
1729         //
1730         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1731         //        If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
1732         //
1733         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1734         // kept the same as much as possible
1735         internal override unsafe int GetChars(byte* bytes, int byteCount,
1736                                                 char* chars, int charCount, DecoderNLS baseDecoder)
1737         {
1738             Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
1739             Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
1740             Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1741             Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
1742
1743             byte* pSrc = bytes;
1744             char* pTarget = chars;
1745
1746             byte* pEnd = pSrc + byteCount;
1747             char* pAllocatedBufferEnd = pTarget + charCount;
1748
1749             int ch = 0;
1750
1751             DecoderFallbackBuffer fallback = null;
1752             byte* pSrcForFallback;
1753             char* pTargetForFallback;
1754             if (baseDecoder != null)
1755             {
1756                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1757                 ch = decoder.bits;
1758
1759                 // Shouldn't have anything in fallback buffer for GetChars
1760                 // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty)
1761                 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1762                     "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1763             }
1764
1765             for (;;)
1766             {
1767                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1768
1769                 if (pSrc >= pEnd)
1770                 {
1771                     break;
1772                 }
1773
1774                 if (ch == 0)
1775                 {
1776                     // no pending bits
1777                     goto ReadChar;
1778                 }
1779
1780                 // read next byte. The JIT optimization seems to be getting confused when
1781                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1782                 int cha = *pSrc;
1783                 pSrc++;
1784
1785                 // we are expecting to see trailing bytes like 10vvvvvv
1786                 if ((cha & unchecked((sbyte)0xC0)) != 0x80)
1787                 {
1788                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1789                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1790                     pSrc--;
1791                     goto InvalidByteSequence;
1792                 }
1793
1794                 // fold in the new byte
1795                 ch = (ch << 6) | (cha & 0x3F);
1796
1797                 if ((ch & FinalByte) == 0)
1798                 {
1799                     // Not at last byte yet
1800                     Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1801                         "[UTF8Encoding.GetChars]Invariant volation");
1802
1803                     if ((ch & SupplimentarySeq) != 0)
1804                     {
1805                         // Its a 4-byte supplimentary sequence
1806                         if ((ch & (FinalByte >> 6)) != 0)
1807                         {
1808                             // this is 3rd byte of 4 byte sequence - nothing to do
1809                             continue;
1810                         }
1811
1812                         // 2nd byte of 4 bytes
1813                         // check for non-shortest form of surrogate and the valid surrogate
1814                         // range 0x000000 - 0x10FFFF at the same time
1815                         if (!InRange(ch & 0x1F0, 0x10, 0x100))
1816                         {
1817                             goto InvalidByteSequence;
1818                         }
1819                     }
1820                     else
1821                     {
1822                         // Must be 2nd byte of a 3-byte sequence
1823                         // check for non-shortest form of 3 byte seq
1824                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1825                             (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
1826                         {
1827                             goto InvalidByteSequence;
1828                         }
1829                     }
1830                     continue;
1831                 }
1832
1833                 // ready to punch
1834
1835                 // surrogate in shortest form?
1836                 // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1837                 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
1838                 {
1839                     // let the range check for the second char throw the exception
1840                     if (pTarget < pAllocatedBufferEnd)
1841                     {
1842                         *pTarget = (char)(((ch >> 10) & 0x7FF) +
1843                             unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
1844                         pTarget++;
1845
1846                         ch = (ch & 0x3FF) +
1847                             unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1848                     }
1849                 }
1850
1851                 goto EncodeChar;
1852
1853             InvalidByteSequence:
1854                 // this code fragment should be close to the gotos referencing it
1855                 // Have to do fallback for invalid bytes
1856                 if (fallback == null)
1857                 {
1858                     if (baseDecoder == null)
1859                         fallback = this.decoderFallback.CreateFallbackBuffer();
1860                     else
1861                         fallback = baseDecoder.FallbackBuffer;
1862                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1863                 }
1864                 // That'll back us up the appropriate # of bytes if we didn't get anywhere
1865                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
1866                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
1867                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
1868                 pSrc = pSrcForFallback;
1869                 pTarget = pTargetForFallback;
1870
1871                 if (!fallbackResult)
1872                 {
1873                     // Ran out of buffer space
1874                     // Need to throw an exception?
1875                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1876                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1877                     fallback.InternalReset();
1878                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1879                     ch = 0;
1880                     break;
1881                 }
1882                 Debug.Assert(pSrc >= bytes,
1883                     "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1884                 ch = 0;
1885                 continue;
1886
1887             ReadChar:
1888                 ch = *pSrc;
1889                 pSrc++;
1890
1891             ProcessChar:
1892                 if (ch > 0x7F)
1893                 {
1894                     // If its > 0x7F, its start of a new multi-byte sequence
1895
1896                     // bit 6 has to be non-zero
1897                     if ((ch & 0x40) == 0)
1898                     {
1899                         goto InvalidByteSequence;
1900                     }
1901
1902                     // start a new long code
1903                     if ((ch & 0x20) != 0)
1904                     {
1905                         if ((ch & 0x10) != 0)
1906                         {
1907                             // 4 byte encoding - supplimentary character (2 surrogates)
1908
1909                             ch &= 0x0F;
1910
1911                             // check that bit 4 is zero and the valid supplimentary character
1912                             // range 0x000000 - 0x10FFFF at the same time
1913                             if (ch > 0x04)
1914                             {
1915                                 ch |= 0xf0;
1916                                 goto InvalidByteSequence;
1917                             }
1918
1919                             ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1920                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1921                                 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1922                         }
1923                         else
1924                         {
1925                             // 3 byte encoding
1926                             ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1927                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1928                         }
1929                     }
1930                     else
1931                     {
1932                         // 2 byte encoding
1933
1934                         ch &= 0x1F;
1935
1936                         // check for non-shortest form
1937                         if (ch <= 1)
1938                         {
1939                             ch |= 0xc0;
1940                             goto InvalidByteSequence;
1941                         }
1942
1943                         ch |= (FinalByte >> 6);
1944                     }
1945                     continue;
1946                 }
1947
1948             EncodeChar:
1949                 // write the pending character
1950                 if (pTarget >= pAllocatedBufferEnd)
1951                 {
1952                     // Fix chars so we make sure to throw if we didn't output anything
1953                     ch &= 0x1fffff;
1954                     if (ch > 0x7f)
1955                     {
1956                         if (ch > 0x7ff)
1957                         {
1958                             if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1959                                 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1960                             {
1961                                 pSrc--;     // It was 4 bytes
1962                                 pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
1963                             }
1964                             else if (ch > 0xffff)
1965                             {
1966                                 pSrc--;     // It was 4 bytes, nothing was stored
1967                             }
1968                             pSrc--;         // It was at least 3 bytes
1969                         }
1970                         pSrc--;             // It was at least 2 bytes
1971                     }
1972                     pSrc--;
1973
1974                     // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1975                     // a 4 byte sequence already)
1976                     Debug.Assert(pSrc >= bytes || pTarget == chars,
1977                         "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1978                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1979
1980                     // Don't store ch in decoder, we already backed up to its start
1981                     ch = 0;
1982
1983                     // Didn't throw, just use this buffer size.
1984                     break;
1985                 }
1986                 *pTarget = (char)ch;
1987                 pTarget++;
1988
1989 #if FASTLOOP
1990                 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1991                 int availableBytes = PtrDiff(pEnd, pSrc);
1992
1993                 // don't fall into the fast decoding loop if we don't have enough bytes
1994                 // Test for availableChars is done because pStop would be <= pTarget.
1995                 if (availableBytes <= 13)
1996                 {
1997                     // we may need as many as 1 character per byte
1998                     if (availableChars < availableBytes)
1999                     {
2000                         // not enough output room.  no pending bits at this point
2001                         ch = 0;
2002                         continue;
2003                     }
2004
2005                     // try to get over the remainder of the ascii characters fast though
2006                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2007                     while (pSrc < pLocalEnd)
2008                     {
2009                         ch = *pSrc;
2010                         pSrc++;
2011
2012                         if (ch > 0x7F)
2013                             goto ProcessChar;
2014
2015                         *pTarget = (char)ch;
2016                         pTarget++;
2017                     }
2018                     // we are done
2019                     ch = 0;
2020                     break;
2021                 }
2022
2023                 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
2024                 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
2025                 if (availableChars < availableBytes)
2026                 {
2027                     availableBytes = availableChars;
2028                 }
2029
2030                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2031                 //  the boundary will be decreased for every non-ASCII character we encounter
2032                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
2033                 char* pStop = pTarget + availableBytes - 7;
2034
2035                 while (pTarget < pStop)
2036                 {
2037                     ch = *pSrc;
2038                     pSrc++;
2039
2040                     if (ch > 0x7F)
2041                     {
2042                         goto LongCode;
2043                     }
2044                     *pTarget = (char)ch;
2045                     pTarget++;
2046
2047                     // get pSrc to be 2-byte aligned
2048                     if ((unchecked((int)pSrc) & 0x1) != 0)
2049                     {
2050                         ch = *pSrc;
2051                         pSrc++;
2052                         if (ch > 0x7F)
2053                         {
2054                             goto LongCode;
2055                         }
2056                         *pTarget = (char)ch;
2057                         pTarget++;
2058                     }
2059
2060                     // get pSrc to be 4-byte aligned
2061                     if ((unchecked((int)pSrc) & 0x2) != 0)
2062                     {
2063                         ch = *(ushort*)pSrc;
2064                         if ((ch & 0x8080) != 0)
2065                         {
2066                             goto LongCodeWithMask16;
2067                         }
2068
2069                         // Unfortunately, this is endianess sensitive
2070 #if BIGENDIAN
2071                         *pTarget = (char)((ch >> 8) & 0x7F);
2072                         pSrc += 2;
2073                         *(pTarget+1) = (char)(ch & 0x7F);
2074                         pTarget += 2;
2075 #else // BIGENDIAN
2076                         *pTarget = (char)(ch & 0x7F);
2077                         pSrc += 2;
2078                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2079                         pTarget += 2;
2080 #endif // BIGENDIAN
2081                     }
2082
2083                     // Run 8 characters at a time!
2084                     while (pTarget < pStop)
2085                     {
2086                         ch = *(int*)pSrc;
2087                         int chb = *(int*)(pSrc + 4);
2088                         if (((ch | chb) & unchecked((int)0x80808080)) != 0)
2089                         {
2090                             goto LongCodeWithMask32;
2091                         }
2092
2093                         // Unfortunately, this is endianess sensitive
2094 #if BIGENDIAN
2095                         *pTarget = (char)((ch >> 24) & 0x7F);
2096                         *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2097                         *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2098                         *(pTarget+3) = (char)(ch & 0x7F);
2099                         pSrc += 8;
2100                         *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2101                         *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2102                         *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2103                         *(pTarget+7) = (char)(chb & 0x7F);
2104                         pTarget += 8;
2105 #else // BIGENDIAN
2106                         *pTarget = (char)(ch & 0x7F);
2107                         *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
2108                         *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
2109                         *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
2110                         pSrc += 8;
2111                         *(pTarget + 4) = (char)(chb & 0x7F);
2112                         *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
2113                         *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
2114                         *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
2115                         pTarget += 8;
2116 #endif // BIGENDIAN
2117                     }
2118                     break;
2119
2120 #if BIGENDIAN
2121                 LongCodeWithMask32:
2122                     // be careful about the sign extension
2123                     ch = (int)(((uint)ch) >> 16);
2124                 LongCodeWithMask16:
2125                     ch = (int)(((uint)ch) >> 8);
2126 #else // BIGENDIAN
2127                 LongCodeWithMask32:
2128                 LongCodeWithMask16:
2129                     ch &= 0xFF;
2130 #endif // BIGENDIAN
2131                     pSrc++;
2132                     if (ch <= 0x7F)
2133                     {
2134                         *pTarget = (char)ch;
2135                         pTarget++;
2136                         continue;
2137                     }
2138
2139                 LongCode:
2140                     int chc = *pSrc;
2141                     pSrc++;
2142
2143                     if (
2144                         // bit 6 has to be zero
2145                         (ch & 0x40) == 0 ||
2146                         // we are expecting to see trailing bytes like 10vvvvvv
2147                         (chc & unchecked((sbyte)0xC0)) != 0x80)
2148                     {
2149                         goto BadLongCode;
2150                     }
2151
2152                     chc &= 0x3F;
2153
2154                     // start a new long code
2155                     if ((ch & 0x20) != 0)
2156                     {
2157                         // fold the first two bytes together
2158                         chc |= (ch & 0x0F) << 6;
2159
2160                         if ((ch & 0x10) != 0)
2161                         {
2162                             // 4 byte encoding - surrogate
2163                             ch = *pSrc;
2164                             if (
2165                                 // check that bit 4 is zero, the non-shortest form of surrogate
2166                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2167                                 !InRange(chc >> 4, 0x01, 0x10) ||
2168                                 // we are expecting to see trailing bytes like 10vvvvvv
2169                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2170                             {
2171                                 goto BadLongCode;
2172                             }
2173
2174                             chc = (chc << 6) | (ch & 0x3F);
2175
2176                             ch = *(pSrc + 1);
2177                             // we are expecting to see trailing bytes like 10vvvvvv
2178                             if ((ch & unchecked((sbyte)0xC0)) != 0x80)
2179                             {
2180                                 goto BadLongCode;
2181                             }
2182                             pSrc += 2;
2183
2184                             ch = (chc << 6) | (ch & 0x3F);
2185
2186                             *pTarget = (char)(((ch >> 10) & 0x7FF) +
2187                                 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
2188                             pTarget++;
2189
2190                             ch = (ch & 0x3FF) +
2191                                 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2192
2193                             // extra byte, we're already planning 2 chars for 2 of these bytes,
2194                             // but the big loop is testing the target against pStop, so we need
2195                             // to subtract 2 more or we risk overrunning the input.  Subtract
2196                             // one here and one below.
2197                             pStop--;
2198                         }
2199                         else
2200                         {
2201                             // 3 byte encoding
2202                             ch = *pSrc;
2203                             if (
2204                                 // check for non-shortest form of 3 byte seq
2205                                 (chc & (0x1F << 5)) == 0 ||
2206                                 // Can't have surrogates here.
2207                                 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2208                                 // we are expecting to see trailing bytes like 10vvvvvv
2209                                 (ch & unchecked((sbyte)0xC0)) != 0x80)
2210                             {
2211                                 goto BadLongCode;
2212                             }
2213                             pSrc++;
2214
2215                             ch = (chc << 6) | (ch & 0x3F);
2216
2217                             // extra byte, we're only expecting 1 char for each of these 3 bytes,
2218                             // but the loop is testing the target (not source) against pStop, so
2219                             // we need to subtract 2 more or we risk overrunning the input.
2220                             // Subtract 1 here and one more below
2221                             pStop--;
2222                         }
2223                     }
2224                     else
2225                     {
2226                         // 2 byte encoding
2227
2228                         ch &= 0x1F;
2229
2230                         // check for non-shortest form
2231                         if (ch <= 1)
2232                         {
2233                             goto BadLongCode;
2234                         }
2235                         ch = (ch << 6) | chc;
2236                     }
2237
2238                     *pTarget = (char)ch;
2239                     pTarget++;
2240
2241                     // extra byte, we're only expecting 1 char for each of these 2 bytes,
2242                     // but the loop is testing the target (not source) against pStop.
2243                     // subtract an extra count from pStop so that we don't overrun the input.
2244                     pStop--;
2245                 }
2246 #endif // FASTLOOP
2247
2248                 Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2249
2250                 // no pending bits at this point
2251                 ch = 0;
2252                 continue;
2253
2254             BadLongCode:
2255                 pSrc -= 2;
2256                 ch = 0;
2257                 continue;
2258             }
2259
2260             if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2261             {
2262                 // Have to do fallback for invalid bytes
2263                 if (fallback == null)
2264                 {
2265                     if (baseDecoder == null)
2266                         fallback = this.decoderFallback.CreateFallbackBuffer();
2267                     else
2268                         fallback = baseDecoder.FallbackBuffer;
2269                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2270                 }
2271
2272                 // That'll back us up the appropriate # of bytes if we didn't get anywhere
2273                 pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
2274                 pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
2275                 bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
2276                 pSrc = pSrcForFallback;
2277                 pTarget = pTargetForFallback;
2278
2279                 if (!fallbackResult)
2280                 {
2281                     Debug.Assert(pSrc >= bytes || pTarget == chars,
2282                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2283
2284                     // Ran out of buffer space
2285                     // Need to throw an exception?
2286                     fallback.InternalReset();
2287                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
2288                 }
2289                 Debug.Assert(pSrc >= bytes,
2290                     "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2291                 ch = 0;
2292             }
2293
2294             if (baseDecoder != null)
2295             {
2296                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2297
2298                 // If we're storing flush data we expect all bits to be used or else
2299                 // we're stuck in the middle of a conversion
2300                 Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow,
2301                     "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2302
2303                 // Remember our leftover bits.
2304                 decoder.bits = ch;
2305
2306                 baseDecoder._bytesUsed = (int)(pSrc - bytes);
2307             }
2308
2309             // Shouldn't have anything in fallback buffer for GetChars
2310             // (don't have to check _throwOnOverflow for chars)
2311             Debug.Assert(fallback == null || fallback.Remaining == 0,
2312                 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2313
2314             return PtrDiff(pTarget, chars);
2315         }
2316
2317         // During GetChars we had an invalid byte sequence
2318         // pSrc is backed up to the start of the bad sequence if we didn't have room to
2319         // fall it back.  Otherwise pSrc remains where it is.
2320         private unsafe bool FallbackInvalidByteSequence(
2321             ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2322         {
2323             // Get our byte[]
2324             byte* pStart = pSrc;
2325             byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2326
2327             // Do the actual fallback
2328             if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2329             {
2330                 // Oops, it failed, back up to pStart
2331                 pSrc = pStart;
2332                 return false;
2333             }
2334
2335             // It worked
2336             return true;
2337         }
2338
2339         // During GetCharCount we had an invalid byte sequence
2340         // pSrc is used to find the index that points to the invalid bytes,
2341         // however the byte[] contains the fallback bytes (in case the index is -1)
2342         private unsafe int FallbackInvalidByteSequence(
2343             byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2344         {
2345             // Get our byte[]
2346             byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2347
2348             // Do the actual fallback
2349             int count = fallback.InternalFallback(bytesUnknown, pSrc);
2350
2351             // # of fallback chars expected.
2352             // Note that we only get here for "long" sequences, and have already unreserved
2353             // the count that we prereserved for the input bytes
2354             return count;
2355         }
2356
2357         // Note that some of these bytes may have come from a previous fallback, so we cannot
2358         // just decrement the pointer and use the values we read.  In those cases we have
2359         // to regenerate the original values.
2360         private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2361         {
2362             // Get our byte[]
2363             byte[] bytesUnknown = null;
2364
2365             // See if it was a plain char
2366             // (have to check >= 0 because we have all sorts of wierd bit flags)
2367             if (ch < 0x100 && ch >= 0)
2368             {
2369                 pSrc--;
2370                 bytesUnknown = new byte[] { unchecked((byte)ch) };
2371             }
2372             // See if its an unfinished 2 byte sequence
2373             else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
2374             {
2375                 pSrc--;
2376                 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2377             }
2378             // So now we're either 2nd byte of 3 or 4 byte sequence or
2379             // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2380             // 1st check if its a 4 byte sequence
2381             else if ((ch & SupplimentarySeq) != 0)
2382             {
2383                 //  3rd byte of 4 byte sequence?
2384                 if ((ch & (FinalByte >> 6)) != 0)
2385                 {
2386                     // 3rd byte of 4 byte sequence
2387                     pSrc -= 3;
2388                     bytesUnknown = new byte[] {
2389                         unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2390                         unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2391                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2392                 }
2393                 else if ((ch & (FinalByte >> 12)) != 0)
2394                 {
2395                     // 2nd byte of a 4 byte sequence
2396                     pSrc -= 2;
2397                     bytesUnknown = new byte[] {
2398                         unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2399                         unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2400                 }
2401                 else
2402                 {
2403                     // 4th byte of a 4 byte sequence
2404                     pSrc--;
2405                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
2406                 }
2407             }
2408             else
2409             {
2410                 // 2nd byte of 3 byte sequence?
2411                 if ((ch & (FinalByte >> 6)) != 0)
2412                 {
2413                     // So its 2nd byte of a 3 byte sequence
2414                     pSrc -= 2;
2415                     bytesUnknown = new byte[] {
2416                         unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2417                 }
2418                 else
2419                 {
2420                     // 1st byte of a 3 byte sequence
2421                     pSrc--;
2422                     bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
2423                 }
2424             }
2425
2426             return bytesUnknown;
2427         }
2428
2429
2430         public override Decoder GetDecoder()
2431         {
2432             return new UTF8Decoder(this);
2433         }
2434
2435
2436         public override Encoder GetEncoder()
2437         {
2438             return new UTF8Encoder(this);
2439         }
2440
2441
2442         public override int GetMaxByteCount(int charCount)
2443         {
2444             if (charCount < 0)
2445                 throw new ArgumentOutOfRangeException(nameof(charCount),
2446                      SR.ArgumentOutOfRange_NeedNonNegNum);
2447             Contract.EndContractBlock();
2448
2449             // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
2450             long byteCount = (long)charCount + 1;
2451
2452             if (EncoderFallback.MaxCharCount > 1)
2453                 byteCount *= EncoderFallback.MaxCharCount;
2454
2455             // Max 3 bytes per char.  (4 bytes per 2 chars for surrogates)
2456             byteCount *= 3;
2457
2458             if (byteCount > 0x7fffffff)
2459                 throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
2460
2461             return (int)byteCount;
2462         }
2463
2464
2465         public override int GetMaxCharCount(int byteCount)
2466         {
2467             if (byteCount < 0)
2468                 throw new ArgumentOutOfRangeException(nameof(byteCount),
2469                      SR.ArgumentOutOfRange_NeedNonNegNum);
2470             Contract.EndContractBlock();
2471
2472             // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
2473             long charCount = ((long)byteCount + 1);
2474
2475             // Non-shortest form would fall back, so get max count from fallback.
2476             // So would 11... followed by 11..., so you could fall back every byte
2477             if (DecoderFallback.MaxCharCount > 1)
2478             {
2479                 charCount *= DecoderFallback.MaxCharCount;
2480             }
2481
2482             if (charCount > 0x7fffffff)
2483                 throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
2484
2485             return (int)charCount;
2486         }
2487
2488
2489         public override byte[] GetPreamble()
2490         {
2491             if (_emitUTF8Identifier)
2492             {
2493                 // Allocate new array to prevent users from modifying it.
2494                 return new byte[3] { 0xEF, 0xBB, 0xBF };
2495             }
2496             else
2497                 return Array.Empty<byte>();
2498         }
2499
2500
2501         public override bool Equals(Object value)
2502         {
2503             UTF8Encoding that = value as UTF8Encoding;
2504             if (that != null)
2505             {
2506                 return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
2507                        (EncoderFallback.Equals(that.EncoderFallback)) &&
2508                        (DecoderFallback.Equals(that.DecoderFallback));
2509             }
2510             return (false);
2511         }
2512
2513
2514         public override int GetHashCode()
2515         {
2516             //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
2517             return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
2518                    UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
2519         }
2520
2521         private sealed class UTF8Encoder : EncoderNLS
2522         {
2523             // We must save a high surrogate value until the next call, looking
2524             // for a low surrogate value.
2525             internal int surrogateChar;
2526
2527             public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2528             {
2529                 // base calls reset
2530             }
2531
2532             public override void Reset()
2533
2534             {
2535                 this.surrogateChar = 0;
2536                 if (_fallbackBuffer != null)
2537                     _fallbackBuffer.Reset();
2538             }
2539
2540             // Anything left in our encoder?
2541             internal override bool HasState
2542             {
2543                 get
2544                 {
2545                     return (this.surrogateChar != 0);
2546                 }
2547             }
2548         }
2549
2550         private sealed class UTF8Decoder : DecoderNLS
2551         {
2552             // We'll need to remember the previous information. See the comments around definition
2553             // of FinalByte for details.
2554             internal int bits;
2555
2556             public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2557             {
2558                 // base calls reset
2559             }
2560
2561             public override void Reset()
2562             {
2563                 this.bits = 0;
2564                 if (_fallbackBuffer != null)
2565                     _fallbackBuffer.Reset();
2566             }
2567
2568             // Anything left in our decoder?
2569             internal override bool HasState
2570             {
2571                 get
2572                 {
2573                     return (this.bits != 0);
2574                 }
2575             }
2576         }
2577     }
2578 }