1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
10 using System.Globalization;
11 using System.Diagnostics;
12 using System.Diagnostics.Contracts;
16 public class UnicodeEncoding : Encoding
18 // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization
19 // The initialization code will not be run until a static member of the class is referenced
20 internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
21 internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
23 internal bool isThrowException = false;
25 internal bool bigEndian = false;
26 internal bool byteOrderMark = true;
28 // Unicode version 2.0 character size in bytes
29 public const int CharSize = 2;
// Creates an encoding with the same settings the bigEndian/byteOrderMark
// fields default to: little-endian, byte order mark enabled. Invalid data
// is handled by the replacement fallback rather than by throwing.
public UnicodeEncoding()
    : this(false, true)
{
}
// Creates an encoding with the requested byte order and byte-order-mark
// setting. throwOnInvalidBytes is passed as false, so the replacement
// fallbacks stay in effect.
public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
    : this(bigEndian, byteOrderMark, false)
{
}
// Creates an encoding with the requested byte order, byte-order-mark setting
// and error behavior. The base Encoding is constructed with code page 1201
// when bigEndian is true and 1200 otherwise.
public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
    : base(bigEndian ? 1201 : 1200)  //Set the data item.
{
    this.isThrowException = throwOnInvalidBytes;
    this.bigEndian = bigEndian;
    this.byteOrderMark = byteOrderMark;

    // Encoding constructor already did this, but it'll be wrong if we're throwing exceptions
    // (isThrowException had not been assigned yet when the base constructor ran).
    if (this.isThrowException)
        SetDefaultFallbacks();
}
// Selects the encoder/decoder fallbacks from isThrowException:
// exception fallbacks when it is set, otherwise replacement fallbacks
// that substitute U+FFFD for invalid data.
internal override void SetDefaultFallbacks()
{
    if (this.isThrowException)
    {
        this.encoderFallback = EncoderFallback.ExceptionFallback;
        this.decoderFallback = DecoderFallback.ExceptionFallback;
    }
    else
    {
        // Without this else the replacement fallbacks would always overwrite
        // the exception fallbacks chosen above.
        this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
        this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
    }
}
71 // The following methods are copied from EncodingNLS.cs.
72 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
73 // These should be kept in sync for the following classes:
74 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Returns the number of bytes required to encode a range of characters in a character array.
80 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
81 // So if you fix this, fix the others. Currently those include:
82 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
83 // parent method is safe
// Returns the number of bytes needed to encode count chars of chars starting
// at index. Validates the arguments, short-circuits empty input, then pins
// the array and defers to the pointer-based overload with a null encoder.
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when index/count are negative or exceed the array.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    // Validate input parameters
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (chars.Length - index < count)
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // If no input, return 0, avoid fixed empty array problem
    if (count == 0)
        return 0;

    // Just call the pointer version
    fixed (char* pChars = chars)
        return GetByteCount(pChars + index, count, null);
}
107 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
108 // So if you fix this, fix the others. Currently those include:
109 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
110 // parent method is safe
// Returns the number of bytes needed to encode the whole string s.
// Throws ArgumentNullException when s is null; otherwise pins the string
// and defers to the pointer-based overload with a null encoder.
public override unsafe int GetByteCount(String s)
{
    // Validate input
    if (s == null)
        throw new ArgumentNullException(nameof(s));
    Contract.EndContractBlock();

    fixed (char* pChars = s)
        return GetByteCount(pChars, s.Length, null);
}
123 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
124 // So if you fix this, fix the others. Currently those include:
125 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Returns the number of bytes needed to encode count chars starting at chars.
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when count is negative.
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    // Validate Parameters
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Call it with empty encoder
    return GetByteCount(chars, count, null);
}
142 // Parent method is safe.
143 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
144 // So if you fix this, fix the others. Currently those include:
145 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars of s starting at charIndex into bytes starting at
// byteIndex and returns the value produced by the pointer-based overload.
// Throws ArgumentNullException when s or bytes is null and
// ArgumentOutOfRangeException when an index/count is out of range.
public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                      byte[] bytes, int byteIndex)
{
    if (s == null || bytes == null)
        throw new ArgumentNullException((s == null ? nameof(s) : nameof(bytes)), SR.ArgumentNull_Array);

    if (charIndex < 0 || charCount < 0)
        throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (s.Length - charIndex < charCount)
        throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);

    if (byteIndex < 0 || byteIndex > bytes.Length)
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // Space available for output; computed before bytes may be swapped below.
    int byteCount = bytes.Length - byteIndex;

    // Fixed doesn't like 0 length arrays: &bytes[0] would be out of range,
    // so substitute a one-element dummy. byteCount stays 0 in that case.
    if (bytes.Length == 0)
        bytes = new byte[1];

    fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
        return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
}
173 // Encodes a range of characters in a character array into a range of bytes
174 // in a byte array. An exception occurs if the byte array is not large
175 // enough to hold the complete encoding of the characters. The
176 // GetByteCount method can be used to determine the exact number of
177 // bytes that will be produced for a given range of characters.
178 // Alternatively, the GetMaxByteCount method can be used to
179 // determine the maximum number of bytes that will be produced for a given
180 // number of characters, regardless of the actual character values.
182 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
183 // So if you fix this, fix the others. Currently those include:
184 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
185 // parent method is safe
// Encodes charCount chars of chars starting at charIndex into bytes starting
// at byteIndex and returns the value produced by the pointer-based overload
// (0 when charCount is 0).
// Throws ArgumentNullException when chars or bytes is null and
// ArgumentOutOfRangeException when an index/count is out of range.
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                       byte[] bytes, int byteIndex)
{
    // Validate parameters
    if (chars == null || bytes == null)
        throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array);

    if (charIndex < 0 || charCount < 0)
        throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (chars.Length - charIndex < charCount)
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);

    if (byteIndex < 0 || byteIndex > bytes.Length)
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // If nothing to encode return 0, avoid fixed problem
    if (charCount == 0)
        return 0;

    // Just call pointer version
    int byteCount = bytes.Length - byteIndex;

    // Fixed doesn't like 0 length arrays: &bytes[0] would be out of range,
    // so substitute a one-element dummy. byteCount stays 0 in that case.
    if (bytes.Length == 0)
        bytes = new byte[1];

    fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
        // Remember that byteCount is # to decode, not size of array.
        return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
}
220 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
221 // So if you fix this, fix the others. Currently those include:
222 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars at chars into the byteCount-byte buffer at bytes by
// deferring to the encoder-aware overload with a null encoder.
// Throws ArgumentNullException when either pointer is null and
// ArgumentOutOfRangeException when either count is negative.
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    // Validate Parameters
    if (bytes == null || chars == null)
        throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);

    if (charCount < 0 || byteCount < 0)
        throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    return GetBytes(chars, charCount, bytes, byteCount, null);
}
// Returns the number of characters produced by decoding a range of bytes in a byte array.
241 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
242 // So if you fix this, fix the others. Currently those include:
243 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
244 // parent method is safe
// Returns the number of chars produced by decoding count bytes of bytes
// starting at index. Validates the arguments, short-circuits empty input,
// then pins the array and defers to the pointer-based overload with a null decoder.
// Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when index/count are negative or exceed the array.
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    // Validate Parameters
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (bytes.Length - index < count)
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // If no input just return 0, fixed doesn't like 0 length arrays
    if (count == 0)
        return 0;

    // Just call pointer version
    fixed (byte* pBytes = bytes)
        return GetCharCount(pBytes + index, count, null);
}
268 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
269 // So if you fix this, fix the others. Currently those include:
270 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Returns the number of chars produced by decoding count bytes starting at bytes.
// Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when count is negative.
[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    // Validate Parameters
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    return GetCharCount(bytes, count, null);
}
286 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
287 // So if you fix this, fix the others. Currently those include:
288 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
289 // parent method is safe
// Decodes byteCount bytes of bytes starting at byteIndex into chars starting
// at charIndex and returns the value produced by the pointer-based overload
// (0 when byteCount is 0).
// Throws ArgumentNullException when bytes or chars is null and
// ArgumentOutOfRangeException when an index/count is out of range.
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                       char[] chars, int charIndex)
{
    // Validate Parameters
    if (bytes == null || chars == null)
        throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);

    if (byteIndex < 0 || byteCount < 0)
        throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (bytes.Length - byteIndex < byteCount)
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);

    if (charIndex < 0 || charIndex > chars.Length)
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // If no input, return 0 & avoid fixed problem
    if (byteCount == 0)
        return 0;

    // Just call pointer version
    int charCount = chars.Length - charIndex;

    // Fixed doesn't like 0 length arrays: &chars[0] would be out of range,
    // so substitute a one-element dummy. charCount stays 0 in that case.
    if (chars.Length == 0)
        chars = new char[1];

    fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
        // Remember that charCount is # to decode, not size of array
        return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
}
324 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
325 // So if you fix this, fix the others. Currently those include:
326 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Decodes byteCount bytes at bytes into the charCount-char buffer at chars by
// deferring to the decoder-aware overload with a null decoder.
// Throws ArgumentNullException when either pointer is null and
// ArgumentOutOfRangeException when either count is negative.
[CLSCompliant(false)]
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    // Validate Parameters
    if (bytes == null || chars == null)
        throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);

    if (charCount < 0 || byteCount < 0)
        throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    return GetChars(bytes, byteCount, chars, charCount, null);
}
342 // Returns a string containing the decoded representation of a range of
343 // bytes in a byte array.
345 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
346 // So if you fix this, fix the others. Currently those include:
347 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
348 // parent method is safe
// Decodes count bytes of bytes starting at index into a string using this
// encoding. Returns String.Empty when count is 0.
// Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when index/count are negative or exceed the array.
public override unsafe string GetString(byte[] bytes, int index, int count)
{
    // Validate Parameters
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (bytes.Length - index < count)
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // Avoid problems with empty input buffer
    if (count == 0) return String.Empty;

    fixed (byte* pBytes = bytes)
        return String.CreateStringFromEncoding(
            pBytes + index, count, this);
}
372 // End of standard methods copied from EncodingNLS.cs
// Counts the bytes needed to encode count chars starting at chars, taking into
// account a high surrogate left over in encoder (encoder may be null).
//
// The count starts at 2 bytes per input char. Every char that turns out to be an
// unpaired surrogate has its 2 bytes taken back and is handed to the encoder
// fallback; every char the fallback buffer then yields adds 2 bytes.
//
// Throws ArgumentOutOfRangeException when count * 2 overflows, and ArgumentException
// when the encoder's fallback buffer is not empty on entry or when a trailing high
// surrogate is still left over after already having been sent through the fallback once.
internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
{
    Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
    Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");

    // Start by assuming each char gets 2 bytes
    int byteCount = count << 1;

    // Check for overflow in byteCount
    // (If they were all invalid chars, this would actually be wrong,
    // but that's a ridiculously large # so we're not concerned about that case)
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);

    char* charStart = chars;
    char* charEnd = chars + count;
    char charLeftOver = (char)0;

    // Set once the trailing high surrogate has been pushed through the fallback,
    // so a second occurrence is reported instead of looping forever.
    bool wasHereBefore = false;

    // Need -1 to check 2 at a time.  If we have an even #, longChars will go
    // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longChars
    // will go from longEnd - 1 long to longEnd. (Might not get to use this)
    ulong* longEnd = (ulong*)(charEnd - 3);

    // For fallback we may need a fallback buffer
    EncoderFallbackBuffer fallbackBuffer = null;
    char* charsForFallback;

    if (encoder != null)
    {
        charLeftOver = encoder._charLeftOver;

        // Assume extra bytes to encode charLeftOver if it existed
        if (charLeftOver > 0)
            byteCount += 2;

        // We mustn't have left over fallback data when counting
        if (encoder.InternalHasFallbackBuffer)
        {
            fallbackBuffer = encoder.FallbackBuffer;
            if (fallbackBuffer.Remaining > 0)
                throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
        }
    }

    char ch;
TryAgain:

    while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
    {
        // First unwind any fallback
        if (ch == 0)
        {
            // No fallback, maybe we can do it fast
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN       // If endianness is backwards then each pair of bytes would be backwards.
            if (bigEndian &&
#else
            if (!bigEndian &&
#endif // BIGENDIAN

#if BIT64           // 64 bit CPU needs to be long aligned for this to work.
                charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
#else
                charLeftOver == 0 && (unchecked((int)chars) & 3) == 0)
#endif
            {
                // Need new char* so we can check 4 at a time
                ulong* longChars = (ulong*)chars;

                while (longChars < longEnd)
                {
                    // See if we potentially have surrogates (0x8000 bit set)
                    // (We're either big endian on a big endian machine or little endian on
                    // a little endian machine so that'll work)
                    if ((0x8000800080008000 & *longChars) != 0)
                    {
                        // See if any of these are high or low surrogates (0xd800 - 0xdfff).  If the high
                        // 5 bits looks like 11011, then it's a high or low surrogate.
                        // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
                        // Note that we expect BMP characters to be more common than surrogates
                        // & each char with 11111... then ^ with 11011.  Zeroes then indicate surrogates
                        ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;

                        // Check each of the 4 chars.  0 for those 16 bits means it was a surrogate
                        // but no clue if they're high or low.
                        // If each of the 4 characters are non-zero, then none are surrogates.
                        if ((uTemp & 0xFFFF000000000000) == 0 ||
                            (uTemp & 0x0000FFFF00000000) == 0 ||
                            (uTemp & 0x00000000FFFF0000) == 0 ||
                            (uTemp & 0x000000000000FFFF) == 0)
                        {
                            // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
                            // or if there's 1 or 4 surrogates

                            // If they happen to be high/low/high/low, we may as well continue.  Check the next
                            // bit to see if it's set (low) or not (high) in the right pattern
#if BIGENDIAN
                            if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
#else
                            if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
#endif
                            {
                                // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.

                                // Drop out to the slow loop to resolve the surrogates
                                break;
                            }
                            // else they are all surrogates in High/Low/High/Low order, so we can use them.
                        }
                        // else none are surrogates, so we can use them.
                    }
                    // else all < 0x8000 so we can use them

                    // We already counted these four chars, go to next long.
                    longChars++;
                }

                chars = (char*)longChars;

                if (chars >= charEnd)
                    break;
            }
#endif // !NO_FAST_UNICODE_LOOP

            // No fallback, just get next char
            ch = *chars;
            chars++;
        }
        else
        {
            // We weren't preallocating fallback space.
            byteCount += 2;
        }

        // Check for high or low surrogates
        if (ch >= 0xd800 && ch <= 0xdfff)
        {
            // Was it a high surrogate?
            if (ch <= 0xdbff)
            {
                // It's a high surrogate, if we already had a high surrogate do its fallback
                if (charLeftOver > 0)
                {
                    // Unwind the current character, this should be safe because we
                    // don't have leftover data in the fallback, so chars must have
                    // advanced already.
                    Debug.Assert(chars > charStart,
                        "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
                    chars--;

                    // If previous high surrogate deallocate 2 bytes
                    byteCount -= 2;

                    // Fallback the previous surrogate
                    // Need to initialize fallback buffer?
                    if (fallbackBuffer == null)
                    {
                        if (encoder == null)
                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = encoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
                        fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                    }

                    charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                    fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                    chars = charsForFallback;

                    // Now no high surrogate left over
                    charLeftOver = (char)0;
                    continue;
                }

                // Remember this high surrogate
                charLeftOver = ch;
                continue;
            }

            // It's a low surrogate
            if (charLeftOver == 0)
            {
                // Expected a previous high surrogate.
                // Don't count this one (we'll count its fallback if necessary)
                byteCount -= 2;

                // fallback this one
                // Need to initialize fallback buffer?
                if (fallbackBuffer == null)
                {
                    if (encoder == null)
                        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = encoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                }

                charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                chars = charsForFallback;
                continue;
            }

            // Valid surrogate pair, add our charLeftOver
            charLeftOver = (char)0;
            continue;
        }
        else if (charLeftOver > 0)
        {
            // Expected a low surrogate, but this char is normal

            // Rewind the current character, fallback previous character.
            // this should be safe because we don't have leftover data in the
            // fallback, so chars must have advanced already.
            Debug.Assert(chars > charStart,
                "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
            chars--;

            // fallback previous chars
            // Need to initialize fallback buffer?
            if (fallbackBuffer == null)
            {
                if (encoder == null)
                    fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = encoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
            }

            charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
            fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
            chars = charsForFallback;

            // Ignore charLeftOver or throw
            byteCount -= 2;
            charLeftOver = (char)0;

            continue;
        }

        // Ok we had something to add (already counted)
    }

    // Don't allocate space for left over char
    if (charLeftOver > 0)
    {
        byteCount -= 2;

        // If we have to flush, stick it in fallback and try again
        if (encoder == null || encoder.MustFlush)
        {
            if (wasHereBefore)
            {
                // Throw it, using our complete character
                throw new ArgumentException(
                    SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
            }
            else
            {
                // Need to initialize fallback buffer?
                if (fallbackBuffer == null)
                {
                    if (encoder == null)
                        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = encoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                }

                charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                chars = charsForFallback;
                charLeftOver = (char)0;
                wasHereBefore = true;
                goto TryAgain;
            }
        }
    }

    // Shouldn't have anything in fallback buffer for GetByteCount
    // (don't have to check _throwOnOverflow for count)
    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
        "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");

    // Don't remember fallbackBuffer.encoder for counting
    return byteCount;
}
674 internal override unsafe int GetBytes(char* chars, int charCount,
675 byte* bytes, int byteCount, EncoderNLS encoder)
677 Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
678 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
679 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
680 Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");
682 char charLeftOver = (char)0;
684 bool wasHereBefore = false;
687 byte* byteEnd = bytes + byteCount;
688 char* charEnd = chars + charCount;
689 byte* byteStart = bytes;
690 char* charStart = chars;
692 // For fallback we may need a fallback buffer
693 EncoderFallbackBuffer fallbackBuffer = null;
694 char* charsForFallback;
696 // Get our encoder, but don't clear it yet.
699 charLeftOver = encoder._charLeftOver;
701 // We mustn't have left over fallback data when counting
702 if (encoder.InternalHasFallbackBuffer)
704 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
705 fallbackBuffer = encoder.FallbackBuffer;
706 if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
707 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
709 // Set our internal fallback interesting things.
710 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
715 while (((ch = (fallbackBuffer == null) ?
716 (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) ||
719 // First unwind any fallback
722 // No fallback, maybe we can do it fast
723 #if !NO_FAST_UNICODE_LOOP
724 #if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards.
729 #if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned
730 (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
732 (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
736 // Need -1 to check 2 at a time. If we have an even #, longChars will go
737 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
738 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
739 // We can only go iCount units (limited by shorter of char or byte buffers.
740 ulong* longEnd = (ulong*)(chars - 3 +
741 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
742 (byteEnd - bytes) >> 1 : charEnd - chars));
744 // Need new char* so we can check 4 at a time
745 ulong* longChars = (ulong*)chars;
746 ulong* longBytes = (ulong*)bytes;
748 while (longChars < longEnd)
750 // See if we potentially have surrogates (0x8000 bit set)
751 // (We're either big endian on a big endian machine or little endian on
752 // a little endian machine so that'll work)
753 if ((0x8000800080008000 & *longChars) != 0)
755 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
756 // 5 bits looks like 11011, then its a high or low surrogate.
757 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
758 // Note that we expect BMP characters to be more common than surrogates
759 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
760 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
762 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
763 // but no clue if they're high or low.
764 // If each of the 4 characters are non-zero, then none are surrogates.
765 if ((uTemp & 0xFFFF000000000000) == 0 ||
766 (uTemp & 0x0000FFFF00000000) == 0 ||
767 (uTemp & 0x00000000FFFF0000) == 0 ||
768 (uTemp & 0x000000000000FFFF) == 0)
770 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
771 // or if there's 1 or 4 surrogates
773 // If they happen to be high/low/high/low, we may as well continue. Check the next
774 // bit to see if its set (low) or not (high) in the right pattern
776 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
778 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
781 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
782 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
784 // Drop out to the slow loop to resolve the surrogates
787 // else they are all surrogates in High/Low/High/Low order, so we can use them.
789 // else none are surrogates, so we can use them.
791 // else all < 0x8000 so we can use them
793 // We can use these 4 chars.
794 *longBytes = *longChars;
799 chars = (char*)longChars;
800 bytes = (byte*)longBytes;
802 if (chars >= charEnd)
805 // Not aligned, but maybe we can still be somewhat faster
806 // Also somehow this optimizes the above loop? It seems to cause something above
807 // to get enregistered, but I haven't figured out how to make that happen without this loop.
808 else if ((charLeftOver == 0) &&
816 (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time
818 (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time
820 (unchecked((int)(bytes)) & 1) == 0)
823 long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ?
824 (byteEnd - bytes) >> 1 : charEnd - chars;
827 char* charOut = ((char*)bytes); // a char* for our output
828 char* tempEnd = chars + iCount - 1; // Our end pointer
830 while (chars < tempEnd)
832 if (*chars >= (char)0xd800 && *chars <= (char)0xdfff)
834 // break for fallback for low surrogate
835 if (*chars >= 0xdc00)
838 // break if next one's not a low surrogate (will do fallback)
839 if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff)
842 // They both exist, use them
844 // If 2nd char is surrogate & this one isn't then only add one
845 else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff)
854 *(charOut + 1) = *(chars + 1);
859 bytes = (byte*)charOut;
861 if (chars >= charEnd)
864 #endif // !NO_FAST_UNICODE_LOOP
866 // No fallback, just get next char
871 // Check for high or low surrogates
872 if (ch >= 0xd800 && ch <= 0xdfff)
874 // Was it a high surrogate?
877 // Its a high surrogate, see if we already had a high surrogate
878 if (charLeftOver > 0)
880 // Unwind the current character, this should be safe because we
881 // don't have leftover data in the fallback, so chars must have
883 Debug.Assert(chars > charStart,
884 "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
887 // Fallback the previous surrogate
888 // Might need to create our fallback buffer
889 if (fallbackBuffer == null)
892 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
894 fallbackBuffer = encoder.FallbackBuffer;
896 // Set our internal fallback interesting things.
897 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
900 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
901 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
902 chars = charsForFallback;
904 charLeftOver = (char)0;
908 // Remember this high surrogate
913 // Its a low surrogate
914 if (charLeftOver == 0)
916 // We'll fall back this one
917 // Might need to create our fallback buffer
918 if (fallbackBuffer == null)
921 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
923 fallbackBuffer = encoder.FallbackBuffer;
925 // Set our internal fallback interesting things.
926 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
929 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
930 fallbackBuffer.InternalFallback(ch, ref charsForFallback);
931 chars = charsForFallback;
935 // Valid surrogate pair, add our charLeftOver
936 if (bytes + 3 >= byteEnd)
938 // Not enough room to add this surrogate pair
939 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
941 // These must have both been from the fallbacks.
942 // Both of these MUST have been from a fallback because if the 1st wasn't
943 // from a fallback, then a high surrogate followed by an illegal char
944 // would've caused the high surrogate to fall back. If a high surrogate
945 // fell back, then it was consumed and both chars came from the fallback.
946 fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate
947 fallbackBuffer.MovePrevious();
951 // If we don't have enough room, then either we should've advanced a while
952 // or we should have bytes==byteStart and throw below
953 Debug.Assert(chars > charStart + 1 || bytes == byteStart,
954 "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
955 chars -= 2; // Didn't use either surrogate
957 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
958 charLeftOver = (char)0; // we'll retry it later
959 break; // Didn't throw, but stop 'til next time.
964 *(bytes++) = (byte)(charLeftOver >> 8);
965 *(bytes++) = (byte)charLeftOver;
969 *(bytes++) = (byte)charLeftOver;
970 *(bytes++) = (byte)(charLeftOver >> 8);
973 charLeftOver = (char)0;
975 else if (charLeftOver > 0)
977 // Expected a low surrogate, but this char is normal
979 // Rewind the current character, fallback previous character.
980 // this should be safe because we don't have leftover data in the
981 // fallback, so chars must have advanced already.
982 Debug.Assert(chars > charStart,
983 "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
986 // fallback previous chars
987 // Might need to create our fallback buffer
988 if (fallbackBuffer == null)
991 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
993 fallbackBuffer = encoder.FallbackBuffer;
995 // Set our internal fallback interesting things.
996 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
999 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1000 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
1001 chars = charsForFallback;
1003 // Ignore charLeftOver or throw
1004 charLeftOver = (char)0;
1008 // Ok, we have a char to add
1009 if (bytes + 1 >= byteEnd)
1011 // Couldn't add this char
1012 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1013 fallbackBuffer.MovePrevious(); // Not using this fallback char
1016 // Lonely charLeftOver (from previous call) would've been caught up above,
1017 // so this must be a case where we've already read an input char.
1018 Debug.Assert(chars > charStart,
1019 "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
1020 chars--; // Not using this char
1022 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
1023 break; // didn't throw, just stop
1028 *(bytes++) = (byte)(ch >> 8);
1029 *(bytes++) = (byte)ch;
1033 *(bytes++) = (byte)ch;
1034 *(bytes++) = (byte)(ch >> 8);
1038 // Don't allocate space for left over char
1039 if (charLeftOver > 0)
1041 // If we aren't flushing we need to fall this back
1042 if (encoder == null || encoder.MustFlush)
1046 // Throw it, using our complete character
1047 throw new ArgumentException(
1048 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
1052 // If we have to flush, stick it in fallback and try again
1053 // Might need to create our fallback buffer
1054 if (fallbackBuffer == null)
1056 if (encoder == null)
1057 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
1059 fallbackBuffer = encoder.FallbackBuffer;
1061 // Set our internal fallback interesting things.
1062 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
1065 // If we're not flushing, that'll remember the left over character.
1066 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1067 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
1068 chars = charsForFallback;
1070 charLeftOver = (char)0;
1071 wasHereBefore = true;
1077 // Not flushing, remember it in the encoder
1078 if (encoder != null)
1080 encoder._charLeftOver = charLeftOver;
1081 encoder._charsUsed = (int)(chars - charStart);
1084 // Remember charLeftOver if we must, or clear it if we're flushing
1085 // (charLeftOver should be 0 if we're flushing)
1086 Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0,
1087 "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");
1089 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1090 encoder == null || !encoder._throwOnOverflow,
1091 "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");
1093 // We used to copy it fast, but this doesn't check for surrogates
1094 // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount);
1096 return (int)(bytes - byteStart);
// Counts the chars that decoding `count` bytes starting at `bytes` would yield; there is
// no output buffer here, only a running total.
//   bytes       - first input byte (asserted non-null).
//   count       - number of input bytes (asserted non-negative).
//   baseDecoder - optional decoder whose lastByte/lastChar state is picked up; it is cast to
//                 UnicodeEncoding.Decoder and may be null.
// The total starts at count / 2 and is adjusted while scanning: each invalid sequence adds
// whatever its fallback reports through InternalFallback.
1099 internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1101 Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
1102 Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");
1104 UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;
1106 byte* byteEnd = bytes + count;
1107 byte* byteStart = bytes;
1111 char lastChar = (char)0;
1113 // Start by assuming same # of chars as bytes
1114 int charCount = count >> 1;
1116 // Need -1 to check 2 at a time. If we have an even #, longBytes will go
1117 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes
1118 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
// Stopping 7 bytes short of byteEnd keeps every 8-byte read at a pointer below longEnd
// inside the input buffer.
1119 ulong* longEnd = (ulong*)(byteEnd - 7);
1121 // For fallback we may need a fallback buffer
1122 DecoderFallbackBuffer fallbackBuffer = null;
1124 if (decoder != null)
// Pick up the odd byte and/or surrogate char stored in the decoder.
1126 lastByte = decoder.lastByte;
1127 lastChar = decoder.lastChar;
1129 // Assume extra char if last char was around
1133 // Assume extra char if extra last byte makes up odd # of input bytes
1134 if (lastByte >= 0 && (count & 1) == 1)
1139 // Shouldn't have anything in fallback buffer for GetCharCount
1140 // (don't have to check _throwOnOverflow for count)
1141 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1142 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
1145 while (bytes < byteEnd)
1147 // If we're aligned then maybe we can do it fast
1148 // That'll hurt if we're unaligned because we'll always test but never be aligned
1149 #if !NO_FAST_UNICODE_LOOP
1155 #if BIT64 // win64 has to be long aligned
1156 (unchecked((long)bytes) & 7) == 0 &&
1158 (unchecked((int)bytes) & 3) == 0 &&
1160 lastByte == -1 && lastChar == 0)
1162 // Need new char* so we can check 4 at a time
1163 ulong* longBytes = (ulong*)bytes;
1165 while (longBytes < longEnd)
1167 // See if we potentially have surrogates (0x8000 bit set)
1168 // (We're either big endian on a big endian machine or little endian on
1169 // a little endian machine so that'll work)
1170 if ((0x8000800080008000 & *longBytes) != 0)
1172 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1173 // 5 bits looks like 11011, then its a high or low surrogate.
1174 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1175 // Note that we expect BMP characters to be more common than surrogates
1176 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1177 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1179 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1180 // but no clue if they're high or low.
1181 // If each of the 4 characters are non-zero, then none are surrogates.
1182 if ((uTemp & 0xFFFF000000000000) == 0 ||
1183 (uTemp & 0x0000FFFF00000000) == 0 ||
1184 (uTemp & 0x00000000FFFF0000) == 0 ||
1185 (uTemp & 0x000000000000FFFF) == 0)
1187 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1188 // or if there's 1 or 4 surrogates
1190 // If they happen to be high/low/high/low, we may as well continue. Check the next
1191 // bit to see if its set (low) or not (high) in the right pattern
1193 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
1195 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
1198 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1199 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1201 // Drop out to the slow loop to resolve the surrogates
1204 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1206 // else none are surrogates, so we can use them.
1208 // else all < 0x8000 so we can use them
1210 // We can use these 4 chars.
// Continue the byte-at-a-time loop from wherever the 64-bit scan stopped.
1214 bytes = (byte*)longBytes;
1216 if (bytes >= byteEnd)
1219 #endif // !NO_FAST_UNICODE_LOOP
// Hold the first byte of the next code unit; if the input ends right after it,
// the byte stays pending in lastByte.
1224 lastByte = *bytes++;
1225 if (bytes >= byteEnd) break;
// In this form lastByte supplies the high-order byte of the code unit.
1232 ch = (char)(lastByte << 8 | *(bytes++));
// In this form lastByte supplies the low-order byte of the code unit.
1236 ch = (char)(*(bytes++) << 8 | lastByte);
1240 // See if the char's valid
// 0xd800-0xdfff is the surrogate range (high and low together).
1241 if (ch >= 0xd800 && ch <= 0xdfff)
1243 // Was it a high surrogate?
1246 // Its a high surrogate, if we had one then do fallback for previous one
1249 // Ignore previous bad high surrogate
1252 // Get fallback for previous high surrogate
1253 // Note we have to reconstruct bytes because some may have been in decoder
1254 byte[] byteBuffer = null;
1257 byteBuffer = new byte[]
1258 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1262 byteBuffer = new byte[]
1263 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1266 if (fallbackBuffer == null)
1268 if (decoder == null)
1269 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1271 fallbackBuffer = decoder.FallbackBuffer;
1273 // Set our internal fallback interesting things.
1274 fallbackBuffer.InternalInitialize(byteStart, null);
1278 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1281 // Ignore the last one which fell back already,
1282 // and remember the new high surrogate
1287 // Its a low surrogate
1290 // Expected a previous high surrogate
1293 // Get fallback for this low surrogate
1294 // Note we have to reconstruct bytes because some may have been in decoder
1295 byte[] byteBuffer = null;
1298 byteBuffer = new byte[]
1299 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1303 byteBuffer = new byte[]
1304 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1307 if (fallbackBuffer == null)
1309 if (decoder == null)
1310 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1312 fallbackBuffer = decoder.FallbackBuffer;
1314 // Set our internal fallback interesting things.
1315 fallbackBuffer.InternalInitialize(byteStart, null);
1318 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1320 // Ignore this one (we already did its fallback)
1324 // Valid surrogate pair, already counted.
1327 else if (lastChar > 0)
1329 // Had a high surrogate, expected a low surrogate
1330 // Un-count the last high surrogate
1333 // fall back the high surrogate.
1334 byte[] byteBuffer = null;
1337 byteBuffer = new byte[]
1338 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1342 byteBuffer = new byte[]
1343 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1346 if (fallbackBuffer == null)
1348 if (decoder == null)
1349 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1351 fallbackBuffer = decoder.FallbackBuffer;
1353 // Set our internal fallback interesting things.
1354 fallbackBuffer.InternalInitialize(byteStart, null);
1357 // Already subtracted high surrogate
1358 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1360 // Not left over now, clear previous high surrogate and continue to add current char
1364 // Valid char, already counted
1367 // Extra space if we can't use decoder
// Without a decoder, or when the decoder must flush, leftover state is pushed through the
// fallback below.
1368 if (decoder == null || decoder.MustFlush)
1372 // No hanging high surrogates allowed, do fallback and remove count for it
1374 byte[] byteBuffer = null;
1377 byteBuffer = new byte[]
1378 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1382 byteBuffer = new byte[]
1383 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1386 if (fallbackBuffer == null)
1388 if (decoder == null)
1389 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1391 fallbackBuffer = decoder.FallbackBuffer;
1393 // Set our internal fallback interesting things.
1394 fallbackBuffer.InternalInitialize(byteStart, null);
1397 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1404 if (fallbackBuffer == null)
1406 if (decoder == null)
1407 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1409 fallbackBuffer = decoder.FallbackBuffer;
1411 // Set our internal fallback interesting things.
1412 fallbackBuffer.InternalInitialize(byteStart, null);
1415 // No hanging odd bytes allowed if must flush
1416 charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
1421 // If we had a high surrogate left over, we can't count it
1425 // Shouldn't have anything in fallback buffer for GetCharCount
1426 // (don't have to check _throwOnOverflow for count)
1427 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1428 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");
// Decodes byteCount bytes starting at `bytes` into `chars`, which has room for charCount chars,
// and returns the number of chars written (chars - charStart).
//   bytes, chars         - input and output buffers (both asserted non-null).
//   byteCount, charCount - sizes of those buffers (both asserted non-negative).
//   baseDecoder          - optional decoder; cast to UnicodeEncoding.Decoder and may be null.
//                          When present, its lastByte/lastChar are read at the start, and
//                          _bytesUsed, lastChar and lastByte are written back before returning.
// When a char or a fallback does not fit, the input pointer is rewound and ThrowCharsOverflow
// is called with a flag that is true only if nothing has been written to `chars` yet.
1433 internal override unsafe int GetChars(byte* bytes, int byteCount,
1434 char* chars, int charCount, DecoderNLS baseDecoder)
1436 Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
1437 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
1438 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
1439 Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");
1441 UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;
1445 char lastChar = (char)0;
1447 // Get our decoder (but don't clear it yet)
1448 if (decoder != null)
1450 lastByte = decoder.lastByte;
1451 lastChar = decoder.lastChar;
1453 // Shouldn't have anything in fallback buffer for GetChars
1454 // (don't have to check _throwOnOverflow for chars)
1455 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1456 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
1459 // For fallback we may need a fallback buffer
1460 DecoderFallbackBuffer fallbackBuffer = null;
1461 char* charsForFallback;
1463 byte* byteEnd = bytes + byteCount;
1464 char* charEnd = chars + charCount;
1465 byte* byteStart = bytes;
1466 char* charStart = chars;
1468 while (bytes < byteEnd)
1470 // If we're aligned then maybe we can do it fast
1471 // That'll hurt if we're unaligned because we'll always test but never be aligned
1472 #if !NO_FAST_UNICODE_LOOP
1478 #if BIT64 // win64 has to be long aligned
1479 (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
1481 (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
1483 lastByte == -1 && lastChar == 0)
1485 // Need -1 to check 2 at a time. If we have an even #, longChars will go
1486 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
1487 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1488 // We can only go iCount units (limited by shorter of char or byte buffers.
// (charEnd - chars) << 1 expresses the remaining output capacity in bytes, so both
// branches of the conditional below are byte counts.
1489 ulong* longEnd = (ulong*)(bytes - 7 +
1490 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
1491 (byteEnd - bytes) : (charEnd - chars) << 1));
1493 // Need new char* so we can check 4 at a time
1494 ulong* longBytes = (ulong*)bytes;
1495 ulong* longChars = (ulong*)chars;
1497 while (longBytes < longEnd)
1499 // See if we potentially have surrogates (0x8000 bit set)
1500 // (We're either big endian on a big endian machine or little endian on
1501 // a little endian machine so that'll work)
1502 if ((0x8000800080008000 & *longBytes) != 0)
1504 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1505 // 5 bits looks like 11011, then its a high or low surrogate.
1506 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1507 // Note that we expect BMP characters to be more common than surrogates
1508 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1509 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1511 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1512 // but no clue if they're high or low.
1513 // If each of the 4 characters are non-zero, then none are surrogates.
1514 if ((uTemp & 0xFFFF000000000000) == 0 ||
1515 (uTemp & 0x0000FFFF00000000) == 0 ||
1516 (uTemp & 0x00000000FFFF0000) == 0 ||
1517 (uTemp & 0x000000000000FFFF) == 0)
1519 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1520 // or if there's 1 or 4 surrogates
1522 // If they happen to be high/low/high/low, we may as well continue. Check the next
1523 // bit to see if its set (low) or not (high) in the right pattern
1525 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
1527 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
1530 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1531 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1533 // Drop out to the slow loop to resolve the surrogates
1536 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1538 // else none are surrogates, so we can use them.
1540 // else all < 0x8000 so we can use them
1542 // We can use these 4 chars.
// A single 64-bit store copies all four code units to the output.
1543 *longChars = *longBytes;
// Hand the advanced positions back to the byte-at-a-time loop.
1548 chars = (char*)longChars;
1549 bytes = (byte*)longBytes;
1551 if (bytes >= byteEnd)
1554 #endif // !NO_FAST_UNICODE_LOOP
// First byte of the next code unit.
1559 lastByte = *bytes++;
// In this form lastByte supplies the high-order byte of the code unit.
1567 ch = (char)(lastByte << 8 | *(bytes++));
// In this form lastByte supplies the low-order byte of the code unit.
1571 ch = (char)(*(bytes++) << 8 | lastByte);
1575 // See if the char's valid
// 0xd800-0xdfff is the surrogate range (high and low together).
1576 if (ch >= 0xd800 && ch <= 0xdfff)
1578 // Was it a high surrogate?
1581 // Its a high surrogate, if we had one then do fallback for previous one
1584 // Get fallback for previous high surrogate
1585 // Note we have to reconstruct bytes because some may have been in decoder
1586 byte[] byteBuffer = null;
1589 byteBuffer = new byte[]
1590 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1594 byteBuffer = new byte[]
1595 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1598 if (fallbackBuffer == null)
1600 if (decoder == null)
1601 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1603 fallbackBuffer = decoder.FallbackBuffer;
1605 // Set our internal fallback interesting things.
1606 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1609 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1610 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1611 chars = charsForFallback;
1613 if (!fallbackResult)
1615 // couldn't fall back lonely surrogate
1616 // We either advanced bytes or chars should == charStart and throw below
1617 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1618 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
1619 bytes -= 2; // didn't use these 2 bytes
1620 fallbackBuffer.InternalReset();
1621 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1622 break; // couldn't fallback but didn't throw
1626 // Ignore the previous high surrogate which fell back already,
1627 // yet remember the current high surrogate for next time.
1632 // Its a low surrogate
1635 // Expected a previous high surrogate
1636 // Get fallback for this low surrogate
1637 // Note we have to reconstruct bytes because some may have been in decoder
1638 byte[] byteBuffer = null;
1641 byteBuffer = new byte[]
1642 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1646 byteBuffer = new byte[]
1647 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1650 if (fallbackBuffer == null)
1652 if (decoder == null)
1653 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1655 fallbackBuffer = decoder.FallbackBuffer;
1657 // Set our internal fallback interesting things.
1658 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1661 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1662 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1663 chars = charsForFallback;
1665 if (!fallbackResult)
1667 // couldn't fall back lonely surrogate
1668 // We either advanced bytes or chars should == charStart and throw below
1669 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1670 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
1671 bytes -= 2; // didn't use these 2 bytes
1672 fallbackBuffer.InternalReset();
1673 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1674 break; // couldn't fallback but didn't throw
1677 // Didn't throw, ignore this one (we already did its fallback)
1681 // Valid surrogate pair, add our lastChar (will need 2 chars)
// chars >= charEnd - 1 means fewer than two free output slots remain.
1682 if (chars >= charEnd - 1)
1684 // couldn't find room for this surrogate pair
1685 // We either advanced bytes or chars should == charStart and throw below
1686 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1687 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
1688 bytes -= 2; // didn't use these 2 bytes
1689 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1690 // Leave lastChar for next call to Convert()
1691 break; // couldn't fallback but didn't throw
// Emit the pending high surrogate held in lastChar.
1694 *chars++ = lastChar;
1697 else if (lastChar > 0)
1699 // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
1700 byte[] byteBuffer = null;
1703 byteBuffer = new byte[]
1704 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1708 byteBuffer = new byte[]
1709 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1712 if (fallbackBuffer == null)
1714 if (decoder == null)
1715 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1717 fallbackBuffer = decoder.FallbackBuffer;
1719 // Set our internal fallback interesting things.
1720 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1723 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1724 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1725 chars = charsForFallback;
1727 if (!fallbackResult)
1729 // couldn't fall back high surrogate, or char that would be next
1730 // We either advanced bytes or chars should == charStart and throw below
1731 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1732 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
1733 bytes -= 2; // didn't use these 2 bytes
1734 fallbackBuffer.InternalReset();
1735 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1736 break; // couldn't fallback but didn't throw
1739 // Not left over now, clear previous high surrogate and continue to add current char
1743 // Valid char, room for it?
1744 if (chars >= charEnd)
1746 // 2 bytes couldn't fall back
1747 // We either advanced bytes or chars should == charStart and throw below
1748 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1749 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
1750 bytes -= 2; // didn't use these bytes
1751 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1752 break; // couldn't fallback but didn't throw
1759 // Remember our decoder if we must
// Without a decoder, or when the decoder must flush, leftover state is pushed through the
// fallback below.
1760 if (decoder == null || decoder.MustFlush)
1764 // No hanging high surrogates allowed, do fallback and remove count for it
1765 byte[] byteBuffer = null;
1768 byteBuffer = new byte[]
1769 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1773 byteBuffer = new byte[]
1774 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1777 if (fallbackBuffer == null)
1779 if (decoder == null)
1780 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1782 fallbackBuffer = decoder.FallbackBuffer;
1784 // Set our internal fallback interesting things.
1785 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1788 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1789 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1790 chars = charsForFallback;
1792 if (!fallbackResult)
1794 // 2 bytes couldn't fall back
1795 // We either advanced bytes or chars should == charStart and throw below
1796 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1797 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
1798 bytes -= 2; // didn't use these bytes
1800 bytes--; // had an extra last byte hanging around
1801 fallbackBuffer.InternalReset();
1802 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1803 // We'll remember these in our decoder though
1810 // done with this one
1816 if (fallbackBuffer == null)
1818 if (decoder == null)
1819 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1821 fallbackBuffer = decoder.FallbackBuffer;
1823 // Set our internal fallback interesting things.
1824 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1827 // No hanging odd bytes allowed if must flush
1828 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1829 bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
1830 chars = charsForFallback;
1832 if (!fallbackResult)
1834 // odd byte couldn't fall back
1835 bytes--; // didn't use this byte
1836 fallbackBuffer.InternalReset();
1837 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1838 // didn't throw, but we'll remember it in the decoder
1843 // Didn't fail, clear buffer
1850 // Remember our decoder if we must
1851 if (decoder != null)
1853 Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)),
1854 "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
1855 // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
// Record how many input bytes were consumed and store the leftover state in the decoder.
1858 decoder._bytesUsed = (int)(bytes - byteStart);
1859 decoder.lastChar = lastChar;
1860 decoder.lastByte = lastByte;
1863 // Used to do this the old way
1864 // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount);
1866 // Shouldn't have anything in fallback buffer for GetChars
1867 // (don't have to check _throwOnOverflow for count or chars)
1868 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1869 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
1871 return (int)(chars - charStart);
// Creates a new EncoderNLS bound to this encoding instance.
public override System.Text.Encoder GetEncoder()
{
    System.Text.Encoder encoder = new EncoderNLS(this);
    return encoder;
}
// Creates a new UnicodeEncoding.Decoder (the nested decoder type) bound to this encoding instance.
public override System.Text.Decoder GetDecoder()
{
    System.Text.Decoder decoder = new UnicodeEncoding.Decoder(this);
    return decoder;
}
// Returns the byte order mark that identifies this encoding's byte order, or an empty
// array when the encoding was constructed with byteOrderMark == false.
//   Big endian:    FE FF
//   Little endian: FF FE
public override byte[] GetPreamble()
{
    // No byte order mark requested: hand back the shared empty array (it has no elements
    // a caller could modify).
    if (!byteOrderMark)
    {
        return Array.Empty<Byte>();
    }

    // Note - we must allocate new byte[]'s here to prevent someone
    // from modifying a cached byte[].
    return bigEndian
        ? new byte[2] { 0xfe, 0xff }
        : new byte[2] { 0xff, 0xfe };
}
// Returns an upper bound on the number of bytes needed to encode charCount chars.
//   charCount - number of chars to encode; must be non-negative.
// Throws ArgumentOutOfRangeException when charCount is negative or when the bound does not
// fit in an int.
public override int GetMaxByteCount(int charCount)
{
    // Only a negative count is rejected; zero is a valid input.
    if (charCount < 0)
        throw new ArgumentOutOfRangeException(nameof(charCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
    // (computed as long so the intermediate values cannot wrap before the range check below).
    long byteCount = (long)charCount + 1;

    if (EncoderFallback.MaxCharCount > 1)
    {
        byteCount *= EncoderFallback.MaxCharCount;
    }

    // Up to here the value is a count of chars; each char is CharSize (2) bytes.
    byteCount *= CharSize;

    if (byteCount > 0x7fffffff)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
    }

    return (int)byteCount;
}
// Returns an upper bound on the number of chars produced by decoding byteCount bytes.
//   byteCount - number of bytes to decode; must be non-negative.
// Throws ArgumentOutOfRangeException when byteCount is negative or when the bound does not
// fit in an int.
public override int GetMaxCharCount(int byteCount)
{
    // Only a negative count is rejected; zero is a valid input.
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // long because byteCount could be biggest int.
    // 1 char per 2 bytes. Round up in case 1 left over in decoder.
    // Round up using &1 in case byteCount is max size
    // Might also need an extra 1 if there's a left over high surrogate in the decoder.
    long wholeChars = (long)(byteCount >> 1);
    long charCount = wholeChars + (byteCount & 1) + 1;

    // Don't forget fallback (in case they have a bunch of lonely surrogates or something bizarre like that)
    if (DecoderFallback.MaxCharCount > 1)
    {
        charCount *= DecoderFallback.MaxCharCount;
    }

    if (charCount > 0x7fffffff)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
    }

    return (int)charCount;
}
// Two UnicodeEncoding instances are equal when they agree on code page, byte order mark
// setting, byte order, and both fallbacks. Anything that is not a UnicodeEncoding
// (including null) compares unequal.
public override bool Equals(Object value)
{
    UnicodeEncoding that = value as UnicodeEncoding;

    // `as` yields null for a null argument or an object of another type; bail out before
    // any member of `that` is touched.
    if (that == null)
    {
        return false;
    }

    // Big Endian Unicode has different code page (1201) than small Endian one (1200),
    // so we still have to check _codePage here.
    // isThrowException is deliberately not compared: it is the same as the encoder/decoder
    // fallbacks being exception fallbacks, and the fallbacks are compared below.
    return (CodePage == that.CodePage) &&
        byteOrderMark == that.byteOrderMark &&
        bigEndian == that.bigEndian &&
        (EncoderFallback.Equals(that.EncoderFallback)) &&
        (DecoderFallback.Equals(that.DecoderFallback));
}
// Combines the code page, both fallbacks' hash codes, and one distinct bit each for the
// byteOrderMark and bigEndian settings by plain addition.
public override int GetHashCode()
{
    int hash = CodePage;
    hash += this.EncoderFallback.GetHashCode();
    hash += this.DecoderFallback.GetHashCode();
    hash += byteOrderMark ? 4 : 0;
    hash += bigEndian ? 8 : 0;
    return hash;
}
// Decoder for UnicodeEncoding. It holds the partial input that GetChars stores back at the
// end of a call and reads again at the start of the next one: an odd byte (lastByte) and a
// high surrogate (lastChar).
1974 private sealed class Decoder : System.Text.DecoderNLS
// -1 is the "no byte pending" value tested by HasState.
1976 internal int lastByte = -1;
// '\0' is the "no char pending" value tested by HasState.
1977 internal char lastChar = '\0';
// Binds the decoder to its encoding by passing it to the DecoderNLS constructor.
1979 public Decoder(UnicodeEncoding encoding) : base(encoding)
// Resets the fallback buffer if one has been created.
1984 public override void Reset()
1988 if (_fallbackBuffer != null)
1989 _fallbackBuffer.Reset();
1992 // Anything left in our decoder?
// True when either an odd byte or a char is still pending.
1993 internal override bool HasState
1997 return (this.lastByte != -1 || this.lastChar != '\0');