1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
10 using System.Globalization;
11 using System.Runtime.Serialization;
12 using System.Diagnostics;
13 using System.Diagnostics.Contracts;
17 public class UnicodeEncoding : Encoding
19 // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization
20 // The initialization code will not be run until a static member of the class is referenced
// Shared big-endian instance, built with bigEndian: true and byteOrderMark: true.
21 internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
// Shared little-endian instance, built with bigEndian: false and byteOrderMark: true.
22 internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
// Selects between the exception fallbacks and the replacement fallbacks in
// SetDefaultFallbacks. Declared as an optional serialized field added in version 2.
24 [OptionalField(VersionAdded = 2)]
25 internal bool isThrowException = false;
// Set from the bigEndian constructor argument.
27 internal bool bigEndian = false;
// Set from the byteOrderMark constructor argument.
28 internal bool byteOrderMark = true;
30 // Unicode version 2.0 character size in bytes
31 public const int CharSize = 2;
// Parameterless constructor.
// NOTE(review): the constructor initializer and body are not visible in this extract,
// so the defaults it selects cannot be confirmed from here.
34 public UnicodeEncoding()
// Creates an encoding with the given byte order and byte-order-mark setting.
// Forwards to the three-argument constructor with throwOnInvalidBytes = false.
40 public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
41 : this(bigEndian, byteOrderMark, false)
// Creates an encoding with explicit byte order, byte-order-mark setting and error behavior.
//   bigEndian           - stored in the bigEndian field; also selects the value handed to
//                         the base constructor (1201 when true, 1200 when false).
//   byteOrderMark       - stored in the byteOrderMark field.
//   throwOnInvalidBytes - stored in isThrowException; when true the default fallbacks
//                         are set up again at the end of the constructor.
46 public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
47 : base(bigEndian ? 1201 : 1200) //Set the data item.
49 this.isThrowException = throwOnInvalidBytes;
50 this.bigEndian = bigEndian;
51 this.byteOrderMark = byteOrderMark;
53 // Encoding constructor already did this, but it'll be wrong if we're throwing exceptions
54 if (this.isThrowException)
55 SetDefaultFallbacks();
// Resets isThrowException to false.
// NOTE(review): presumably a serialization callback that runs before the fields are
// populated; the attribute line is not visible in this extract - confirm.
60 private void OnDeserializing(StreamingContext ctx)
62 // In Everett it is false. Whidbey will overwrite this value.
63 isThrowException = false;
65 #endregion Serialization
// Installs the encoder and decoder fallbacks that match isThrowException.
// NOTE(review): braces and the else keyword are missing from this extract.
67 internal override void SetDefaultFallbacks()
69 // For UTF-X encodings the default is a replacement fallback that substitutes U+FFFD;
// when isThrowException is set, the exception fallbacks are installed instead.
70 if (this.isThrowException)
72 this.encoderFallback = EncoderFallback.ExceptionFallback;
73 this.decoderFallback = DecoderFallback.ExceptionFallback;
// Replacement fallbacks: both the encoder and the decoder substitute U+FFFD.
77 this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
78 this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
82 // The following methods are copied from EncodingNLS.cs.
83 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
84 // These should be kept in sync for the following classes:
85 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
88 // Returns the number of bytes required to encode a range of characters in a character array.
91 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
92 // So if you fix this, fix the others. Currently those include:
93 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
94 // parent method is safe
// Counts the bytes needed to encode count chars of the array, starting at index.
// After validating the arguments it pins the array and delegates to the pointer-based
// GetByteCount overload with a null encoder.
// Throws ArgumentNullException naming chars, and ArgumentOutOfRangeException when index or
// count is negative or when fewer than count chars remain after index.
// NOTE(review): braces, the null guard and the empty-input return are missing from this extract.
96 public override unsafe int GetByteCount(char[] chars, int index, int count)
98 // Validate input parameters
100 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
102 if (index < 0 || count < 0)
103 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
105 if (chars.Length - index < count)
106 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
107 Contract.EndContractBlock();
109 // If no input, return 0, avoid fixed empty array problem
113 // Just call the pointer version
114 fixed (char* pChars = chars)
115 return GetByteCount(pChars + index, count, null);
118 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
119 // So if you fix this, fix the others. Currently those include:
120 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
121 // parent method is safe
// Counts the bytes needed to encode the whole string s: pins the string and calls the
// pointer-based GetByteCount overload with s.Length and a null encoder.
// Throws ArgumentNullException naming s.
// NOTE(review): braces and the null guard are missing from this extract.
123 public override unsafe int GetByteCount(String s)
127 throw new ArgumentNullException("s");
128 Contract.EndContractBlock();
130 fixed (char* pChars = s)
131 return GetByteCount(pChars, s.Length, null);
134 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
135 // So if you fix this, fix the others. Currently those include:
136 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload of GetByteCount: validates the arguments, then forwards to the
// three-argument GetByteCount(chars, count, encoder) with a null encoder.
// Throws ArgumentNullException naming chars and ArgumentOutOfRangeException naming count.
// NOTE(review): braces and both guard conditions are missing from this extract.
138 [CLSCompliant(false)]
139 public override unsafe int GetByteCount(char* chars, int count)
141 // Validate Parameters
143 throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
146 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
147 Contract.EndContractBlock();
149 // Call the encoder-aware overload with no encoder (null)
150 return GetByteCount(chars, count, null);
153 // Parent method is safe.
154 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
155 // So if you fix this, fix the others. Currently those include:
156 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars of s, starting at charIndex, into bytes starting at byteIndex.
// The output budget handed to the pointer-based GetBytes is bytes.Length - byteIndex, and
// the value returned is whatever that overload returns.
// Throws ArgumentNullException for a null s or bytes, and ArgumentOutOfRangeException for a
// negative charIndex or charCount, for fewer than charCount chars after charIndex, or for a
// byteIndex outside 0..bytes.Length.
// NOTE(review): braces and the statement guarded by the bytes.Length == 0 check are
// missing from this extract.
158 public override unsafe int GetBytes(String s, int charIndex, int charCount,
159 byte[] bytes, int byteIndex)
161 if (s == null || bytes == null)
162 throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array);
164 if (charIndex < 0 || charCount < 0)
165 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
167 if (s.Length - charIndex < charCount)
168 throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
170 if (byteIndex < 0 || byteIndex > bytes.Length)
171 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
172 Contract.EndContractBlock();
// Space available in the output array from byteIndex to its end.
174 int byteCount = bytes.Length - byteIndex;
176 // Fixed doesn't like 0 length arrays.
177 if (bytes.Length == 0)
180 fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
181 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
184 // Encodes a range of characters in a character array into a range of bytes
185 // in a byte array. An exception occurs if the byte array is not large
186 // enough to hold the complete encoding of the characters. The
187 // GetByteCount method can be used to determine the exact number of
188 // bytes that will be produced for a given range of characters.
189 // Alternatively, the GetMaxByteCount method can be used to
190 // determine the maximum number of bytes that will be produced for a given
191 // number of characters, regardless of the actual character values.
193 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
194 // So if you fix this, fix the others. Currently those include:
195 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
196 // parent method is safe
// Encodes charCount chars of the array, starting at charIndex, into bytes starting at
// byteIndex. The output budget handed to the pointer-based GetBytes is
// bytes.Length - byteIndex, and the value returned is whatever that overload returns.
// Throws ArgumentNullException for a null chars or bytes, and ArgumentOutOfRangeException
// for a negative charIndex or charCount, for fewer than charCount chars after charIndex,
// or for a byteIndex outside 0..bytes.Length.
// NOTE(review): braces, the empty-input return and the statement guarded by the
// bytes.Length == 0 check are missing from this extract.
198 public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
199 byte[] bytes, int byteIndex)
201 // Validate parameters
202 if (chars == null || bytes == null)
203 throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array);
205 if (charIndex < 0 || charCount < 0)
206 throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
208 if (chars.Length - charIndex < charCount)
209 throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
211 if (byteIndex < 0 || byteIndex > bytes.Length)
212 throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
213 Contract.EndContractBlock();
215 // If nothing to encode return 0, avoid fixed problem
219 // Just call pointer version
220 int byteCount = bytes.Length - byteIndex;
222 // Fixed doesn't like 0 length arrays.
223 if (bytes.Length == 0)
226 fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
227 // Remember that byteCount is the space left after byteIndex, not the size of the array.
228 return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
231 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
232 // So if you fix this, fix the others. Currently those include:
233 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload of GetBytes: validates the arguments, then forwards to the
// five-argument GetBytes(chars, charCount, bytes, byteCount, encoder) with a null encoder.
// Throws ArgumentNullException for a null bytes or chars (bytes is reported first), and
// ArgumentOutOfRangeException for a negative charCount or byteCount.
238 [CLSCompliant(false)]
236 public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
238 // Validate Parameters
239 if (bytes == null || chars == null)
240 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
242 if (charCount < 0 || byteCount < 0)
243 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
244 Contract.EndContractBlock();
246 return GetBytes(chars, charCount, bytes, byteCount, null);
249 // Returns the number of characters produced by decoding a range of bytes in a byte array.
252 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
253 // So if you fix this, fix the others. Currently those include:
254 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
255 // parent method is safe
// Counts the chars produced by decoding count bytes of the array, starting at index.
// After validating the arguments it pins the array and delegates to the pointer-based
// GetCharCount overload with a null decoder argument.
// Throws ArgumentNullException naming bytes, and ArgumentOutOfRangeException when index or
// count is negative or when fewer than count bytes remain after index.
// NOTE(review): braces, the null guard and the empty-input return are missing from this extract.
257 public override unsafe int GetCharCount(byte[] bytes, int index, int count)
259 // Validate Parameters
261 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
263 if (index < 0 || count < 0)
264 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
266 if (bytes.Length - index < count)
267 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
268 Contract.EndContractBlock();
270 // If no input just return 0, fixed doesn't like 0 length arrays
274 // Just call pointer version
275 fixed (byte* pBytes = bytes)
276 return GetCharCount(pBytes + index, count, null);
279 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
280 // So if you fix this, fix the others. Currently those include:
281 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload of GetCharCount: validates the arguments, then forwards to the
// three-argument GetCharCount with null as the last argument.
// Throws ArgumentNullException naming bytes and ArgumentOutOfRangeException naming count.
// NOTE(review): braces and both guard conditions are missing from this extract.
283 [CLSCompliant(false)]
284 public override unsafe int GetCharCount(byte* bytes, int count)
286 // Validate Parameters
288 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
291 throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
292 Contract.EndContractBlock();
294 return GetCharCount(bytes, count, null);
297 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
298 // So if you fix this, fix the others. Currently those include:
299 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
300 // parent method is safe
// Decodes byteCount bytes of the array, starting at byteIndex, into chars starting at
// charIndex. The output budget handed to the pointer-based GetChars is
// chars.Length - charIndex, and the value returned is whatever that overload returns.
// Throws ArgumentNullException for a null bytes or chars, and ArgumentOutOfRangeException
// for a negative byteIndex or byteCount, for fewer than byteCount bytes after byteIndex,
// or for a charIndex outside 0..chars.Length.
// NOTE(review): braces, the empty-input return and the statement guarded by the
// chars.Length == 0 check are missing from this extract.
302 public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
303 char[] chars, int charIndex)
305 // Validate Parameters
306 if (bytes == null || chars == null)
307 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
309 if (byteIndex < 0 || byteCount < 0)
310 throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
312 if ( bytes.Length - byteIndex < byteCount)
313 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
315 if (charIndex < 0 || charIndex > chars.Length)
316 throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
317 Contract.EndContractBlock();
319 // If no input, return 0 & avoid fixed problem
323 // Just call pointer version
324 int charCount = chars.Length - charIndex;
326 // Fixed doesn't like 0 length arrays.
327 if (chars.Length == 0)
330 fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
331 // Remember that charCount is the space left after charIndex, not the size of the array
332 return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
335 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
336 // So if you fix this, fix the others. Currently those include:
337 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload of GetChars: validates the arguments, then forwards to the
// five-argument GetChars(bytes, byteCount, chars, charCount, decoder) with null as the
// last argument.
// Throws ArgumentNullException for a null bytes or chars (bytes is reported first), and
// ArgumentOutOfRangeException for a negative charCount or byteCount.
339 [CLSCompliant(false)]
340 public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
342 // Validate Parameters
343 if (bytes == null || chars == null)
344 throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
346 if (charCount < 0 || byteCount < 0)
347 throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
348 Contract.EndContractBlock();
350 return GetChars(bytes, byteCount, chars, charCount, null);
353 // Returns a string containing the decoded representation of a range of
354 // bytes in a byte array.
356 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
357 // So if you fix this, fix the others. Currently those include:
358 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
359 // parent method is safe
// Decodes count bytes of the array, starting at index, into a string.
// Returns String.Empty when count is 0; otherwise pins the array and lets
// String.CreateStringFromEncoding build the result using this encoding.
// Throws ArgumentNullException naming bytes, and ArgumentOutOfRangeException when index or
// count is negative or when fewer than count bytes remain after index.
// NOTE(review): braces and the null guard are missing from this extract.
361 public override unsafe string GetString(byte[] bytes, int index, int count)
363 // Validate Parameters
365 throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
367 if (index < 0 || count < 0)
368 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
370 if (bytes.Length - index < count)
371 throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
372 Contract.EndContractBlock();
374 // Avoid problems with empty input buffer
375 if (count == 0) return String.Empty;
377 fixed (byte* pBytes = bytes)
378 return String.CreateStringFromEncoding(
379 pBytes + index, count, this);
383 // End of standard methods copied from EncodingNLS.cs
// Core byte-count routine used by the public GetByteCount overloads.
//   chars   - start of the input; asserted non-null.
//   count   - number of chars to examine; asserted non-negative.
//   encoder - optional state carrier; when used, its charLeftOver and fallback buffer
//             are picked up below. The public overloads in this file pass null.
// The running total starts at two bytes per char (count << 1). According to the comments
// further down, bytes are taken back for surrogates that are left over or handed to the
// fallback, and chars produced by the fallback buffer are fed back through the main loop.
// NOTE(review): this extract is missing many lines (braces, several conditions, the
// byteCount adjustments and the final return), so the comments here restate only what
// is visible.
386 internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
388 Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
389 Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");
391 // Start by assuming each char gets 2 bytes
392 int byteCount = count << 1;
394 // Check for overflow in byteCount
395 // (If they were all invalid chars, this would actually be wrong,
396 // but that's a ridiculously large # so we're not concerned about that case)
398 throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);
400 char* charStart = chars;
401 char* charEnd = chars + count;
402 char charLeftOver = (char)0;
404 bool wasHereBefore = false;
406 // Stop 3 chars short of charEnd so that every 4-char (one ulong) read in the fast
407 // loop stays inside the input: the loop runs while longChars < longEnd and reads
408 // *longChars. (Might not get to use this)
409 ulong* longEnd = (ulong*)(charEnd - 3);
411 // For fallback we may need a fallback buffer
412 EncoderFallbackBuffer fallbackBuffer = null;
413 char* charsForFallback;
417 charLeftOver = encoder.charLeftOver;
419 // Assume extra bytes to encode charLeftOver if it existed
420 if (charLeftOver > 0)
423 // We mustn't have left over fallback data when counting
424 if (encoder.InternalHasFallbackBuffer)
426 fallbackBuffer = encoder.FallbackBuffer;
427 if (fallbackBuffer.Remaining > 0)
428 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
430 // Set our internal fallback interesting things.
431 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
// Main loop: keep going while the fallback buffer still yields a char (non-zero ch)
// or there is unread input left before charEnd.
438 while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
440 // First unwind any fallback
443 // No fallback, maybe we can do it fast
444 #if !NO_FAST_UNICODE_LOOP
445 #if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards.
451 #if BIT64 // 64 bit CPU needs to be long aligned for this to work.
452 charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
454 charLeftOver == 0 && (unchecked((int)chars) & 3) == 0)
457 // Need new char* so we can check 4 at a time
458 ulong* longChars = (ulong*)chars;
460 while (longChars < longEnd)
462 // See if we potentially have surrogates (0x8000 bit set)
463 // (We're either big endian on a big endian machine or little endian on
464 // a little endian machine so this'll work)
465 if ((0x8000800080008000 & *longChars) != 0)
467 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
468 // 5 bits looks like 11011, then it's a high or low surrogate.
469 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
470 // Note that we expect BMP characters to be more common than surrogates
471 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
472 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
474 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
475 // but no clue if they're high or low.
476 // If each of the 4 characters are non-zero, then none are surrogates.
477 if ((uTemp & 0xFFFF000000000000) == 0 ||
478 (uTemp & 0x0000FFFF00000000) == 0 ||
479 (uTemp & 0x00000000FFFF0000) == 0 ||
480 (uTemp & 0x000000000000FFFF) == 0)
482 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
483 // or if there's 1 or 4 surrogates
485 // If they happen to be high/low/high/low, we may as well continue. Check the next
486 // bit to see if it's set (low) or not (high) in the right pattern
// The two masks below are the same high/low/high/low test with the 16-bit lanes in
// opposite order; the preprocessor lines that choose between them are not visible here.
488 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
490 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
493 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
494 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
496 // Drop out to the slow loop to resolve the surrogates
499 // else they are all surrogates in High/Low/High/Low order, so we can use them.
501 // else none are surrogates, so we can use them.
503 // else all < 0x8000 so we can use them
505 // We already counted these four chars, go to next long.
// Resume the char-by-char loop from wherever the fast loop stopped.
509 chars = (char*)longChars;
511 if (chars >= charEnd)
514 #endif // !NO_FAST_UNICODE_LOOP
516 // No fallback, just get next char
522 // We weren't preallocating fallback space.
526 // Check for high or low surrogates
527 if (ch >= 0xd800 && ch <= 0xdfff)
529 // Was it a high surrogate?
532 // It's a high surrogate, if we already had a high surrogate do its fallback
533 if (charLeftOver > 0)
535 // Unwind the current character, this should be safe because we
536 // don't have leftover data in the fallback, so chars must have advanced already.
538 Debug.Assert(chars > charStart,
539 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
542 // If previous high surrogate deallocate 2 bytes
545 // Fallback the previous surrogate
546 // Need to initialize fallback buffer?
547 if (fallbackBuffer == null)
550 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
552 fallbackBuffer = encoder.FallbackBuffer;
554 // Set our internal fallback interesting things.
555 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
558 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
559 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
560 chars = charsForFallback;
562 // Now no high surrogate left over
563 charLeftOver = (char)0;
567 // Remember this high surrogate
573 // It's a low surrogate
574 if (charLeftOver == 0)
576 // Expected a previous high surrogate.
577 // Don't count this one (we'll count its fallback if necessary)
581 // Need to initialize fallback buffer?
582 if (fallbackBuffer == null)
585 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
587 fallbackBuffer = encoder.FallbackBuffer;
589 // Set our internal fallback interesting things.
590 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
592 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
593 fallbackBuffer.InternalFallback(ch, ref charsForFallback);
594 chars = charsForFallback;
598 // Valid surrogate pair, add our charLeftOver
599 charLeftOver = (char)0;
602 else if (charLeftOver > 0)
604 // Expected a low surrogate, but this char is normal
606 // Rewind the current character, fallback previous character.
607 // this should be safe because we don't have leftover data in the
608 // fallback, so chars must have advanced already.
609 Debug.Assert(chars > charStart,
610 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
613 // fallback previous chars
614 // Need to initialize fallback buffer?
615 if (fallbackBuffer == null)
618 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
620 fallbackBuffer = encoder.FallbackBuffer;
622 // Set our internal fallback interesting things.
623 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
625 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
626 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
627 chars = charsForFallback;
629 // Ignore charLeftOver or throw
631 charLeftOver = (char)0;
636 // Ok we had something to add (already counted)
639 // Don't allocate space for left over char
640 if (charLeftOver > 0)
644 // If we have to flush, stick it in fallback and try again
645 if (encoder == null || encoder.MustFlush)
649 // Throw it, using our complete character
650 throw new ArgumentException(
651 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
655 // Need to initialize fallback buffer?
656 if (fallbackBuffer == null)
659 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
661 fallbackBuffer = encoder.FallbackBuffer;
663 // Set our internal fallback interesting things.
664 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
666 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
667 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
668 chars = charsForFallback;
669 charLeftOver = (char)0;
// Mark that the trailing left-over char has been handed to the fallback once; the
// recursive-fallback exception above is thrown in this same section.
// NOTE(review): the condition that reads wasHereBefore is not visible here - confirm.
670 wasHereBefore = true;
676 // Shouldn't have anything in fallback buffer for GetByteCount
677 // (don't have to check m_throwOnOverflow for count)
678 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
679 "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");
681 // Don't remember fallbackBuffer.encoder for counting
685 internal override unsafe int GetBytes(char* chars, int charCount,
686 byte* bytes, int byteCount, EncoderNLS encoder)
688 Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
689 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
690 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
691 Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");
693 char charLeftOver = (char)0;
695 bool wasHereBefore = false;
698 byte* byteEnd = bytes + byteCount;
699 char* charEnd = chars + charCount;
700 byte* byteStart = bytes;
701 char* charStart = chars;
703 // For fallback we may need a fallback buffer
704 EncoderFallbackBuffer fallbackBuffer = null;
705 char* charsForFallback;
707 // Get our encoder, but don't clear it yet.
710 charLeftOver = encoder.charLeftOver;
712 // We mustn't have left over fallback data when counting
713 if (encoder.InternalHasFallbackBuffer)
715 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
716 fallbackBuffer = encoder.FallbackBuffer;
717 if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
718 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
720 // Set our internal fallback interesting things.
721 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
726 while (((ch = (fallbackBuffer == null) ?
727 (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) ||
730 // First unwind any fallback
733 // No fallback, maybe we can do it fast
734 #if !NO_FAST_UNICODE_LOOP
735 #if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards.
740 #if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned
741 (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
743 (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
747 // Need -1 to check 2 at a time. If we have an even #, longChars will go
748 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
749 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
750 // We can only go iCount units (limited by shorter of char or byte buffers.
751 ulong* longEnd = (ulong*)(chars - 3 +
752 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
753 (byteEnd - bytes) >> 1 : charEnd - chars));
755 // Need new char* so we can check 4 at a time
756 ulong* longChars = (ulong*)chars;
757 ulong* longBytes = (ulong*)bytes;
759 while (longChars < longEnd)
761 // See if we potentially have surrogates (0x8000 bit set)
762 // (We're either big endian on a big endian machine or little endian on
763 // a little endian machine so this'll work)
764 if ((0x8000800080008000 & *longChars) != 0)
766 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
767 // 5 bits looks like 11011, then its a high or low surrogate.
768 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
769 // Note that we expect BMP characters to be more common than surrogates
770 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
771 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
773 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
774 // but no clue if they're high or low.
775 // If each of the 4 characters are non-zero, then none are surrogates.
776 if ((uTemp & 0xFFFF000000000000) == 0 ||
777 (uTemp & 0x0000FFFF00000000) == 0 ||
778 (uTemp & 0x00000000FFFF0000) == 0 ||
779 (uTemp & 0x000000000000FFFF) == 0)
781 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
782 // or if there's 1 or 4 surrogates
784 // If they happen to be high/low/high/low, we may as well continue. Check the next
785 // bit to see if its set (low) or not (high) in the right pattern
787 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
789 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
792 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
793 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
795 // Drop out to the slow loop to resolve the surrogates
798 // else they are all surrogates in High/Low/High/Low order, so we can use them.
800 // else none are surrogates, so we can use them.
802 // else all < 0x8000 so we can use them
804 // We can use these 4 chars.
805 *longBytes = *longChars;
810 chars = (char*)longChars;
811 bytes = (byte*)longBytes;
813 if (chars >= charEnd)
816 // Not aligned, but maybe we can still be somewhat faster
817 // Also somehow this optimizes the above loop? It seems to cause something above
818 // to get enregistered, but I haven't figured out how to make that happen without this loop.
819 else if ((charLeftOver == 0) &&
827 (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time
829 (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time
831 (unchecked((int)(bytes)) & 1) == 0)
834 long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ?
835 (byteEnd - bytes) >> 1 : charEnd - chars;
838 char* charOut = ((char*)bytes); // a char* for our output
839 char* tempEnd = chars + iCount - 1; // Our end pointer
841 while (chars < tempEnd)
843 if (*chars >= (char)0xd800 && *chars <= (char)0xdfff)
845 // break for fallback for low surrogate
846 if (*chars >= 0xdc00)
849 // break if next one's not a low surrogate (will do fallback)
850 if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff)
853 // They both exist, use them
855 // If 2nd char is surrogate & this one isn't then only add one
856 else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff)
865 *(charOut + 1) = *(chars + 1);
870 bytes = (byte*)charOut;
872 if (chars >= charEnd)
875 #endif // !NO_FAST_UNICODE_LOOP
877 // No fallback, just get next char
882 // Check for high or low surrogates
883 if (ch >= 0xd800 && ch <= 0xdfff)
885 // Was it a high surrogate?
888 // Its a high surrogate, see if we already had a high surrogate
889 if (charLeftOver > 0)
891 // Unwind the current character, this should be safe because we
892 // don't have leftover data in the fallback, so chars must have
894 Debug.Assert(chars > charStart,
895 "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
898 // Fallback the previous surrogate
899 // Might need to create our fallback buffer
900 if (fallbackBuffer == null)
903 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
905 fallbackBuffer = encoder.FallbackBuffer;
907 // Set our internal fallback interesting things.
908 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
911 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
912 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
913 chars = charsForFallback;
915 charLeftOver = (char)0;
919 // Remember this high surrogate
924 // Its a low surrogate
925 if (charLeftOver == 0)
927 // We'll fall back this one
928 // Might need to create our fallback buffer
929 if (fallbackBuffer == null)
932 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
934 fallbackBuffer = encoder.FallbackBuffer;
936 // Set our internal fallback interesting things.
937 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
940 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
941 fallbackBuffer.InternalFallback(ch, ref charsForFallback);
942 chars = charsForFallback;
946 // Valid surrogate pair, add our charLeftOver
947 if (bytes + 3 >= byteEnd)
949 // Not enough room to add this surrogate pair
950 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
952 // These must have both been from the fallbacks.
953 // Both of these MUST have been from a fallback because if the 1st wasn't
954 // from a fallback, then a high surrogate followed by an illegal char
955 // would've caused the high surrogate to fall back. If a high surrogate
956 // fell back, then it was consumed and both chars came from the fallback.
957 fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate
958 fallbackBuffer.MovePrevious();
962 // If we don't have enough room, then either we should've advanced a while
963 // or we should have bytes==byteStart and throw below
964 Debug.Assert(chars > charStart + 1 || bytes == byteStart,
965 "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
966 chars -= 2; // Didn't use either surrogate
968 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
969 charLeftOver = (char)0; // we'll retry it later
970 break; // Didn't throw, but stop 'til next time.
975 *(bytes++) = (byte)(charLeftOver >> 8);
976 *(bytes++) = (byte)charLeftOver;
980 *(bytes++) = (byte)charLeftOver;
981 *(bytes++) = (byte)(charLeftOver >> 8);
984 charLeftOver = (char)0;
986 else if (charLeftOver > 0)
988 // Expected a low surrogate, but this char is normal
990 // Rewind the current character, fallback previous character.
991 // this should be safe because we don't have leftover data in the
992 // fallback, so chars must have advanced already.
993 Debug.Assert(chars > charStart,
994 "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
997 // fallback previous chars
998 // Might need to create our fallback buffer
999 if (fallbackBuffer == null)
1001 if (encoder == null)
1002 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
1004 fallbackBuffer = encoder.FallbackBuffer;
1006 // Set our internal fallback interesting things.
1007 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
1010 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1011 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
1012 chars = charsForFallback;
1014 // Ignore charLeftOver or throw
1015 charLeftOver = (char)0;
1019 // Ok, we have a char to add
1020 if (bytes + 1 >= byteEnd)
1022 // Couldn't add this char
1023 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1024 fallbackBuffer.MovePrevious(); // Not using this fallback char
1027 // Lonely charLeftOver (from previous call) would've been caught up above,
1028 // so this must be a case where we've already read an input char.
1029 Debug.Assert(chars > charStart,
1030 "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
1031 chars--; // Not using this char
1033 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
1034 break; // didn't throw, just stop
1039 *(bytes++) = (byte)(ch >> 8);
1040 *(bytes++) = (byte)ch;
1044 *(bytes++) = (byte)ch;
1045 *(bytes++) = (byte)(ch >> 8);
1049 // Don't allocate space for left over char
1050 if (charLeftOver > 0)
1052 // If we aren't flushing we need to fall this back
1053 if (encoder == null || encoder.MustFlush)
1057 // Throw it, using our complete character
1058 throw new ArgumentException(
1059 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
1063 // If we have to flush, stick it in fallback and try again
1064 // Might need to create our fallback buffer
1065 if (fallbackBuffer == null)
1067 if (encoder == null)
1068 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
1070 fallbackBuffer = encoder.FallbackBuffer;
1072 // Set our internal fallback interesting things.
1073 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
1076 // If we're not flushing, this'll remember the left over character.
1077 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1078 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
1079 chars = charsForFallback;
1081 charLeftOver = (char)0;
1082 wasHereBefore = true;
1088 // Not flushing, remember it in the encoder
1089 if (encoder != null)
1091 encoder.charLeftOver = charLeftOver;
1092 encoder.m_charsUsed = (int)(chars - charStart);
1095 // Remember charLeftOver if we must, or clear it if we're flushing
1096 // (charLeftOver should be 0 if we're flushing)
1097 Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0,
1098 "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");
1100 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1101 encoder == null || !encoder.m_throwOnOverflow,
1102 "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");
1104 // We used to copy it fast, but this doesn't check for surrogates
1105 // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount);
1107 return (int)(bytes - byteStart);
// Computes how many chars decoding `count` bytes starting at `bytes` would produce.
//   bytes       - start of the input; asserted to be non-null.
//   count       - number of input bytes; asserted to be non-negative.
//   baseDecoder - optional decoder (cast to UnicodeEncoding.Decoder) whose lastByte /
//                 lastChar seed this call; a null decoder is treated like a flush.
// The running total starts at count >> 1 and is then adjusted by the char counts the
// decoder fallback reports for unpaired surrogates and for a dangling odd byte.
// NOTE(review): lastByte is read and written below but its declaration is not visible
// here; the fast-path test treats -1 as "no pending byte" - confirm the initial value.
1110 internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1112 Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
1113 Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");
1115 UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;
1117 byte* byteEnd = bytes + count;
1118 byte* byteStart = bytes;
// Pending high surrogate; 0 means none.
1122 char lastChar = (char)0;
1124 // Start by assuming same # of chars as bytes
1125 int charCount = count >> 1;
1127 // Need -1 to check 2 at a time. If we have an even #, longBytes will go
1128 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes
1129 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1130 ulong* longEnd = (ulong*)(byteEnd - 7);
1132 // For fallback we may need a fallback buffer
1133 DecoderFallbackBuffer fallbackBuffer = null;
// Pick up whatever the decoder carried over from the previous call.
1135 if (decoder != null)
1137 lastByte = decoder.lastByte;
1138 lastChar = decoder.lastChar;
1140 // Assume extra char if last char was around
1144 // Assume extra char if extra last byte makes up odd # of input bytes
1145 if (lastByte >= 0 && (count & 1) == 1)
1150 // Shouldn't have anything in fallback buffer for GetCharCount
1151 // (don't have to check m_throwOnOverflow for count)
1152 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1153 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
1156 while (bytes < byteEnd)
1158 // If we're aligned then maybe we can do it fast
1159 // This'll hurt if we're unaligned because we'll always test but never be aligned
1160 #if !NO_FAST_UNICODE_LOOP
1166 #if BIT64 // win64 has to be long aligned
1167 (unchecked((long)bytes) & 7) == 0 &&
1169 (unchecked((int)bytes) & 3) == 0 &&
// The fast path is only attempted when no odd byte and no high surrogate are pending.
1171 lastByte == -1 && lastChar == 0)
1173 // Need new char* so we can check 4 at a time
1174 ulong* longBytes = (ulong*)bytes;
1176 while (longBytes < longEnd)
1178 // See if we potentially have surrogates (0x8000 bit set)
1179 // (We're either big endian on a big endian machine or little endian on
1180 // a little endian machine so this'll work)
1181 if ((0x8000800080008000 & *longBytes) != 0)
1183 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1184 // 5 bits looks like 11011, then its a high or low surrogate.
1185 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1186 // Note that we expect BMP characters to be more common than surrogates
1187 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1188 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1190 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1191 // but no clue if they're high or low.
1192 // If each of the 4 characters are non-zero, then none are surrogates.
1193 if ((uTemp & 0xFFFF000000000000) == 0 ||
1194 (uTemp & 0x0000FFFF00000000) == 0 ||
1195 (uTemp & 0x00000000FFFF0000) == 0 ||
1196 (uTemp & 0x000000000000FFFF) == 0)
1198 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1199 // or if there's 1 or 4 surrogates
1201 // If they happen to be high/low/high/low, we may as well continue. Check the next
1202 // bit to see if its set (low) or not (high) in the right pattern
// The two masks below test the same alternating pattern with the 16-bit lanes in
// opposite order.
1204 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
1206 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
1209 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1210 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1212 // Drop out to the slow loop to resolve the surrogates
1215 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1217 // else none are surrogates, so we can use them.
1219 // else all < 0x8000 so we can use them
1221 // We can use these 4 chars.
// Resume byte-wise processing from wherever the 8-byte scan stopped.
1225 bytes = (byte*)longBytes;
1227 if (bytes >= byteEnd)
1230 #endif // !NO_FAST_UNICODE_LOOP
// Slow path: take one byte; a char can only be formed once a second byte is available.
1235 lastByte = *bytes++;
1236 if (bytes >= byteEnd) break;
// First byte becomes the high-order 8 bits of the char.
1243 ch = (char)(lastByte << 8 | *(bytes++));
// First byte becomes the low-order 8 bits of the char.
1247 ch = (char)(*(bytes++) << 8 | lastByte);
1251 // See if the char's valid
1252 if (ch >= 0xd800 && ch <= 0xdfff)
1254 // Was it a high surrogate?
1257 // Its a high surrogate, if we had one then do fallback for previous one
1260 // Ignore previous bad high surrogate
1263 // Get fallback for previous high surrogate
1264 // Note we have to reconstruct bytes because some may have been in decoder
1265 byte[] byteBuffer = null;
1268 byteBuffer = new byte[]
1269 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1273 byteBuffer = new byte[]
1274 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
// The fallback buffer is created lazily, from the decoder when one was supplied.
1277 if (fallbackBuffer == null)
1279 if (decoder == null)
1280 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1282 fallbackBuffer = decoder.FallbackBuffer;
1284 // Set our internal fallback interesting things.
1285 fallbackBuffer.InternalInitialize(byteStart, null);
// Add however many chars the fallback reports for the invalid bytes.
1289 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1292 // Ignore the last one which fell back already,
1293 // and remember the new high surrogate
1298 // Its a low surrogate
1301 // Expected a previous high surrogate
1304 // Get fallback for this low surrogate
1305 // Note we have to reconstruct bytes because some may have been in decoder
1306 byte[] byteBuffer = null;
1309 byteBuffer = new byte[]
1310 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1314 byteBuffer = new byte[]
1315 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1318 if (fallbackBuffer == null)
1320 if (decoder == null)
1321 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1323 fallbackBuffer = decoder.FallbackBuffer;
1325 // Set our internal fallback interesting things.
1326 fallbackBuffer.InternalInitialize(byteStart, null);
1329 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1331 // Ignore this one (we already did its fallback)
1335 // Valid surrogate pair, already counted.
1338 else if (lastChar > 0)
1340 // Had a high surrogate, expected a low surrogate
1341 // Uncount the last high surrogate
1344 // fall back the high surrogate.
1345 byte[] byteBuffer = null;
1348 byteBuffer = new byte[]
1349 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1353 byteBuffer = new byte[]
1354 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1357 if (fallbackBuffer == null)
1359 if (decoder == null)
1360 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1362 fallbackBuffer = decoder.FallbackBuffer;
1364 // Set our internal fallback interesting things.
1365 fallbackBuffer.InternalInitialize(byteStart, null);
1368 // Already subtracted high surrogate
1369 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1371 // Not left over now, clear previous high surrogate and continue to add current char
1375 // Valid char, already counted
// End of input: with no decoder, or when the decoder must flush, leftover state is
// handed to the fallback instead of being kept for a later call.
1378 // Extra space if we can't use decoder
1379 if (decoder == null || decoder.MustFlush)
1383 // No hanging high surrogates allowed, do fallback and remove count for it
1385 byte[] byteBuffer = null;
1388 byteBuffer = new byte[]
1389 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1393 byteBuffer = new byte[]
1394 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1397 if (fallbackBuffer == null)
1399 if (decoder == null)
1400 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1402 fallbackBuffer = decoder.FallbackBuffer;
1404 // Set our internal fallback interesting things.
1405 fallbackBuffer.InternalInitialize(byteStart, null);
1408 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1415 if (fallbackBuffer == null)
1417 if (decoder == null)
1418 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1420 fallbackBuffer = decoder.FallbackBuffer;
1422 // Set our internal fallback interesting things.
1423 fallbackBuffer.InternalInitialize(byteStart, null);
1426 // No hanging odd bytes allowed if must flush
1427 charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
1432 // If we had a high surrogate left over, we can't count it
1436 // Shouldn't have anything in fallback buffer for GetCharCount
1437 // (don't have to check m_throwOnOverflow for count)
1438 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1439 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");
// Decodes up to byteCount bytes from `bytes` into the buffer at `chars`, which has room
// for charCount chars, and returns the number of chars written (chars - charStart).
//   bytes / chars - input and output buffers; both asserted to be non-null.
//   byteCount / charCount - buffer sizes; both asserted to be non-negative.
//   baseDecoder   - optional decoder (cast to UnicodeEncoding.Decoder); when non-null its
//                   lastByte / lastChar seed this call and are stored back, together
//                   with m_bytesUsed, before returning.
// Whenever output cannot take the next char(s), the input pointer is rewound and
// ThrowCharsOverflow is called with "chars == charStart"; per the inline notes it throws
// only when nothing has been output yet, otherwise the loop simply stops.
// NOTE(review): lastByte is used below but its declaration is not visible here; the
// fast-path test treats -1 as "no pending byte" - confirm the initial value.
1444 internal override unsafe int GetChars(byte* bytes, int byteCount,
1445 char* chars, int charCount, DecoderNLS baseDecoder)
1447 Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
1448 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
1449 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
1450 Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");
1452 UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;
// Pending high surrogate; 0 means none.
1456 char lastChar = (char)0;
1458 // Get our decoder (but don't clear it yet)
1459 if (decoder != null)
1461 lastByte = decoder.lastByte;
1462 lastChar = decoder.lastChar;
1464 // Shouldn't have anything in fallback buffer for GetChars
1465 // (don't have to check m_throwOnOverflow for chars)
1466 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1467 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
1470 // For fallback we may need a fallback buffer
1471 DecoderFallbackBuffer fallbackBuffer = null;
1472 char* charsForFallback;
// Remember both ends of each buffer: the ends bound the loops, the starts are used for
// the final counts and for the "nothing consumed / nothing output" checks.
1474 byte* byteEnd = bytes + byteCount;
1475 char* charEnd = chars + charCount;
1476 byte* byteStart = bytes;
1477 char* charStart = chars;
1479 while (bytes < byteEnd)
1481 // If we're aligned then maybe we can do it fast
1482 // This'll hurt if we're unaligned because we'll always test but never be aligned
1483 #if !NO_FAST_UNICODE_LOOP
1489 #if BIT64 // win64 has to be long aligned
1490 (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
1492 (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
// The fast path is only attempted when no odd byte and no high surrogate are pending.
1494 lastByte == -1 && lastChar == 0)
1496 // Need -1 to check 2 at a time. If we have an even #, longChars will go
1497 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
1498 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1499 // We can only go iCount units (limited by shorter of char or byte buffers.
1500 ulong* longEnd = (ulong*)(bytes - 7 +
1501 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
1502 (byteEnd - bytes) : (charEnd - chars) << 1));
1504 // Need new char* so we can check 4 at a time
1505 ulong* longBytes = (ulong*)bytes;
1506 ulong* longChars = (ulong*)chars;
1508 while (longBytes < longEnd)
1510 // See if we potentially have surrogates (0x8000 bit set)
1511 // (We're either big endian on a big endian machine or little endian on
1512 // a little endian machine so this'll work)
1513 if ((0x8000800080008000 & *longBytes) != 0)
1515 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1516 // 5 bits looks like 11011, then its a high or low surrogate.
1517 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1518 // Note that we expect BMP characters to be more common than surrogates
1519 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1520 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1522 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1523 // but no clue if they're high or low.
1524 // If each of the 4 characters are non-zero, then none are surrogates.
1525 if ((uTemp & 0xFFFF000000000000) == 0 ||
1526 (uTemp & 0x0000FFFF00000000) == 0 ||
1527 (uTemp & 0x00000000FFFF0000) == 0 ||
1528 (uTemp & 0x000000000000FFFF) == 0)
1530 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1531 // or if there's 1 or 4 surrogates
1533 // If they happen to be high/low/high/low, we may as well continue. Check the next
1534 // bit to see if its set (low) or not (high) in the right pattern
// The two masks below test the same alternating pattern with the 16-bit lanes in
// opposite order.
1536 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
1538 if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
1541 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1542 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1544 // Drop out to the slow loop to resolve the surrogates
1547 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1549 // else none are surrogates, so we can use them.
1551 // else all < 0x8000 so we can use them
1553 // We can use these 4 chars.
1554 *longChars = *longBytes;
// Publish how far the 8-byte copy loop got before continuing byte-wise.
1559 chars = (char*)longChars;
1560 bytes = (byte*)longBytes;
1562 if (bytes >= byteEnd)
1565 #endif // !NO_FAST_UNICODE_LOOP
// Slow path: take one byte; a char is formed from it plus the byte that follows.
1570 lastByte = *bytes++;
// First byte becomes the high-order 8 bits of the char.
1578 ch = (char)(lastByte << 8 | *(bytes++));
// First byte becomes the low-order 8 bits of the char.
1582 ch = (char)(*(bytes++) << 8 | lastByte);
1586 // See if the char's valid
1587 if (ch >= 0xd800 && ch <= 0xdfff)
1589 // Was it a high surrogate?
1592 // Its a high surrogate, if we had one then do fallback for previous one
1595 // Get fallback for previous high surrogate
1596 // Note we have to reconstruct bytes because some may have been in decoder
1597 byte[] byteBuffer = null;
1600 byteBuffer = new byte[]
1601 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1605 byteBuffer = new byte[]
1606 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
// The fallback buffer is created lazily, from the decoder when one was supplied.
1609 if (fallbackBuffer == null)
1611 if (decoder == null)
1612 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1614 fallbackBuffer = decoder.FallbackBuffer;
1616 // Set our internal fallback interesting things.
1617 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1620 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1621 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1622 chars = charsForFallback;
1624 if (!fallbackResult)
1626 // couldn't fall back lonely surrogate
1627 // We either advanced bytes or chars should == charStart and throw below
1628 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1629 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
1630 bytes -= 2; // didn't use these 2 bytes
1631 fallbackBuffer.InternalReset();
1632 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1633 break; // couldn't fallback but didn't throw
1637 // Ignore the previous high surrogate which fell back already,
1638 // yet remember the current high surrogate for next time.
1643 // Its a low surrogate
1646 // Expected a previous high surrogate
1647 // Get fallback for this low surrogate
1648 // Note we have to reconstruct bytes because some may have been in decoder
1649 byte[] byteBuffer = null;
1652 byteBuffer = new byte[]
1653 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1657 byteBuffer = new byte[]
1658 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1661 if (fallbackBuffer == null)
1663 if (decoder == null)
1664 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1666 fallbackBuffer = decoder.FallbackBuffer;
1668 // Set our internal fallback interesting things.
1669 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1672 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1673 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1674 chars = charsForFallback;
1676 if (!fallbackResult)
1678 // couldn't fall back lonely surrogate
1679 // We either advanced bytes or chars should == charStart and throw below
1680 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1681 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
1682 bytes -= 2; // didn't use these 2 bytes
1683 fallbackBuffer.InternalReset();
1684 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1685 break; // couldn't fallback but didn't throw
1688 // Didn't throw, ignore this one (we already did its fallback)
1692 // Valid surrogate pair, add our lastChar (will need 2 chars)
1693 if (chars >= charEnd - 1)
1695 // couldn't find room for this surrogate pair
1696 // We either advanced bytes or chars should == charStart and throw below
1697 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1698 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
1699 bytes -= 2; // didn't use these 2 bytes
1700 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1701 // Leave lastChar for next call to Convert()
1702 break; // couldn't fallback but didn't throw
// Emit the remembered high surrogate first.
1705 *chars++ = lastChar;
1708 else if (lastChar > 0)
1710 // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
1711 byte[] byteBuffer = null;
1714 byteBuffer = new byte[]
1715 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1719 byteBuffer = new byte[]
1720 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1723 if (fallbackBuffer == null)
1725 if (decoder == null)
1726 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1728 fallbackBuffer = decoder.FallbackBuffer;
1730 // Set our internal fallback interesting things.
1731 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1734 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1735 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1736 chars = charsForFallback;
1738 if (!fallbackResult)
1740 // couldn't fall back high surrogate, or char that would be next
1741 // We either advanced bytes or chars should == charStart and throw below
1742 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1743 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
1744 bytes -= 2; // didn't use these 2 bytes
1745 fallbackBuffer.InternalReset();
1746 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1747 break; // couldn't fallback but didn't throw
1750 // Not left over now, clear previous high surrogate and continue to add current char
1754 // Valid char, room for it?
1755 if (chars >= charEnd)
1757 // 2 bytes couldn't fall back
1758 // We either advanced bytes or chars should == charStart and throw below
1759 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1760 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
1761 bytes -= 2; // didn't use these bytes
1762 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1763 break; // couldn't fallback but didn't throw
// End of input: with no decoder, or when the decoder must flush, leftover state is
// handed to the fallback instead of being kept for a later call.
1770 // Remember our decoder if we must
1771 if (decoder == null || decoder.MustFlush)
1775 // No hanging high surrogates allowed, do fallback and remove count for it
1776 byte[] byteBuffer = null;
1779 byteBuffer = new byte[]
1780 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1784 byteBuffer = new byte[]
1785 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1788 if (fallbackBuffer == null)
1790 if (decoder == null)
1791 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1793 fallbackBuffer = decoder.FallbackBuffer;
1795 // Set our internal fallback interesting things.
1796 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1799 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1800 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1801 chars = charsForFallback;
1803 if (!fallbackResult)
1805 // 2 bytes couldn't fall back
1806 // We either advanced bytes or chars should == charStart and throw below
1807 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1808 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
1809 bytes -= 2; // didn't use these bytes
1811 bytes--; // had an extra last byte hanging around
1812 fallbackBuffer.InternalReset();
1813 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1814 // We'll remember these in our decoder though
1821 // done with this one
1827 if (fallbackBuffer == null)
1829 if (decoder == null)
1830 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1832 fallbackBuffer = decoder.FallbackBuffer;
1834 // Set our internal fallback interesting things.
1835 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1838 // No hanging odd bytes allowed if must flush
1839 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
1840 bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
1841 chars = charsForFallback;
1843 if (!fallbackResult)
1845 // odd byte couldn't fall back
1846 bytes--; // didn't use this byte
1847 fallbackBuffer.InternalReset();
1848 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1849 // didn't throw, but we'll remember it in the decoder
1854 // Didn't fail, clear buffer
1861 // Remember our decoder if we must
1862 if (decoder != null)
1864 Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)),
1865 "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
1866 // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
// Store consumed-byte count and leftover state back on the decoder.
1869 decoder.m_bytesUsed = (int)(bytes - byteStart);
1870 decoder.lastChar = lastChar;
1871 decoder.lastByte = lastByte;
1874 // Used to do this the old way
1875 // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount);
1877 // Shouldn't have anything in fallback buffer for GetChars
1878 // (don't have to check m_throwOnOverflow for count or chars)
1879 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1880 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
// Result is the number of chars written to the output buffer.
1882 return (int)(chars - charStart);
/// <summary>
/// Creates an encoder bound to this encoding instance.
/// </summary>
/// <returns>A new <see cref="EncoderNLS"/> constructed over this encoding.</returns>
public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);
/// <summary>
/// Creates a decoder bound to this encoding instance.
/// </summary>
/// <returns>A new <see cref="UnicodeEncoding.Decoder"/> constructed over this encoding.</returns>
public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);
/// <summary>
/// Returns the byte order mark for this encoding, or an empty array when this
/// instance was configured not to emit one.
/// </summary>
/// <returns>
/// <c>FE FF</c> for big-endian, <c>FF FE</c> for little-endian, or an empty array
/// when <c>byteOrderMark</c> is false.
/// </returns>
public override byte[] GetPreamble()
{
    if (!byteOrderMark)
    {
        return Array.Empty<byte>();
    }

    // Note - we must allocate a new byte[] on every call to prevent someone
    // from modifying a cached byte[].
    if (bigEndian)
    {
        return new byte[2] { 0xfe, 0xff };
    }

    return new byte[2] { 0xff, 0xfe };
}
/// <summary>
/// Returns an upper bound on the number of bytes needed to encode
/// <paramref name="charCount"/> chars.
/// </summary>
/// <param name="charCount">Number of chars to be encoded; must not be negative.</param>
/// <returns>The worst-case byte count, including encoder fallback expansion.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="charCount"/> is negative, or the result does not fit in an <see cref="int"/>.
/// </exception>
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
        throw new ArgumentOutOfRangeException(nameof(charCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback.
    // Computed as long so the arithmetic below cannot wrap before the range check.
    long byteCount = (long)charCount + 1;

    if (EncoderFallback.MaxCharCount > 1)
        byteCount *= EncoderFallback.MaxCharCount;

    // Every char occupies CharSize (2) bytes.
    byteCount *= CharSize;

    if (byteCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);

    return (int)byteCount;
}
/// <summary>
/// Returns an upper bound on the number of chars produced by decoding
/// <paramref name="byteCount"/> bytes.
/// </summary>
/// <param name="byteCount">Number of bytes to be decoded; must not be negative.</param>
/// <returns>The worst-case char count, including decoder fallback expansion.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="byteCount"/> is negative, or the result does not fit in an <see cref="int"/>.
/// </exception>
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // long because byteCount could be biggest int.
    // 1 char per 2 bytes. Round up in case 1 left over in decoder.
    // Round up using &1 in case byteCount is max size.
    // Might also need an extra 1 if there's a left over high surrogate in the decoder.
    long charCount = (long)(byteCount >> 1) + (byteCount & 1) + 1;

    // Don't forget fallback (in case they have a bunch of lonely surrogates or something bizarre like that)
    if (DecoderFallback.MaxCharCount > 1)
        charCount *= DecoderFallback.MaxCharCount;

    if (charCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);

    return (int)charCount;
}
/// <summary>
/// Determines whether <paramref name="value"/> is a <see cref="UnicodeEncoding"/>
/// configured the same way as this instance.
/// </summary>
/// <param name="value">The object to compare with; may be null or of another type.</param>
/// <returns>
/// True when <paramref name="value"/> is a <see cref="UnicodeEncoding"/> with the same code
/// page, byte order mark setting, endianness and equal encoder/decoder fallbacks;
/// otherwise false.
/// </returns>
public override bool Equals(Object value)
{
    UnicodeEncoding that = value as UnicodeEncoding;
    if (that == null)
    {
        // Null or not a UnicodeEncoding: never equal.
        return false;
    }

    // Big Endian Unicode has different code page (1201) than small Endian one (1200),
    // so we still have to check the code page here.
    // isThrowException is deliberately not compared: it is the same as the
    // Encoder/Decoder fallbacks being exception fallbacks, which are compared below.
    return (CodePage == that.CodePage) &&
        byteOrderMark == that.byteOrderMark &&
        bigEndian == that.bigEndian &&
        (EncoderFallback.Equals(that.EncoderFallback)) &&
        (DecoderFallback.Equals(that.DecoderFallback));
}
/// <summary>
/// Returns a hash code combining the code page, both fallbacks, the byte order mark
/// setting and the endianness.
/// </summary>
/// <returns>The sum of the component hashes plus 4 for a byte order mark and 8 for big-endian.</returns>
public override int GetHashCode()
{
    int fallbackAndPageHash = CodePage + this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode();
    int byteOrderMarkBit = byteOrderMark ? 4 : 0;
    int bigEndianBit = bigEndian ? 8 : 0;

    // Summed in the same left-to-right order as the individual terms above.
    return fallbackAndPageHash + byteOrderMarkBit + bigEndianBit;
}
1985 private sealed class Decoder : System.Text.DecoderNLS, ISerializable
// Odd byte carried over between calls; -1 means no byte is pending.
1987 internal int lastByte = -1;
// High surrogate carried over between calls; '\0' means none is pending.
1988 internal char lastChar = '\0';
/// <summary>
/// Creates a decoder for the given encoding; the encoding is passed on to the base class.
/// </summary>
/// <param name="encoding">The <see cref="UnicodeEncoding"/> this decoder belongs to.</param>
public Decoder(UnicodeEncoding encoding)
    : base(encoding)
{
    // Nothing further to set up: lastByte and lastChar get their defaults from the field initializers.
}
/// <summary>
/// Serialization constructor. Always throws: this decoder cannot be deserialized.
/// </summary>
/// <param name="info">Unused.</param>
/// <param name="context">Unused.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown.</exception>
internal Decoder(SerializationInfo info, StreamingContext context)
{
    throw new PlatformNotSupportedException();
}
2000 // ISerializable implementation
/// <summary>
/// Explicit <see cref="ISerializable"/> member. Always throws: this decoder cannot be serialized.
/// </summary>
/// <param name="info">Unused.</param>
/// <param name="context">Unused.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown.</exception>
void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
{
    throw new PlatformNotSupportedException();
}
/// <summary>
/// Returns the decoder to its initial state: drops any pending odd byte and pending
/// high surrogate, and resets the fallback buffer if one has been created.
/// </summary>
public override void Reset()
{
    // Clear carried-over input so HasState reports false after a reset.
    lastByte = -1;
    lastChar = '\0';

    // The fallback buffer may not have been created yet; only reset it if it exists.
    m_fallbackBuffer?.Reset();
}
2014 // Anything left in our decoder?
/// <summary>
/// True when a pending odd byte (lastByte other than -1) or a pending char
/// (lastChar other than '\0') is being held.
/// </summary>
internal override bool HasState => this.lastByte != -1 || this.lastChar != '\0';