From 9673552bb036470ebb57aa647d0995686cbd94c8 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Wed, 5 Jul 2017 17:03:41 -0700 Subject: [PATCH] Merge pull request dotnet/corert#4080 from dotnet/nmirror Merge nmirror to master Signed-off-by: dotnet-bot --- .../shared/System.Private.CoreLib.Shared.projitems | 3 + src/mscorlib/shared/System/Text/Encoding.cs | 1925 ++++++++++++++++++++ src/mscorlib/shared/System/Text/Latin1Encoding.cs | 894 +++++++++ src/mscorlib/shared/System/Text/UTF7Encoding.cs | 1041 +++++++++++ 4 files changed, 3863 insertions(+) create mode 100644 src/mscorlib/shared/System/Text/Encoding.cs create mode 100644 src/mscorlib/shared/System/Text/Latin1Encoding.cs create mode 100644 src/mscorlib/shared/System/Text/UTF7Encoding.cs diff --git a/src/mscorlib/shared/System.Private.CoreLib.Shared.projitems b/src/mscorlib/shared/System.Private.CoreLib.Shared.projitems index 566854b..4610540 100644 --- a/src/mscorlib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/mscorlib/shared/System.Private.CoreLib.Shared.projitems @@ -366,13 +366,16 @@ + + + diff --git a/src/mscorlib/shared/System/Text/Encoding.cs b/src/mscorlib/shared/System/Text/Encoding.cs new file mode 100644 index 0000000..9e89514 --- /dev/null +++ b/src/mscorlib/shared/System/Text/Encoding.cs @@ -0,0 +1,1925 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Diagnostics.Contracts; +using System.Threading; +using System.Runtime.Serialization; +using System.Diagnostics.CodeAnalysis; + +namespace System.Text +{ + // This abstract base class represents a character encoding. The class provides + // methods to convert arrays and strings of Unicode characters to and from + // arrays of bytes. 
A number of Encoding implementations are provided in + // the System.Text package, including: + // + // ASCIIEncoding, which encodes Unicode characters as single 7-bit + // ASCII characters. This encoding only supports character values between 0x00 + // and 0x7F. + // BaseCodePageEncoding, which encapsulates a Windows code page. Any + // installed code page can be accessed through this encoding, and conversions + // are performed using the WideCharToMultiByte and + // MultiByteToWideChar Windows API functions. + // UnicodeEncoding, which encodes each Unicode character as two + // consecutive bytes. Both little-endian (code page 1200) and big-endian (code + // page 1201) encodings are recognized. + // UTF7Encoding, which encodes Unicode characters using the UTF-7 + // encoding (UTF-7 stands for UCS Transformation Format, 7-bit form). This + // encoding supports all Unicode character values, and can also be accessed + // as code page 65000. + // UTF8Encoding, which encodes Unicode characters using the UTF-8 + // encoding (UTF-8 stands for UCS Transformation Format, 8-bit form). This + // encoding supports all Unicode character values, and can also be accessed + // as code page 65001. + // UTF32Encoding, both 12000 (little endian) & 12001 (big endian) + // + // In addition to directly instantiating Encoding objects, an + // application can use the ForCodePage, GetASCII, + // GetDefault, GetUnicode, GetUTF7, and GetUTF8 + // methods in this class to obtain encodings. + // + // Through an encoding, the GetBytes method is used to convert arrays + // of characters to arrays of bytes, and the GetChars method is used to + // convert arrays of bytes to arrays of characters. The GetBytes and + // GetChars methods maintain no state between conversions, and are + // generally intended for conversions of complete blocks of bytes and + // characters in one operation. 
When the data to be converted is only available + // in sequential blocks (such as data read from a stream) or when the amount of + // data is so large that it needs to be divided into smaller blocks, an + // application may choose to use a Decoder or an Encoder to + // perform the conversion. Decoders and encoders allow sequential blocks of + // data to be converted and they maintain the state required to support + // conversions of data that spans adjacent blocks. Decoders and encoders are + // obtained using the GetDecoder and GetEncoder methods. + // + // The core GetBytes and GetChars methods require the caller + // to provide the destination buffer and ensure that the buffer is large enough + // to hold the entire result of the conversion. When using these methods, + // either directly on an Encoding object or on an associated + // Decoder or Encoder, an application can use one of two methods + // to allocate destination buffers. + // + // The GetByteCount and GetCharCount methods can be used to + // compute the exact size of the result of a particular conversion, and an + // appropriately sized buffer for that conversion can then be allocated. + // The GetMaxByteCount and GetMaxCharCount methods can be + // be used to compute the maximum possible size of a conversion of a given + // number of bytes or characters, and a buffer of that size can then be reused + // for multiple conversions. + // + // The first method generally uses less memory, whereas the second method + // generally executes faster. + // + + public abstract class Encoding : ICloneable + { + // For netcore we use UTF8 as default encoding since ANSI isn't available + private static readonly UTF8Encoding.UTF8EncodingSealed s_defaultEncoding = new UTF8Encoding.UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: false); + + // Returns an encoding for the system's current ANSI code page. + public static Encoding Default => s_defaultEncoding; + + // + // The following values are from mlang.idl. 
// These values should be in sync with those in mlang.idl.
//
// Flag bits tested against dataItem.Flags by the IsBrowserDisplay, IsBrowserSave,
// IsMailNewsDisplay and IsMailNewsSave properties.
internal const int MIMECONTF_MAILNEWS = 0x00000001;
internal const int MIMECONTF_BROWSER = 0x00000002;
internal const int MIMECONTF_SAVABLE_MAILNEWS = 0x00000100;
internal const int MIMECONTF_SAVABLE_BROWSER = 0x00000200;

// Special Case Code Pages
// (GetEncoding(int) maps CodePageDefault to Encoding.Default and rejects the four "No*" values.)
private const int CodePageDefault = 0;
private const int CodePageNoOEM = 1; // OEM Code page not supported
private const int CodePageNoMac = 2; // MAC code page not supported
private const int CodePageNoThread = 3; // Thread code page not supported
private const int CodePageNoSymbol = 42; // Symbol code page not supported
private const int CodePageUnicode = 1200; // Unicode
private const int CodePageBigEndian = 1201; // Big Endian Unicode
private const int CodePageWindows1252 = 1252; // Windows 1252 code page

// 20936 has same code page as 10008, so we'll special case it
private const int CodePageMacGB2312 = 10008;
private const int CodePageGB2312 = 20936;
private const int CodePageMacKorean = 10003;
private const int CodePageDLLKorean = 20949;

// ISO 2022 Code Pages
private const int ISO2022JP = 50220;
private const int ISO2022JPESC = 50221;
private const int ISO2022JPSISO = 50222;
private const int ISOKorean = 50225;
private const int ISOSimplifiedCN = 50227;
private const int EUCJP = 51932;
private const int ChineseHZ = 52936; // HZ has ~}~{~~ sequences

// 51936 is the same as 936
private const int DuplicateEUCCN = 51936;
private const int EUCCN = 936;

private const int EUCKR = 51949;

// Latin 1 & ASCII Code Pages
internal const int CodePageASCII = 20127; // ASCII
internal const int ISO_8859_1 = 28591; // Latin1

// ISCII
private const int ISCIIAssemese = 57006;
private const int ISCIIBengali = 57003;
private const int ISCIIDevanagari = 57002;
private const int ISCIIGujarathi = 57010;
private const int ISCIIKannada = 57008;
private const int ISCIIMalayalam = 57009;
private const int ISCIIOriya = 57007;
private const int ISCIIPanjabi = 57011;
private const int ISCIITamil = 57004;
private const int ISCIITelugu = 57005;

// GB18030
private const int GB18030 = 54936;

// Other
private const int ISO_8859_8I = 38598;
private const int ISO_8859_8_Visual = 28598;

// 50229 is currently unsupported // "Chinese Traditional (ISO-2022)"
private const int ENC50229 = 50229;

// Special code pages
private const int CodePageUTF7 = 65000;
private const int CodePageUTF8 = 65001;
private const int CodePageUTF32 = 12000;
private const int CodePageUTF32BE = 12001;

// Code page number of this encoding; assigned by the constructors and by DeserializeEncoding.
internal int m_codePage = 0;

// dataItem should be internal (not private); otherwise it will break during the deserialization
// of data that came from Everett.
// It is loaded on demand by GetDataItem() and reset to null around (de)serialization.
internal CodePageDataItem dataItem = null;

// Set to true when deserialization finds no fallback objects in the stream,
// i.e. the data does not carry the V2.0 (Whidbey) fields.
[NonSerialized]
internal bool m_deserializedFromEverett = false;

// Because of encoders we may be read only
// (Clone() hands out a copy with this flag cleared; the fallback setters refuse to run while it is set.)
[OptionalField(VersionAdded = 2)]
private bool m_isReadOnly = true;

// Encoder and decoder fallbacks, exposed through the EncoderFallback / DecoderFallback properties.
[OptionalField(VersionAdded = 2)]
internal EncoderFallback encoderFallback = null;
[OptionalField(VersionAdded = 2)]
internal DecoderFallback decoderFallback = null;

// Creates an encoding for code page 0 by delegating to Encoding(int).
protected Encoding() : this(0)
{
}


// Creates an encoding for the given code page and installs the default fallbacks.
// Throws ArgumentOutOfRangeException when codePage is negative.
protected Encoding(int codePage)
{
    // Validate code page
    if (codePage < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(codePage));
    }
    Contract.EndContractBlock();

    // Remember code page
    m_codePage = codePage;

    // Use default encoder/decoder fallbacks
    // (SetDefaultFallbacks is virtual, so a derived type's override runs here.)
    this.SetDefaultFallbacks();
}

// This constructor is needed to allow any sub-classing implementation to provide encoder/decoder fallback objects,
// because the encoding object is always created as a read-only object and does not allow setting the
// encoder/decoder fallback after the creation is done.
protected Encoding(int codePage, EncoderFallback encoderFallback, DecoderFallback decoderFallback)
{
    // Negative code pages are rejected up front.
    if (codePage < 0)
        throw new ArgumentOutOfRangeException(nameof(codePage));
    Contract.EndContractBlock();

    m_codePage = codePage;

    // A null argument selects the internal best-fit fallback for that direction.
    this.encoderFallback = encoderFallback ?? new InternalEncoderBestFitFallback(this);
    this.decoderFallback = decoderFallback ?? new InternalDecoderBestFitFallback(this);
}

// Installs the fallbacks that are used when the caller supplies none.
// For UTF-X encodings, we use a replacement fallback with an "\xFFFD" string,
// for ASCII we use a "?" replacement fallback, etc.
internal virtual void SetDefaultFallbacks()
{
    encoderFallback = new InternalEncoderBestFitFallback(this);
    decoderFallback = new InternalDecoderBestFitFallback(this);
}


#region Serialization
// Puts the optional V2.0 (Whidbey) fields into a known state before deserialization runs.
internal void OnDeserializing()
{
    m_isReadOnly = true;
    encoderFallback = null;
    decoderFallback = null;
}

// Completes deserialization: when either fallback is still missing, the data is
// treated as Everett data and the default fallbacks are installed.
internal void OnDeserialized()
{
    bool fallbacksRestored = encoderFallback != null && decoderFallback != null;
    if (!fallbacksRestored)
    {
        m_deserializedFromEverett = true;
        SetDefaultFallbacks();
    }

    // dataItem is always recalculated from the code page #
    dataItem = null;
}

[OnDeserializing]
private void OnDeserializing(StreamingContext ctx) => OnDeserializing();


[OnDeserialized]
private void OnDeserialized(StreamingContext ctx) => OnDeserialized();

[OnSerializing]
private void OnSerializing(StreamingContext ctx)
{
    // Dropped so the result is consistent with SerializeEncoding, which writes null for dataItem.
    dataItem = null;
}

// The following two methods are used by inherited classes that implement ISerializable.

// Deserialization helper: restores the code page and, when present, the V2.0 fields.
internal void DeserializeEncoding(SerializationInfo info, StreamingContext context)
{
    if (info == null)
    {
        throw new ArgumentNullException(nameof(info));
    }
    Contract.EndContractBlock();

    // Every version stores the code page.
    m_codePage = (int)info.GetValue("m_codePage", typeof(int));

    // dataItem can be rebuilt on demand and its index differs between versions,
    // so whatever Everett stored for it is ignored.
    dataItem = null;

    try
    {
        // Whidbey V2.0 fields
        m_isReadOnly = (bool)info.GetValue("m_isReadOnly", typeof(bool));
        encoderFallback = (EncoderFallback)info.GetValue("encoderFallback", typeof(EncoderFallback));
        decoderFallback = (DecoderFallback)info.GetValue("decoderFallback", typeof(DecoderFallback));
    }
    catch (SerializationException)
    {
        // No Whidbey fields in the stream, so this must be Everett data.
        m_deserializedFromEverett = true;

        // May as well be read only
        m_isReadOnly = true;
        SetDefaultFallbacks();
    }
}

// Serialization helper: writes the fields under both their V2.0 and Everett V1.1 names.
internal void SerializeEncoding(SerializationInfo info, StreamingContext context)
{
    if (info == null)
    {
        throw new ArgumentNullException(nameof(info));
    }
    Contract.EndContractBlock();

    // New in V2.0 (Whidbey)
    info.AddValue("m_isReadOnly", m_isReadOnly);
    info.AddValue("encoderFallback", EncoderFallback);
    info.AddValue("decoderFallback", DecoderFallback);

    // Present in Everett V1.1 as well
    info.AddValue("m_codePage", m_codePage);

    // Unique to Everett V1.1
    info.AddValue("dataItem", null);

    // Everett duplicated these fields, so these are needed for portability
    info.AddValue("Encoding+m_codePage", m_codePage);
    info.AddValue("Encoding+dataItem", null);
}

#endregion Serialization

// Converts a byte array from one encoding to another.
// The bytes in the bytes array are converted from srcEncoding to
// dstEncoding, and the returned value is a new byte array
// containing the result of the conversion.
//
// Throws ArgumentNullException when bytes is null; the encodings are
// validated by the range overload this method forwards to.
//
[Pure]
public static byte[] Convert(Encoding srcEncoding, Encoding dstEncoding,
    byte[] bytes)
{
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes));
    Contract.Ensures(Contract.Result<byte[]>() != null);

    return Convert(srcEncoding, dstEncoding, bytes, 0, bytes.Length);
}

// Converts a range of bytes in a byte array from one encoding to another.
// This method converts count bytes from bytes starting at
// index index from srcEncoding to dstEncoding, and
// returns a new byte array containing the result of the conversion.
//
// Throws ArgumentNullException when srcEncoding, dstEncoding or bytes is null.
// index and count are not checked here; they are passed straight to srcEncoding.GetChars.
//
[Pure]
public static byte[] Convert(Encoding srcEncoding, Encoding dstEncoding,
    byte[] bytes, int index, int count)
{
    if (srcEncoding == null || dstEncoding == null)
    {
        throw new ArgumentNullException((srcEncoding == null ? nameof(srcEncoding) : nameof(dstEncoding)),
            SR.ArgumentNull_Array);
    }
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes),
            SR.ArgumentNull_Array);
    }
    Contract.Ensures(Contract.Result<byte[]>() != null);

    // Decode with the source encoding, then re-encode with the destination encoding.
    return dstEncoding.GetBytes(srcEncoding.GetChars(bytes, index, count));
}

// Registers an encoding provider; registered providers are consulted first by every GetEncoding overload.
public static void RegisterProvider(EncodingProvider provider)
{
    // Parameters validated inside EncodingProvider
    EncodingProvider.AddProvider(provider);
}

// Returns the encoding for a code page number.
// Registered providers are asked first, before the range check below.
// Throws ArgumentOutOfRangeException outside 0..65535, ArgumentException for the
// special Win32 values 1, 2, 3 and 42, and NotSupportedException when EncodingTable
// has no data for the code page.
[Pure]
public static Encoding GetEncoding(int codepage)
{
    Encoding result = EncodingProvider.GetEncodingFromProvider(codepage);
    if (result != null)
        return result;

    //
    // NOTE: If you add a new encoding that can be retrieved by codepage, be sure to
    // add the corresponding item in EncodingTable.
    // Otherwise, the code below will throw an exception when EncodingTable.GetCodePageDataItem()
    // finds no entry for it.
    //
    if (codepage < 0 || codepage > 65535)
    {
        throw new ArgumentOutOfRangeException(
            nameof(codepage), SR.Format(SR.ArgumentOutOfRange_Range, 0, 65535));
    }

    Contract.EndContractBlock();

    switch (codepage)
    {
        case CodePageDefault: return Default; // 0
        case CodePageUnicode: return Unicode; // 1200
        case CodePageBigEndian: return BigEndianUnicode; // 1201
        case CodePageUTF32: return UTF32; // 12000
        case CodePageUTF32BE: return BigEndianUTF32; // 12001
        case CodePageUTF7: return UTF7; // 65000
        case CodePageUTF8: return UTF8; // 65001
        case CodePageASCII: return ASCII; // 20127
        case ISO_8859_1: return Latin1; // 28591

        // We don't allow the following special code page values that Win32 allows.
        case CodePageNoOEM: // 1 CP_OEMCP
        case CodePageNoMac: // 2 CP_MACCP
        case CodePageNoThread: // 3 CP_THREAD_ACP
        case CodePageNoSymbol: // 42 CP_SYMBOL
            throw new ArgumentException(SR.Format(SR.Argument_CodepageNotSupported, codepage), nameof(codepage));
    }

    // Is it a valid code page?
    if (EncodingTable.GetCodePageDataItem(codepage) == null)
    {
        throw new NotSupportedException(
            SR.Format(SR.NotSupported_NoCodepageData, codepage));
    }

    // NOTE(review): a code page that has table data but no case above ends up here
    // and is answered with UTF8 - confirm this is the intended result.
    return UTF8;
}

// Returns the encoding for a code page number with caller-supplied fallbacks.
// When no provider handles the request, the cached read-only encoding is cloned
// and the fallbacks are set on the writable clone.
[Pure]
public static Encoding GetEncoding(int codepage,
    EncoderFallback encoderFallback, DecoderFallback decoderFallback)
{
    Encoding baseEncoding = EncodingProvider.GetEncodingFromProvider(codepage, encoderFallback, decoderFallback);

    if (baseEncoding != null)
        return baseEncoding;

    // Get the default encoding (which is cached and read only)
    baseEncoding = GetEncoding(codepage);

    // Clone it and set the fallback
    Encoding fallbackEncoding = (Encoding)baseEncoding.Clone();
    fallbackEncoding.EncoderFallback = encoderFallback;
    fallbackEncoding.DecoderFallback = decoderFallback;

    return fallbackEncoding;
}

// Returns an Encoding object for a given name or a given code page value.
//
// Registered providers are asked first; otherwise the name is mapped to a code
// page through EncodingTable and resolved by GetEncoding(int).
//
[Pure]
public static Encoding GetEncoding(String name)
{
    Encoding baseEncoding = EncodingProvider.GetEncodingFromProvider(name);
    if (baseEncoding != null)
        return baseEncoding;

    //
    // NOTE: If you add a new encoding that can be requested by name, be sure to
    // add the corresponding item in EncodingTable.
    // Otherwise, the code below will throw exception when trying to call
    // EncodingTable.GetCodePageFromName().
    //
    return GetEncoding(EncodingTable.GetCodePageFromName(name));
}

// Returns an Encoding object for a given name or a given code page value,
// using the supplied encoder and decoder fallbacks.
//
[Pure]
public static Encoding GetEncoding(String name,
    EncoderFallback encoderFallback, DecoderFallback decoderFallback)
{
    Encoding baseEncoding = EncodingProvider.GetEncodingFromProvider(name, encoderFallback, decoderFallback);
    if (baseEncoding != null)
        return baseEncoding;

    //
    // NOTE: If you add a new encoding that can be requested by name, be sure to
    // add the corresponding item in EncodingTable.
    // Otherwise, the code below will throw exception when trying to call
    // EncodingTable.GetCodePageFromName().
    //
    return GetEncoding(EncodingTable.GetCodePageFromName(name), encoderFallback, decoderFallback);
}

// Return a list of all EncodingInfo objects describing all of our encodings
[Pure]
public static EncodingInfo[] GetEncodings()
{
    return EncodingTable.GetEncodings();
}

// Returns the preamble bytes for this encoding. The base implementation has
// none and returns an empty array.
[Pure]
public virtual byte[] GetPreamble()
{
    return Array.Empty<byte>();
}

// Loads dataItem for m_codePage from EncodingTable if it has not been loaded yet.
// Throws NotSupportedException when the table has no entry for the code page.
private void GetDataItem()
{
    if (dataItem == null)
    {
        dataItem = EncodingTable.GetCodePageDataItem(m_codePage);
        if (dataItem == null)
        {
            throw new NotSupportedException(
                SR.Format(SR.NotSupported_NoCodepageData, m_codePage));
        }
    }
}

// Returns the name for this encoding that can be used with mail agent body tags.
// If the encoding may not be used, the string is empty.

public virtual String BodyName
{
    get
    {
        // GetDataItem() does nothing when dataItem is already loaded.
        GetDataItem();
        return dataItem.BodyName;
    }
}

// Returns the human-readable description of the encoding ( e.g. Hebrew (DOS)).
#if PROJECTN
public virtual String EncodingName
{
    get
    {
        string localizedName = GetLocalizedEncodingNameResource(this.CodePage);
        if (localizedName == null)
        {
            throw new NotSupportedException(SR.Format(SR.MissingEncodingNameResource, this.CodePage));
        }

        if (!localizedName.StartsWith("Globalization_cp_", StringComparison.Ordinal))
        {
            return localizedName;
        }

        // On ProjectN, resource strings are stripped from retail builds and replaced by
        // their identifier names. Since this property is meant to be a localized string,
        // but we don't localize ProjectN, we specifically need to do something reasonable
        // in this case. This currently returns the English name of the encoding from a
        // static data table.
        string englishName = EncodingTable.GetCodePageDataItem(this.CodePage).EnglishName;
        if (englishName == null)
        {
            throw new NotSupportedException(SR.Format(SR.MissingEncodingNameResource, this.WebName, this.CodePage));
        }
        return englishName;
    }
}

// Maps a code page number to its Globalization_cp_* resource string;
// returns null for any code page not listed here.
private static string GetLocalizedEncodingNameResource(int codePage)
{
    switch (codePage)
    {
        case 1200:
            return SR.Globalization_cp_1200;
        case 1201:
            return SR.Globalization_cp_1201;
        case 12000:
            return SR.Globalization_cp_12000;
        case 12001:
            return SR.Globalization_cp_12001;
        case 20127:
            return SR.Globalization_cp_20127;
        case 28591:
            return SR.Globalization_cp_28591;
        case 65000:
            return SR.Globalization_cp_65000;
        case 65001:
            return SR.Globalization_cp_65001;
        default:
            return null;
    }
}
#else
public virtual String EncodingName => SR.GetResourceString("Globalization_cp_" + m_codePage.ToString());
#endif
// Returns the name for this encoding that can be used with mail agent header
// tags. If the encoding may not be used, the string is empty.

public virtual String HeaderName
{
    get
    {
        // GetDataItem() does nothing when dataItem is already loaded.
        GetDataItem();
        return dataItem.HeaderName;
    }
}

// Returns the IANA preferred name for this encoding.
public virtual String WebName
{
    get
    {
        GetDataItem();
        return dataItem.WebName;
    }
}

// Returns the windows code page that most closely corresponds to this encoding.

public virtual int WindowsCodePage
{
    get
    {
        GetDataItem();
        return dataItem.UIFamilyCodePage;
    }
}


// True if and only if the encoding is used for display by browsers clients.

public virtual bool IsBrowserDisplay
{
    get
    {
        GetDataItem();
        return (dataItem.Flags & MIMECONTF_BROWSER) != 0;
    }
}

// True if and only if the encoding is used for saving by browsers clients.

public virtual bool IsBrowserSave
{
    get
    {
        GetDataItem();
        return (dataItem.Flags & MIMECONTF_SAVABLE_BROWSER) != 0;
    }
}

// True if and only if the encoding is used for display by mail and news clients.

public virtual bool IsMailNewsDisplay
{
    get
    {
        GetDataItem();
        return (dataItem.Flags & MIMECONTF_MAILNEWS) != 0;
    }
}


// True if and only if the encoding is used for saving documents by mail and
// news clients

public virtual bool IsMailNewsSave
{
    get
    {
        GetDataItem();
        return (dataItem.Flags & MIMECONTF_SAVABLE_MAILNEWS) != 0;
    }
}

// True if and only if the encoding only uses single byte code points.
// (Ie, ASCII, 1252, etc)

public virtual bool IsSingleByte => false;


// Fallback used for characters that cannot be encoded.
// Setting it throws InvalidOperationException on a read-only encoding and
// ArgumentNullException for a null value, checked in that order.
public EncoderFallback EncoderFallback
{
    get { return encoderFallback; }

    set
    {
        if (this.IsReadOnly)
        {
            throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
        }

        if (value == null)
        {
            throw new ArgumentNullException(nameof(value));
        }
        Contract.EndContractBlock();

        encoderFallback = value;
    }
}


// Fallback used for bytes that cannot be decoded.
// Setting it throws InvalidOperationException on a read-only encoding and
// ArgumentNullException for a null value, checked in that order.
public DecoderFallback DecoderFallback
{
    get { return decoderFallback; }

    set
    {
        if (this.IsReadOnly)
        {
            throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
        }

        if (value == null)
        {
            throw new ArgumentNullException(nameof(value));
        }
        Contract.EndContractBlock();

        decoderFallback = value;
    }
}


// Returns a member-wise copy of this encoding with the read-only flag cleared,
// so the copy's fallbacks can be changed.
public virtual Object Clone()
{
    Encoding copy = (Encoding)MemberwiseClone();
    copy.m_isReadOnly = false;
    return copy;
}


// True while the fallbacks of this instance may not be changed.
public bool IsReadOnly => m_isReadOnly;

// Returns an encoding for the ASCII character set. The returned encoding
// will be an instance of the ASCIIEncoding class.

public static Encoding ASCII => ASCIIEncoding.s_default;

// Returns an encoding for the Latin1 character set. The returned encoding
// will be an instance of the Latin1Encoding class.
//
// This is for our optimizations
private static Encoding Latin1 => Latin1Encoding.s_default;

// Returns the number of bytes required to encode the given character
// array.
//
[Pure]
public virtual int GetByteCount(char[] chars)
{
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    Contract.EndContractBlock();

    return GetByteCount(chars, 0, chars.Length);
}

// Returns the number of bytes required to encode the given string.
// The string is copied into a char array and counted through the array overload.
[Pure]
public virtual int GetByteCount(String s)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }
    Contract.EndContractBlock();

    char[] buffer = s.ToCharArray();
    return GetByteCount(buffer, 0, buffer.Length);
}

// Returns the number of bytes required to encode a range of characters in
// a character array.
//
[Pure]
public abstract int GetByteCount(char[] chars, int index, int count);

// Returns the number of bytes required to encode a string range.
// Throws ArgumentNullException for a null string and ArgumentOutOfRangeException
// when index or count is negative or the range runs past the end of the string.
//
[Pure]
public int GetByteCount(string s, int index, int count)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s), SR.ArgumentNull_String);
    }
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (index > s.Length - count)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_IndexCount);
    }
    Contract.EndContractBlock();

    unsafe
    {
        // Pin the string and count directly from the first character of the range.
        fixed (char* pString = s)
        {
            char* pRange = pString + index;
            return GetByteCount(pRange, count);
        }
    }
}

// We expect this to be the workhorse for NLS encodings
// unfortunately for existing overrides, it has to call the [] version,
// which is really slow, so this method should be avoided if you're calling
// a 3rd party encoding.
[Pure]
[CLSCompliant(false)]
public virtual unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.EndContractBlock();

    // Copy the characters into a managed array so the array overload can count them.
    char[] copy = new char[count];
    for (int i = 0; i < count; i++)
    {
        copy[i] = chars[i];
    }

    return GetByteCount(copy, 0, count);
}

// For NLS Encodings, workhorse takes an encoder (may be null)
// Always validate parameters before calling internal version, which will only assert.
// This base implementation ignores the encoder and forwards to the public pointer overload.
internal virtual unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
{
    Debug.Assert(chars != null);
    Debug.Assert(count >= 0);

    return GetByteCount(chars, count);
}

// Returns a byte array containing the encoded representation of the given
// character array.
//
[Pure]
public virtual byte[] GetBytes(char[] chars)
{
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    Contract.EndContractBlock();

    return GetBytes(chars, 0, chars.Length);
}

// Returns a byte array containing the encoded representation of a range
// of characters in a character array.
//
[Pure]
public virtual byte[] GetBytes(char[] chars, int index, int count)
{
    // Size the output exactly, then encode into it.
    int needed = GetByteCount(chars, index, count);
    byte[] encoded = new byte[needed];
    GetBytes(chars, index, count, encoded, 0);
    return encoded;
}

// Encodes a range of characters in a character array into a range of bytes
// in a byte array. An exception occurs if the byte array is not large
// enough to hold the complete encoding of the characters. The
// GetByteCount method can be used to determine the exact number of
// bytes that will be produced for a given range of characters.
// Alternatively, the GetMaxByteCount method can be used to
// determine the maximum number of bytes that will be produced for a given
// number of characters, regardless of the actual character values.
//
public abstract int GetBytes(char[] chars, int charIndex, int charCount,
    byte[] bytes, int byteIndex);

// Returns a byte array containing the encoded representation of the given
// string.
//
[Pure]
public virtual byte[] GetBytes(String s)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s),
            SR.ArgumentNull_String);
    Contract.EndContractBlock();

    int byteCount = GetByteCount(s);
    byte[] bytes = new byte[byteCount];
    int bytesReceived = GetBytes(s, 0, s.Length, bytes, 0);
    Debug.Assert(byteCount == bytesReceived);
    return bytes;
}

// Returns a byte array containing the encoded representation of the given
// string range.
// Throws ArgumentNullException for a null string and ArgumentOutOfRangeException
// when index or count is negative or the range runs past the end of the string.
//
[Pure]
public byte[] GetBytes(string s, int index, int count)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s),
            SR.ArgumentNull_String);
    if (index < 0)
        throw new ArgumentOutOfRangeException(nameof(index),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    if (count < 0)
        throw new ArgumentOutOfRangeException(nameof(count),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    if (index > s.Length - count)
        throw new ArgumentOutOfRangeException(nameof(index),
            SR.ArgumentOutOfRange_IndexCount);
    Contract.EndContractBlock();

    unsafe
    {
        fixed (char* pChar = s)
        {
            int byteCount = GetByteCount(pChar + index, count);

            // Return early for an empty result: &bytes[0] below needs at least one element.
            if (byteCount == 0)
                return Array.Empty<byte>();

            byte[] bytes = new byte[byteCount];
            fixed (byte* pBytes = &bytes[0])
            {
                int bytesReceived = GetBytes(pChar + index, count, pBytes, byteCount);
                Debug.Assert(byteCount == bytesReceived);
            }
            return bytes;
        }
    }
}

// Encodes a range of characters of a string into a byte array, starting at byteIndex.
// The string is copied into a char array and encoded through the array overload,
// which receives the remaining arguments unchanged.
public virtual int GetBytes(String s, int charIndex, int charCount,
    byte[] bytes, int byteIndex)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));
    Contract.EndContractBlock();
    return GetBytes(s.ToCharArray(), charIndex, charCount, bytes, byteIndex);
}

// This is our internal workhorse
// Always validate parameters before calling internal version, which will only assert.
// This base implementation ignores the encoder and forwards to the public pointer overload.
internal virtual unsafe int GetBytes(char* chars, int charCount,
    byte* bytes, int byteCount, EncoderNLS encoder)
{
    return GetBytes(chars, charCount, bytes, byteCount);
}

// We expect this to be the workhorse for NLS Encodings, but for existing
// ones we need a working (if slow) default implementation)
//
// WARNING WARNING WARNING
//
// WARNING: If this breaks it could be a security threat. Obviously we
// call this internally, so you need to make sure that your pointers, counts
// and indexes are correct when you call this method.
//
// In addition, we have internal code, which will be marked as "safe" calling
// this code. However this code is dependent upon the implementation of an
// external GetBytes() method, which could be overridden by a third party and
// the results of which cannot be guaranteed. We use that result to copy
// the byte[] to our byte* output buffer. If the result count was wrong, we
// could easily overflow our output buffer. Therefore we do an extra test
// when we copy the buffer so that we don't overflow byteCount either.

[CLSCompliant(false)]
public virtual unsafe int GetBytes(char* chars, int charCount,
    byte* bytes, int byteCount)
{
    // Validate input parameters
    if (bytes == null || chars == null)
        throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars),
            SR.ArgumentNull_Array);

    if (charCount < 0 || byteCount < 0)
        throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Get the char array to convert
    char[] arrChar = new char[charCount];

    int index;
    for (index = 0; index < charCount; index++)
        arrChar[index] = chars[index];

    // Get the byte array to fill
    byte[] arrByte = new byte[byteCount];

    // Do the work
    int result = GetBytes(arrChar, 0, charCount, arrByte, 0);

    Debug.Assert(result <= byteCount, "[Encoding.GetBytes]Returned more bytes than we have space for");

    // Copy the byte array
    // WARNING: We MUST make sure that we don't copy too many bytes. We can't
    // rely on result because it could be a 3rd party implementation. We need
    // to make sure we never copy more than byteCount bytes no matter the value
    // of result
    if (result < byteCount)
        byteCount = result;

    // Copy the data, don't overrun our array!
    for (index = 0; index < byteCount; index++)
        bytes[index] = arrByte[index];

    // byteCount now holds the number of bytes actually copied to the caller's buffer.
    return byteCount;
}

// Returns the number of characters produced by decoding the given byte
// array.
//
[Pure]
public virtual int GetCharCount(byte[] bytes)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes),
            SR.ArgumentNull_Array);
    }
    Contract.EndContractBlock();
    return GetCharCount(bytes, 0, bytes.Length);
}

// Returns the number of characters produced by decoding a range of bytes
// in a byte array.
//
        [Pure]
        public abstract int GetCharCount(byte[] bytes, int index, int count);

        // Default (slow) pointer-based GetCharCount.  NLS encodings are expected to
        // provide their own implementation; this one copies the input into a managed
        // array and defers to the array overload.
        [Pure]
        [CLSCompliant(false)]
        public virtual unsafe int GetCharCount(byte* bytes, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes),
                    SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // Snapshot the input bytes into a managed array.
            byte[] managedBytes = new byte[count];
            for (int i = 0; i < count; i++)
            {
                managedBytes[i] = bytes[i];
            }

            return GetCharCount(managedBytes, 0, count);
        }

        // Internal entry point that additionally receives the DecoderNLS in use.
        // Callers are expected to have validated their arguments already; internal
        // versions only assert.  This base implementation ignores the decoder.
        internal virtual unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder)
        {
            int charCount = GetCharCount(bytes, count);
            return charCount;
        }

        // Returns a character array containing the decoded representation of the
        // whole of the given byte array.  Throws ArgumentNullException when bytes is null.
        //
        [Pure]
        public virtual char[] GetChars(byte[] bytes)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes),
                    SR.ArgumentNull_Array);
            }
            Contract.EndContractBlock();

            int wholeLength = bytes.Length;
            return GetChars(bytes, 0, wholeLength);
        }

        // Returns a character array containing the decoded representation of a
        // range of bytes in a byte array.  The result array is sized with
        // GetCharCount and then filled with GetChars.
        //
        [Pure]
        public virtual char[] GetChars(byte[] bytes, int index, int count)
        {
            int decodedLength = GetCharCount(bytes, index, count);
            char[] decoded = new char[decodedLength];
            GetChars(bytes, index, count, decoded, 0);
            return decoded;
        }

        // Decodes a range of bytes in a byte array into a range of characters in a
        // character array. An exception occurs if the character array is not large
        // enough to hold the complete decoding of the bytes. The
        // GetCharCount method can be used to determine the exact number of
        // characters that will be produced for a given range of bytes.
        // Alternatively, the GetMaxCharCount method can be used to
        // determine the maximum number of characters that will be produced for a
        // given number of bytes, regardless of the actual byte values.
        //

        public abstract int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                     char[] chars, int charIndex);


        // Default (slow) pointer-based GetChars.  NLS encodings are expected to
        // provide their own implementation; this one exists so that encodings which
        // only implement the array overloads keep working.
        //
        // SECURITY WARNING: this method writes through a raw pointer.  It is called
        // internally, so pointers, counts and indexes must be correct on entry.
        //
        // The real work is delegated to the array-based GetChars(), which may be
        // overridden by third-party code whose return value cannot be trusted.  That
        // return value is used to copy from the temporary char[] into the caller's
        // char* buffer, so the copy is additionally capped at charCount: a bogus
        // result can never make us write past the end of the output buffer.

        [CLSCompliant(false)]
        public virtual unsafe int GetChars(byte* bytes, int byteCount,
                                           char* chars, int charCount)
        {
            // Null pointers: 'chars' is the one reported when both are null.
            if (chars == null || bytes == null)
            {
                throw new ArgumentNullException(chars == null ? nameof(chars) : nameof(bytes),
                    SR.ArgumentNull_Array);
            }

            // Negative counts: 'byteCount' is the one reported when both are negative.
            if (byteCount < 0 || charCount < 0)
            {
                throw new ArgumentOutOfRangeException((byteCount < 0 ? nameof(byteCount) : nameof(charCount)),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // Snapshot the input bytes into a managed array.
            byte[] managedBytes = new byte[byteCount];
            for (int i = 0; i < byteCount; i++)
            {
                managedBytes[i] = bytes[i];
            }

            // Scratch output with exactly the capacity the caller offered.
            char[] managedChars = new char[charCount];

            // Let the array overload do the actual decoding.
            int produced = GetChars(managedBytes, 0, byteCount, managedChars, 0);

            Debug.Assert(produced <= charCount, "[Encoding.GetChars]Returned more chars than we have space for");

            // Never copy more than charCount chars, whatever 'produced' claims.
            int charsToCopy = (produced < charCount) ? produced : charCount;
            for (int i = 0; i < charsToCopy; i++)
            {
                chars[i] = managedChars[i];
            }

            return charsToCopy;
        }


        // Internal entry point that additionally receives the DecoderNLS in use.
        // Callers are expected to have validated their arguments already; internal
        // versions only assert.  This base implementation ignores the decoder.
        internal virtual unsafe int GetChars(byte* bytes, int byteCount,
                                             char* chars, int charCount, DecoderNLS decoder)
        {
            int written = GetChars(bytes, byteCount, chars, charCount);
            return written;
        }


        // Decodes byteCount bytes starting at the given pointer into a new string.
        // Throws for a null pointer or a negative count.
        [CLSCompliant(false)]
        public unsafe string GetString(byte* bytes, int byteCount)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            return String.CreateStringFromEncoding(bytes, byteCount, this);
        }

        // Returns the code page identifier of this encoding. The returned value is
        // an integer between 0 and 65535 if the encoding has a code page
        // identifier, or -1 if the encoding does not represent a code page.
//

        public virtual int CodePage => m_codePage;

        // IsAlwaysNormalized
        // Returns true if the encoding is always normalized for the specified encoding form.
        // The parameterless overload asks about NormalizationForm.FormC.
        [Pure]
        public bool IsAlwaysNormalized() => IsAlwaysNormalized(NormalizationForm.FormC);

        // Base answer is false; an encoding that knows better overrides this.
        [Pure]
        public virtual bool IsAlwaysNormalized(NormalizationForm form) => false;

        // Returns a Decoder object for this encoding. The returned object
        // can be used to decode a sequence of bytes into a sequence of characters.
        // Contrary to the GetChars family of methods, a Decoder can
        // convert partial sequences of bytes into partial sequences of characters
        // by maintaining the appropriate state between the conversions.
        //
        // This default implementation returns a DefaultDecoder, which simply
        // forwards calls to the GetCharCount and GetChars methods to
        // the corresponding methods of this encoding. Encodings that require state
        // to be maintained between successive conversions should override this
        // method and return an instance of an appropriate Decoder
        // implementation.
        //

        public virtual Decoder GetDecoder() => new DefaultDecoder(this);

        // Returns an Encoder object for this encoding. The returned object
        // can be used to encode a sequence of characters into a sequence of bytes.
        // Contrary to the GetBytes family of methods, an Encoder can
        // convert partial sequences of characters into partial sequences of bytes
        // by maintaining the appropriate state between the conversions.
        //
        // This default implementation returns an Encoder that simply
        // forwards calls to the GetByteCount and GetBytes methods to
        // the corresponding methods of this encoding. Encodings that require state
        // to be maintained between successive conversions should override this
        // method and return an instance of an appropriate Encoder
        // implementation.
//

        public virtual Encoder GetEncoder() => new DefaultEncoder(this);

        // Returns the maximum number of bytes required to encode a given number of
        // characters. This method can be used to determine an appropriate buffer
        // size for byte arrays passed to the GetBytes method of this
        // encoding or the GetBytes method of an Encoder for this
        // encoding. All encodings must guarantee that no buffer overflow
        // exceptions will occur if buffers are sized according to the results of
        // this method.
        //
        // WARNING: If you're using something besides the default replacement encoder fallback,
        // then you could have more bytes than this returned from an actual call to GetBytes().
        //
        [Pure]
        public abstract int GetMaxByteCount(int charCount);

        // Returns the maximum number of characters produced by decoding a given
        // number of bytes. This method can be used to determine an appropriate
        // buffer size for character arrays passed to the GetChars method of
        // this encoding or the GetChars method of a Decoder for this
        // encoding. All encodings must guarantee that no buffer overflow
        // exceptions will occur if buffers are sized according to the results of
        // this method.
        //
        [Pure]
        public abstract int GetMaxCharCount(int byteCount);

        // Returns a string containing the decoded representation of the whole of
        // the given byte array.  Throws ArgumentNullException when bytes is null.
        //
        [Pure]
        public virtual String GetString(byte[] bytes)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes),
                    SR.ArgumentNull_Array);
            }
            Contract.EndContractBlock();

            int wholeLength = bytes.Length;
            return GetString(bytes, 0, wholeLength);
        }

        // Returns a string containing the decoded representation of a range of
        // bytes in a byte array.  This base version decodes to a char[] first and
        // then builds the string from it.
        //
        // Internally we override this for performance
        //
        [Pure]
        public virtual String GetString(byte[] bytes, int index, int count)
        {
            char[] decoded = GetChars(bytes, index, count);
            return new String(decoded);
        }

        // Returns an encoding for Unicode format.
// The returned encoding will be
        // an instance of the UnicodeEncoding class.
        //
        // It will use little endian byte order, but will detect
        // input in big endian if it finds a byte order mark per Unicode 2.0.

        public static Encoding Unicode => UnicodeEncoding.s_littleEndianDefault;

        // Returns an encoding for Unicode format. The returned encoding will be
        // an instance of the UnicodeEncoding class.
        //
        // It will use big endian byte order, but will detect
        // input in little endian if it finds a byte order mark per Unicode 2.0.

        public static Encoding BigEndianUnicode => UnicodeEncoding.s_bigEndianDefault;

        // Returns an encoding for the UTF-7 format. The returned encoding will be
        // an instance of the UTF7Encoding class.

        public static Encoding UTF7 => UTF7Encoding.s_default;

        // Returns an encoding for the UTF-8 format. The returned encoding will be
        // an instance of the UTF8Encoding class.

        public static Encoding UTF8 => UTF8Encoding.s_default;

        // Returns an encoding for the UTF-32 format. The returned encoding will be
        // an instance of the UTF32Encoding class.

        public static Encoding UTF32 => UTF32Encoding.s_default;

        // Returns an encoding for the UTF-32 format. The returned encoding will be
        // an instance of the UTF32Encoding class.
        //
        // It will use big endian byte order.

        private static Encoding BigEndianUTF32 => UTF32Encoding.s_bigEndianDefault;

        // Two encodings are equal when they have the same code page and equal
        // encoder and decoder fallbacks.  Anything that is not an Encoding
        // (including null) is never equal.
        public override bool Equals(Object value)
        {
            Encoding that = value as Encoding;
            if (that == null)
            {
                return false;
            }

            return (m_codePage == that.m_codePage) &&
                   (EncoderFallback.Equals(that.EncoderFallback)) &&
                   (DecoderFallback.Equals(that.DecoderFallback));
        }


        // Combines exactly the members that Equals compares.
        public override int GetHashCode()
        {
            return m_codePage + this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode();
        }

        // Best-fit data for the characters-to-bytes direction.
        // Normally we don't have any best fit data, so the base class returns an
        // empty char array.
        internal virtual char[] GetBestFitUnicodeToBytesData()
        {
            // The element type must be spelled out: Array.Empty is generic and the
            // type argument cannot be inferred from a parameterless call.
            return Array.Empty<char>();
        }

        // Best-fit data for the bytes-to-characters direction.
        // Normally we don't have any best fit data, so the base class returns an
        // empty char array.
        internal virtual char[] GetBestFitBytesToUnicodeData()
        {
            return Array.Empty<char>();
        }

        // Always throws: the output byte buffer was too small.
        internal void ThrowBytesOverflow()
        {
            // Special message to include fallback type in case fallback's GetMaxCharCount is broken
            // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount
            throw new ArgumentException(
                SR.Format(SR.Argument_EncodingConversionOverflowBytes, EncodingName, EncoderFallback.GetType()), "bytes");
        }

        // Throws when there is no encoder, when the encoder asks for throw-on-overflow,
        // or when nothing at all was encoded.  Otherwise (the Convert case) it does not
        // throw and only clears the encoder's must-flush flag.
        internal void ThrowBytesOverflow(EncoderNLS encoder, bool nothingEncoded)
        {
            if (encoder == null || encoder.m_throwOnOverflow || nothingEncoded)
            {
                // Reset an existing fallback buffer before throwing.
                if (encoder != null && encoder.InternalHasFallbackBuffer)
                    encoder.FallbackBuffer.InternalReset();

                // Special message to include fallback type in case fallback's GetMaxCharCount is broken
                // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount
                ThrowBytesOverflow();
            }

            // If we didn't throw, we are in convert and have to remember our flushing.
            // (encoder cannot be null here: the null case always throws above.)
            encoder.ClearMustFlush();
        }

        // Always throws: the output char buffer was too small.
        internal void ThrowCharsOverflow()
        {
            // Special message to include fallback type in case fallback's GetMaxCharCount is broken
            // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount
            throw new ArgumentException(
                SR.Format(SR.Argument_EncodingConversionOverflowChars, EncodingName, DecoderFallback.GetType()), "chars");
        }

        // Throws when there is no decoder, when the decoder asks for throw-on-overflow,
        // or when nothing at all was decoded.  Otherwise (the Convert case) it does not
        // throw and only clears the decoder's must-flush flag.
        internal void ThrowCharsOverflow(DecoderNLS decoder, bool nothingDecoded)
        {
            if (decoder == null || decoder.m_throwOnOverflow || nothingDecoded)
            {
                // Reset an existing fallback buffer before throwing.
                if (decoder != null && decoder.InternalHasFallbackBuffer)
                    decoder.FallbackBuffer.InternalReset();

                // Special message to include fallback type in case fallback's GetMaxCharCount is broken
                // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount
                ThrowCharsOverflow();
            }

            // If we didn't throw, we are in convert and have to remember our flushing.
            // (decoder cannot be null here: the null case always throws above.)
            decoder.ClearMustFlush();
        }

        // Encoder returned by the default GetEncoder().  It keeps no conversion
        // state: every call is forwarded to the owning encoding and the flush
        // argument is not used.
        internal sealed class DefaultEncoder : Encoder, IObjectReference, ISerializable
        {
            private Encoding m_encoding;

            public DefaultEncoder(Encoding encoding)
            {
                m_encoding = encoding;
            }

            // IObjectReference implementation; not supported.
            public Object GetRealObject(StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // ISerializable implementation, get data for this object; not supported.
            void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // Returns the number of bytes the next call to GetBytes will
            // produce if presented with the given range of characters and the given
            // value of the flush parameter. The returned value takes into
            // account the state in which the encoder was left following the last call
            // to GetBytes. The state of the encoder is not affected by a call
            // to this method.
            //

            public override int GetByteCount(char[] chars, int index, int count, bool flush)
            {
                return m_encoding.GetByteCount(chars, index, count);
            }

            [SuppressMessage("Microsoft.Contracts", "CC1055")]  // Skip extra error checking to avoid *potential* AppCompat problems.
            public unsafe override int GetByteCount(char* chars, int count, bool flush)
            {
                return m_encoding.GetByteCount(chars, count);
            }

            // Encodes a range of characters in a character array into a range of bytes
            // in a byte array. The method encodes charCount characters from
            // chars starting at index charIndex, storing the resulting
            // bytes in bytes starting at index byteIndex. The encoding
            // takes into account the state in which the encoder was left following the
            // last call to this method. The flush parameter indicates whether
            // the encoder should flush any shift-states and partial characters at the
            // end of the conversion.
// To ensure correct termination of a sequence of
            // blocks of encoded bytes, the last call to GetBytes should specify
            // a value of true for the flush parameter.
            //
            // An exception occurs if the byte array is not large enough to hold the
            // complete encoding of the characters. The GetByteCount method can
            // be used to determine the exact number of bytes that will be produced for
            // a given range of characters. Alternatively, the GetMaxByteCount
            // method of the Encoding that produced this encoder can be used to
            // determine the maximum number of bytes that will be produced for a given
            // number of characters, regardless of the actual character values.
            //
            // This implementation forwards to the encoding and does not use 'flush'.

            public override int GetBytes(char[] chars, int charIndex, int charCount,
                                         byte[] bytes, int byteIndex, bool flush)
                => m_encoding.GetBytes(chars, charIndex, charCount, bytes, byteIndex);

            // Pointer flavour of the above; also forwards without using 'flush'.
            [SuppressMessage("Microsoft.Contracts", "CC1055")]  // Skip extra error checking to avoid *potential* AppCompat problems.
            public unsafe override int GetBytes(char* chars, int charCount,
                                                byte* bytes, int byteCount, bool flush)
                => m_encoding.GetBytes(chars, charCount, bytes, byteCount);
        }

        // Decoder returned by the default GetDecoder().  It holds only a reference
        // to the owning encoding.
        internal sealed class DefaultDecoder : Decoder, IObjectReference, ISerializable
        {
            private Encoding m_encoding;

            public DefaultDecoder(Encoding encoding)
            {
                m_encoding = encoding;
            }

            // IObjectReference implementation; not supported.
            public Object GetRealObject(StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // ISerializable implementation; not supported.
            void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // Returns the number of characters the next call to GetChars will
            // produce if presented with the given range of bytes. The returned value
            // takes into account the state in which the decoder was left following the
            // last call to GetChars.
// The state of the decoder is not affected
            // by a call to this method.
            //

            public override int GetCharCount(byte[] bytes, int index, int count)
                => GetCharCount(bytes, index, count, false);

            // 'flush' is not used: the count comes straight from the encoding.
            public override int GetCharCount(byte[] bytes, int index, int count, bool flush)
                => m_encoding.GetCharCount(bytes, index, count);

            // By default just call the encoding version, no flush by default
            [SuppressMessage("Microsoft.Contracts", "CC1055")]  // Skip extra error checking to avoid *potential* AppCompat problems.
            public unsafe override int GetCharCount(byte* bytes, int count, bool flush)
                => m_encoding.GetCharCount(bytes, count);

            // Decodes a range of bytes in a byte array into a range of characters
            // in a character array. The method decodes byteCount bytes from
            // bytes starting at index byteIndex, storing the resulting
            // characters in chars starting at index charIndex. The
            // decoding takes into account the state in which the decoder was left
            // following the last call to this method.
            //
            // An exception occurs if the character array is not large enough to
            // hold the complete decoding of the bytes. The GetCharCount method
            // can be used to determine the exact number of characters that will be
            // produced for a given range of bytes. Alternatively, the
            // GetMaxCharCount method of the Encoding that produced this
            // decoder can be used to determine the maximum number of characters that
            // will be produced for a given number of bytes, regardless of the actual
            // byte values.
//

            public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                         char[] chars, int charIndex)
                => GetChars(bytes, byteIndex, byteCount, chars, charIndex, false);

            // 'flush' is not used: decoding is forwarded straight to the encoding.
            public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                         char[] chars, int charIndex, bool flush)
                => m_encoding.GetChars(bytes, byteIndex, byteCount, chars, charIndex);

            // By default just call the encoding's version
            [SuppressMessage("Microsoft.Contracts", "CC1055")]  // Skip extra error checking to avoid *potential* AppCompat problems.
            public unsafe override int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, bool flush)
                => m_encoding.GetChars(bytes, byteCount, chars, charCount);
        }

        // Cursor over an input byte range and an output char range, plus the
        // decoder fallback buffer to use and a running count of chars produced.
        internal class EncodingCharBuffer
        {
            private unsafe char* _chars;        // current output position
            private unsafe char* _charStart;    // start of the output range
            private unsafe char* _charEnd;      // one past the end of the output range
            private int _charCountResult = 0;   // chars produced so far
            private Encoding _enc;
            private DecoderNLS _decoder;
            private unsafe byte* _byteStart;    // start of the input range
            private unsafe byte* _byteEnd;      // one past the end of the input range
            private unsafe byte* _bytes;        // current input position
            private DecoderFallbackBuffer _fallbackBuffer;

            internal unsafe EncodingCharBuffer(Encoding enc, DecoderNLS decoder, char* charStart, int charCount,
                                               byte* byteStart, int byteCount)
            {
                _enc = enc;
                _decoder = decoder;

                // Output range; the cursor starts at its beginning.
                _charStart = charStart;
                _chars = charStart;
                _charEnd = charStart + charCount;

                // Input range; the cursor starts at its beginning.
                _byteStart = byteStart;
                _bytes = byteStart;
                _byteEnd = byteStart + byteCount;

                // Without a decoder, create a fresh fallback buffer from the
                // encoding's fallback; otherwise use the decoder's buffer.
                _fallbackBuffer = (_decoder == null)
                    ? enc.DecoderFallback.CreateFallbackBuffer()
                    : _decoder.FallbackBuffer;

                // If we're getting chars or getting char count we don't expect to have
                // to remember fallbacks between calls (so it should be empty)
                Debug.Assert(_fallbackBuffer.Remaining == 0,
                    "[Encoding.EncodingCharBuffer.EncodingCharBuffer]Expected empty fallback buffer for getchars/charcount");
                _fallbackBuffer.InternalInitialize(_bytes, _charEnd);
            }

// Records one output char that was decoded from numBytes input bytes.
            // With a null output pointer only the count is advanced.  When the
            // output is full, the input cursor is backed up by numBytes and
            // ThrowCharsOverflow decides whether to throw; if it does not, false is
            // returned and nothing is stored.
            internal unsafe bool AddChar(char ch, int numBytes)
            {
                bool storing = _chars != null;

                if (storing && _chars >= _charEnd)
                {
                    // Throw maybe
                    _bytes -= numBytes;                                        // Didn't encode these bytes
                    _enc.ThrowCharsOverflow(_decoder, _bytes <= _byteStart);  // Throw?
                    return false;                                              // No throw, but no store either
                }

                if (storing)
                {
                    *(_chars++) = ch;
                }

                _charCountResult++;
                return true;
            }

            // Single-byte convenience overload.
            internal unsafe bool AddChar(char ch)
            {
                return AddChar(ch, 1);
            }


            // Records two output chars that came from numBytes input bytes.
            internal unsafe bool AddChar(char ch1, char ch2, int numBytes)
            {
                // Need room for 2 chars
                if (_chars >= _charEnd - 1)
                {
                    // Throw maybe
                    _bytes -= numBytes;                                        // Didn't encode these bytes
                    _enc.ThrowCharsOverflow(_decoder, _bytes <= _byteStart);  // Throw?
                    return false;                                              // No throw, but no store either
                }

                return AddChar(ch1, numBytes) && AddChar(ch2, numBytes);
            }

            // Moves the input cursor by count bytes.
            internal unsafe void AdjustBytes(int count)
            {
                _bytes += count;
            }

            // True while the input cursor is before the end of the input.
            internal unsafe bool MoreData => _bytes < _byteEnd;

            // Do we have count more bytes?
            internal unsafe bool EvenMoreData(int count)
            {
                return (_bytes <= _byteEnd - count);
            }

            // GetNextByte shouldn't be called unless the caller's already checked more data or even more data,
            // but we'll double check just to make sure.  Returns 0 when the input is exhausted.
            internal unsafe byte GetNextByte()
            {
                Debug.Assert(_bytes < _byteEnd, "[EncodingCharBuffer.GetNextByte]Expected more date");

                if (_bytes < _byteEnd)
                {
                    return *(_bytes++);
                }

                return 0;
            }

            // Distance of the input cursor from the start of the input.
            internal unsafe int BytesUsed => (int)(_bytes - _byteStart);

            // Falls back a single byte.
            internal unsafe bool Fallback(byte fallbackByte)
            {
                return Fallback(new byte[] { fallbackByte });
            }

            // Falls back a two-byte sequence.
            internal unsafe bool Fallback(byte byte1, byte byte2)
            {
                return Fallback(new byte[] { byte1, byte2 });
            }

            // Falls back a four-byte sequence.
            internal unsafe bool Fallback(byte byte1, byte byte2, byte byte3, byte byte4)
            {
                return Fallback(new byte[] { byte1, byte2, byte3, byte4 });
            }

            // Runs the decoder fallback for byteBuffer and accounts for the chars it
            // yields.  With a null output pointer only the count is updated.  When
            // the fallback reports failure, the input cursor is backed up, the
            // fallback is reset and ThrowCharsOverflow decides whether to throw; if
            // it does not, false is returned.
            internal unsafe bool Fallback(byte[] byteBuffer)
            {
                if (_chars == null)
                {
                    // Counting only.
                    _charCountResult += _fallbackBuffer.InternalFallback(byteBuffer, _bytes);
                    return true;
                }

                char* before = _chars;
                if (!_fallbackBuffer.InternalFallback(byteBuffer, _bytes, ref _chars))
                {
                    // Throw maybe
                    _bytes -= byteBuffer.Length;                             // Didn't use how many ever bytes we're falling back
                    _fallbackBuffer.InternalReset();                         // We didn't use this fallback.
                    _enc.ThrowCharsOverflow(_decoder, _chars == _charStart); // Throw?
                    return false;                                            // No throw, but no store either
                }

                _charCountResult += unchecked((int)(_chars - before));
                return true;
            }

            // Number of chars produced (or counted) so far.
            internal unsafe int Count => _charCountResult;
        }

        // Cursor over an input char range and an output byte range, plus the
        // encoder fallback buffer to use and a running count of bytes produced.
        internal class EncodingByteBuffer
        {
            private unsafe byte* _bytes;        // current output position
            private unsafe byte* _byteStart;    // start of the output range
            private unsafe byte* _byteEnd;      // one past the end of the output range
            private unsafe char* _chars;        // current input position
            private unsafe char* _charStart;    // start of the input range
            private unsafe char* _charEnd;      // one past the end of the input range
            private int _byteCountResult = 0;   // bytes produced so far
            private Encoding _enc;
            private EncoderNLS _encoder;
            internal EncoderFallbackBuffer fallbackBuffer;

            internal unsafe EncodingByteBuffer(Encoding inEncoding, EncoderNLS inEncoder,
                        byte* inByteStart, int inByteCount, char* inCharStart, int inCharCount)
            {
                _enc = inEncoding;
                _encoder = inEncoder;

                // Input range; the cursor starts at its beginning.
                _charStart = inCharStart;
                _chars = inCharStart;
                _charEnd = inCharStart + inCharCount;

                // Output range; the cursor starts at its beginning.
                _byteStart = inByteStart;
                _bytes = inByteStart;
                _byteEnd = inByteStart + inByteCount;

                if (_encoder == null)
                {
                    this.fallbackBuffer = _enc.EncoderFallback.CreateFallbackBuffer();
                }
                else
                {
                    this.fallbackBuffer = _encoder.FallbackBuffer;

                    // If we're not converting we must not have data in our fallback buffer
                    if (_encoder.m_throwOnOverflow && _encoder.InternalHasFallbackBuffer &&
                        this.fallbackBuffer.Remaining > 0)
                    {
                        throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty,
                            _encoder.Encoding.EncodingName, _encoder.Fallback.GetType()));
                    }
                }

                fallbackBuffer.InternalInitialize(_chars, _charEnd, _encoder, _bytes != null);
            }

            // Records one output byte, requiring room for it plus moreBytesExpected
            // further bytes.  With a null output pointer only the count is advanced.
            // When there is not enough room, MovePrevious(true) backs up and decides
            // whether to throw; if it does not, false is returned.
            internal unsafe bool AddByte(byte b, int moreBytesExpected)
            {
                Debug.Assert(moreBytesExpected >= 0, "[EncodingByteBuffer.AddByte]expected non-negative moreBytesExpected");

                bool storing = _bytes != null;

                if (storing && _bytes >= _byteEnd - moreBytesExpected)
                {
                    // Throw maybe.  Check which buffer to back up (only matters if Converting)
                    this.MovePrevious(true);            // Throw if necessary
                    return false;                       // No throw, but no store either
                }

                if (storing)
                {
                    *(_bytes++) = b;
                }

                _byteCountResult++;
                return true;
            }

            internal unsafe bool AddByte(byte b1)
            {
                return (AddByte(b1, 0));
            }

            internal unsafe bool AddByte(byte b1, byte b2)
            {
                return (AddByte(b1, b2, 0));
            }

            // Each byte is added with the number of bytes still to follow it, so
            // the first AddByte fails when the whole group does not fit.
            internal unsafe bool AddByte(byte b1, byte b2, int moreBytesExpected)
            {
                return (AddByte(b1, 1 + moreBytesExpected) && AddByte(b2, moreBytesExpected));
            }

            internal unsafe bool AddByte(byte b1, byte b2, byte b3)
            {
                return AddByte(b1, b2, b3, (int)0);
            }

            internal unsafe bool AddByte(byte b1, byte b2, byte b3, int moreBytesExpected)
            {
                return (AddByte(b1, 2 + moreBytesExpected) &&
                        AddByte(b2, 1 + moreBytesExpected) &&
                        AddByte(b3, moreBytesExpected));
            }

            internal unsafe bool AddByte(byte b1, byte b2, byte b3, byte b4)
            {
                return (AddByte(b1, 3) &&
                        AddByte(b2, 2) &&
                        AddByte(b3, 1) &&
                        AddByte(b4, 0));
            }

            // Un-consumes the last input: the last fallback char while falling back,
            // otherwise the last input char (when there is one).  With bThrow set,
            // ThrowBytesOverflow then decides whether to throw.
            internal unsafe void MovePrevious(bool bThrow)
            {
                if (fallbackBuffer.bFallingBack)
                {
                    fallbackBuffer.MovePrevious();                      // don't use last fallback
                }
                else
                {
                    Debug.Assert(_chars > _charStart ||
                        ((bThrow == true) && (_bytes == _byteStart)),
                        "[EncodingByteBuffer.MovePrevious]expected previous data or throw");
                    if (_chars > _charStart)
                    {
                        _chars--;                                       // don't use last char
                    }
                }

                if (bThrow)
                {
                    _enc.ThrowBytesOverflow(_encoder, _bytes == _byteStart);    // Throw? (and reset fallback if not converting)
                }
            }

            // Runs the encoder fallback for charFallback.
            internal unsafe bool Fallback(char charFallback)
            {
                return fallbackBuffer.InternalFallback(charFallback, ref _chars);
            }

            // See if fallbackBuffer is not empty or if there's data left in chars buffer.
            internal unsafe bool MoreData => (fallbackBuffer.Remaining > 0) || (_chars < _charEnd);

            // Next char to encode: the fallback buffer is asked first; when it
            // yields 0, the next input char is taken if there is one.
            internal unsafe char GetNextChar()
            {
                char next = fallbackBuffer.InternalGetNextChar();

                // Nothing in the fallback buffer, return our normal data.
                if (next == 0 && _chars < _charEnd)
                {
                    next = *(_chars++);
                }

                return next;
            }

            // Distance of the input cursor from the start of the input.
            internal unsafe int CharsUsed => (int)(_chars - _charStart);

            // Number of bytes produced (or counted) so far.
            internal unsafe int Count => _byteCountResult;
        }
    }
}
diff --git a/src/mscorlib/shared/System/Text/Latin1Encoding.cs b/src/mscorlib/shared/System/Text/Latin1Encoding.cs
new file mode 100644
index 0000000..3f65f55
--- /dev/null
+++ b/src/mscorlib/shared/System/Text/Latin1Encoding.cs
@@ -0,0 +1,894 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Diagnostics;
using System.Diagnostics.Contracts;
using System.Runtime.Serialization;

namespace System.Text
{
    //
    // Latin1Encoding is a simple override to optimize the GetString version of Latin1Encoding.
// because of the best fit cases we can't do this when encoding the string, only when decoding
    //
    internal class Latin1Encoding : EncodingNLS, ISerializable
    {
        // Used by Encoding.Latin1 for lazy initialization
        // The initialization code will not be run until a static member of the class is referenced
        internal static readonly Latin1Encoding s_default = new Latin1Encoding();

        // We only use the best-fit table, of which ASCII is a superset for us.
        public Latin1Encoding() : base(Encoding.ISO_8859_1)
        {
        }

        // ISerializable implementation; not supported.
        void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
        {
            throw new PlatformNotSupportedException();
        }

        // GetByteCount
        // Note: We start by assuming that the output will be the same as count.  Having
        //       an encoder or fallback may change that assumption
        internal override unsafe int GetByteCount(char* chars, int charCount, EncoderNLS encoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Debug.Assert(charCount >= 0, "[Latin1Encoding.GetByteCount]count is negative");
            Debug.Assert(chars != null, "[Latin1Encoding.GetByteCount]chars is null");

            // Assert because we shouldn't be able to have a null encoder.
            Debug.Assert(encoderFallback != null, "[Latin1Encoding.GetByteCount]Attempting to use null fallback encoder");

            char charLeftOver = (char)0;

            // If we have an encoder AND we aren't using default fallback,
            // then we may have a complicated count.
            EncoderReplacementFallback replacementFallback;
            if (encoder == null)
            {
                replacementFallback = this.EncoderFallback as EncoderReplacementFallback;
            }
            else
            {
                charLeftOver = encoder.charLeftOver;
                Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver),
                    "[Latin1Encoding.GetByteCount]leftover character should be high surrogate");

                replacementFallback = encoder.Fallback as EncoderReplacementFallback;

                // Verify that we have no fallbackbuffer, for Latin1 it's always empty, so just assert
                Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer ||
                    encoder.FallbackBuffer.Remaining == 0,
                    "[Latin1CodePageEncoding.GetByteCount]Expected empty fallback buffer");
            }

            // Fast path (a best-fit condition is commented out in the original: "|| bIsBestFit").
            if (replacementFallback != null && replacementFallback.MaxCharCount == 1)
            {
                // Replacement fallback encodes surrogate pairs as two ?? (or two whatever), so return size is always
                // same as input size.
                // Note that no existing SBCS code pages map code points to supplementary characters, so this is easy.

                // We could however have 1 extra byte if the last call had an encoder and a funky fallback and
                // if we don't use the funky fallback this time.

                // Do we have an extra char left over from last time?
                if (charLeftOver > 0)
                {
                    charCount++;
                }

                return (charCount);
            }

            // Count is more complicated if you have a funky fallback
            // For fallback we may need a fallback buffer, we know we're not default fallback
            int byteCount = 0;

            // Start by assuming default count, then +/- for fallback characters
            char* charEnd = chars + charCount;

            // For fallback we may need a fallback buffer, we know we aren't default fallback.
            EncoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            // We may have a left over character from last time, try and process it.
            if (charLeftOver > 0)
            {
                // Initialize the buffer
                Debug.Assert(encoder != null,
                    "[Latin1Encoding.GetByteCount]Expected encoder if we have charLeftOver");
                fallbackBuffer = encoder.FallbackBuffer;
                fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false);

                // Since left over char was a surrogate, it'll have to be fallen back.
                // Get Fallback
                // This will fallback a pair if *chars is a low surrogate
                charsForFallback = chars;
                fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                chars = charsForFallback;
            }

            // Now we may have fallback char[] already from the encoder

            // Go ahead and do it, including the fallback.
            char ch;
            while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 ||
                    chars < charEnd)
            {
                // First unwind any fallback
                if (ch == 0)
                {
                    // No fallback, just get next char
                    ch = *chars;
                    chars++;
                }

                // Check for fallback, this'll catch surrogate pairs too.
                // no chars >= 0x100 are allowed.
                if (ch > 0xff)
                {
                    // Initialize the buffer
                    if (fallbackBuffer == null)
                    {
                        if (encoder == null)
                        {
                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                        }
                        else
                        {
                            fallbackBuffer = encoder.FallbackBuffer;
                        }
                        fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, false);
                    }

                    // Get Fallback; whatever it yields is picked up by the loop condition.
                    charsForFallback = chars;
                    fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                    chars = charsForFallback;
                }
                else
                {
                    // We'll use this one
                    byteCount++;
                }
            }

            Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
                "[Latin1Encoding.GetByteCount]Expected Empty fallback buffer");

            return byteCount;
        }

        internal override unsafe int GetBytes(char* chars, int charCount,
                                                byte* bytes, int byteCount, EncoderNLS encoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Debug.Assert(bytes != null, "[Latin1Encoding.GetBytes]bytes is null");
            Debug.Assert(byteCount >= 0, "[Latin1Encoding.GetBytes]byteCount is negative");
            Debug.Assert(chars != null, "[Latin1Encoding.GetBytes]chars is null");
            Debug.Assert(charCount >= 0, "[Latin1Encoding.GetBytes]charCount is negative");

            // Assert because we shouldn't be able to have a null encoder.
+ Debug.Assert(encoderFallback != null, "[Latin1Encoding.GetBytes]Attempting to use null encoder fallback"); + + // Get any left over characters & check fast or slower fallback type + char charLeftOver = (char)0; + EncoderReplacementFallback fallback = null; + if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + fallback = encoder.Fallback as EncoderReplacementFallback; + Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), + "[Latin1Encoding.GetBytes]leftover character should be high surrogate"); + + // Verify that we have no fallbackbuffer, for ASCII its always empty, so just assert + Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer || + encoder.FallbackBuffer.Remaining == 0, + "[Latin1CodePageEncoding.GetBytes]Expected empty fallback buffer"); + } + else + { + fallback = this.EncoderFallback as EncoderReplacementFallback; + } + + // prepare our end + char* charEnd = chars + charCount; + byte* byteStart = bytes; + char* charStart = chars; + + // See if we do the fast default or slightly slower fallback + if (fallback != null && fallback.MaxCharCount == 1) + { + // Fast version + char cReplacement = fallback.DefaultString[0]; + + // Check for replacements in range, otherwise fall back to slow version. + if (cReplacement <= (char)0xff) + { + // We should have exactly as many output bytes as input bytes, unless there's a left + // over character, in which case we may need one more. + + // If we had a left over character will have to add a ? (This happens if they had a funky + // fallback last time, but not this time.) 
(We can't spit any out though + // because with fallback encoder each surrogate is treated as a seperate code point) + if (charLeftOver > 0) + { + // Have to have room + // Throw even if doing no throw version because this is just 1 char, + // so buffer will never be big enough + if (byteCount == 0) + ThrowBytesOverflow(encoder, true); + + // This'll make sure we still have more room and also make sure our return value is correct. + *(bytes++) = (byte)cReplacement; + byteCount--; // We used one of the ones we were counting. + } + + // This keeps us from overrunning our output buffer + if (byteCount < charCount) + { + // Throw or make buffer smaller? + ThrowBytesOverflow(encoder, byteCount < 1); + + // Just use what we can + charEnd = chars + byteCount; + } + + // We just do a quick copy + while (chars < charEnd) + { + char ch2 = *(chars++); + if (ch2 > 0x00ff) *(bytes++) = (byte)cReplacement; + else *(bytes++) = (byte)ch2; + } + + // Clear encoder + if (encoder != null) + { + encoder.charLeftOver = (char)0; + encoder.m_charsUsed = (int)(chars - charStart); + } + return (int)(bytes - byteStart); + } + } + + // Slower version, have to do real fallback. + + // prepare our end + byte* byteEnd = bytes + byteCount; + + // For fallback we may need a fallback buffer, we know we aren't default fallback, create & init it + EncoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + // We may have a left over character from last time, try and process it. + if (charLeftOver > 0) + { + // Since left over char was a surrogate, it'll have to be fallen back. + // Get Fallback + Debug.Assert(encoder != null, + "[Latin1Encoding.GetBytes]Expected encoder if we have charLeftOver"); + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(chars, charEnd, encoder, true); + + // Since left over char was a surrogate, it'll have to be fallen back. 
+ // Get Fallback + // This will fallback a pair if *chars is a low surrogate + charsForFallback = chars; + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + if (fallbackBuffer.Remaining > byteEnd - bytes) + { + // Throw it, if we don't have enough for this we never will + ThrowBytesOverflow(encoder, true); + } + } + + // Now we may have fallback char[] already from the encoder fallback above + + // Go ahead and do it, including the fallback. + char ch; + while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || + chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, just get next char + ch = *chars; + chars++; + } + + // Check for fallback, this'll catch surrogate pairs too. + // All characters >= 0x100 must fall back. + if (ch > 0xff) + { + // Initialize the buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, true); + } + + // Get Fallback + charsForFallback = chars; + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + + // Make sure we have enough room. Each fallback char will be 1 output char + // (or else cause a recursion exception) + if (fallbackBuffer.Remaining > byteEnd - bytes) + { + // Didn't use this char, throw it. 
Chars should've advanced by now + // If we had encoder fallback data it would've thrown before the loop + Debug.Assert(chars > charStart, + "[Latin1Encoding.GetBytes]Expected chars to have advanced (fallback case)"); + chars--; + fallbackBuffer.InternalReset(); + + // Throw it + ThrowBytesOverflow(encoder, chars == charStart); + break; + } + + continue; + } + + // We'll use this one + // Bounds check + if (bytes >= byteEnd) + { + // didn't use this char, we'll throw or use buffer + Debug.Assert(fallbackBuffer == null || fallbackBuffer.bFallingBack == false, + "[Latin1Encoding.GetBytes]Expected fallback to have throw initially if insufficient space"); + if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) + { + Debug.Assert(chars > charStart, + "[Latin1Encoding.GetBytes]Expected chars to have advanced (fallback case)"); + chars--; // don't use last char + } + ThrowBytesOverflow(encoder, chars == charStart); // throw ? + break; // don't throw, stop + } + + // Go ahead and add it + *bytes = unchecked((byte)ch); + bytes++; + } + + // Need to do encoder stuff + if (encoder != null) + { + // Fallback stuck it in encoder if necessary, but we have to clear MustFlush cases + if (fallbackBuffer != null && !fallbackBuffer.bUsedEncoder) + // Clear it in case of MustFlush + encoder.charLeftOver = (char)0; + + // Set our chars used count + encoder.m_charsUsed = (int)(chars - charStart); + } + + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[Latin1Encoding.GetBytes]Expected Empty fallback buffer"); + + return (int)(bytes - byteStart); + } + + // This is internal and called by something else, + internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder) + { + // Just assert, we're called internally so these should be safe, checked already + Debug.Assert(bytes != null, "[Latin1Encoding.GetCharCount]bytes is null"); + Debug.Assert(count >= 0, "[Latin1Encoding.GetCharCount]byteCount is negative"); + + // Just return 
length, SBCS stay the same length because they don't map to surrogate + // pairs and we don't have to fallback because all latin1Encoding code points are unicode + return count; + } + + internal override unsafe int GetChars(byte* bytes, int byteCount, + char* chars, int charCount, DecoderNLS decoder) + { + // Just need to ASSERT, this is called by something else internal that checked parameters already + Debug.Assert(bytes != null, "[Latin1Encoding.GetChars]bytes is null"); + Debug.Assert(byteCount >= 0, "[Latin1Encoding.GetChars]byteCount is negative"); + Debug.Assert(chars != null, "[Latin1Encoding.GetChars]chars is null"); + Debug.Assert(charCount >= 0, "[Latin1Encoding.GetChars]charCount is negative"); + + // Need byteCount chars, otherwise too small buffer + if (charCount < byteCount) + { + // Buffer too small. Do we throw? + ThrowCharsOverflow(decoder, charCount < 1); + + // Don't throw, correct buffer size + byteCount = charCount; + } + + // Do it our fast way + byte* byteEnd = bytes + byteCount; + + // Quick loop, all bytes are the same as chars, so no fallbacks for latin1 + while (bytes < byteEnd) + { + *(chars) = unchecked((char)*(bytes)); + chars++; + bytes++; + } + + // Might need to know input bytes used + if (decoder != null) + decoder.m_bytesUsed = byteCount; + + // Converted sequence is same length as input, so output charsUsed is same as byteCount; + return byteCount; + } + + public override int GetMaxByteCount(int charCount) + { + if (charCount < 0) + throw new ArgumentOutOfRangeException(nameof(charCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Characters would be # of characters + 1 in case high surrogate is ? * max fallback + long byteCount = (long)charCount + 1; + + if (EncoderFallback.MaxCharCount > 1) + byteCount *= EncoderFallback.MaxCharCount; + + // 1 to 1 for most characters. Only surrogates with fallbacks have less. 
+ + if (byteCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow); + return (int)byteCount; + } + + public override int GetMaxCharCount(int byteCount) + { + if (byteCount < 0) + throw new ArgumentOutOfRangeException(nameof(byteCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Just return length, SBCS stay the same length because they don't map to surrogate + long charCount = (long)byteCount; + + // 1 to 1 for most characters. Only surrogates with fallbacks have less, unknown fallbacks could be longer. + if (DecoderFallback.MaxCharCount > 1) + charCount *= DecoderFallback.MaxCharCount; + + if (charCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow); + + return (int)charCount; + } + + // True if and only if the encoding only uses single byte code points. (Ie, ASCII, 1252, etc) + public override bool IsSingleByte + { + get + { + return true; + } + } + + public override bool IsAlwaysNormalized(NormalizationForm form) + { + // Latin-1 contains precomposed characters, so normal for Form C. + // Since some are composed, not normal for D & KD. + // Also some letters like 0x00A8 (spacing diarisis) have compatibility decompositions, so false for KD & KC. + + // Only true for form C. + return (form == NormalizationForm.FormC); + } + // Since our best fit table is small we'll hard code it + internal override char[] GetBestFitUnicodeToBytesData() + { + // Get our best fit data + return Latin1Encoding.arrayCharBestFit; + } + + // Best fit for ASCII, and since it works for ASCII, we use it for latin1 as well. + private static readonly char[] arrayCharBestFit = + { +// The first many are in case you wanted to use this for ASCIIEncoding, which we don't need to do any more. +// (char)0x00a0, (char)0x0020, // No-Break Space -> Space +// (char)0x00a1, (char)0x0021, // Inverted Exclamation Mark -> ! 
+// (char)0x00a2, (char)0x0063, // Cent Sign -> c +// (char)0x00a3, (char)0x003f, // Pound Sign +// (char)0x00a4, (char)0x0024, // Currency Sign -> $ +// (char)0x00a5, (char)0x0059, // Yen Sign -> Y +// (char)0x00a6, (char)0x007c, // Broken Bar -> | +// (char)0x00a7, (char)0x003f, // Section Sign +// (char)0x00a8, (char)0x003f, // Diaeresis +// (char)0x00a9, (char)0x0043, // Copyright Sign -> C +// (char)0x00aa, (char)0x0061, // Feminine Ordinal Indicator -> a +// (char)0x00ab, (char)0x003c, // Left-Pointing Double Angle Quotation Mark -> < +// (char)0x00ac, (char)0x003f, // Not Sign +// (char)0x00ad, (char)0x002d, // Soft Hyphen -> - +// (char)0x00ae, (char)0x0052, // Registered Sign -> R +// (char)0x00af, (char)0x003f, // Macron +// (char)0x00b0, (char)0x003f, // Degree Sign +// (char)0x00b1, (char)0x003f, // Plus-Minus Sign +// (char)0x00b2, (char)0x0032, // Superscript Two -> 2 +// (char)0x00b3, (char)0x0033, // Superscript Three -> 3 +// (char)0x00b4, (char)0x003f, // Acute Accent +// (char)0x00b5, (char)0x003f, // Micro Sign +// (char)0x00b6, (char)0x003f, // Pilcrow Sign +// (char)0x00b7, (char)0x002e, // Middle Dot -> . 
+// (char)0x00b8, (char)0x002c, // Cedilla -> , +// (char)0x00b9, (char)0x0031, // Superscript One -> 1 +// (char)0x00ba, (char)0x006f, // Masculine Ordinal Indicator -> o +// (char)0x00bb, (char)0x003e, // Right-Pointing Double Angle Quotation Mark -> > +// (char)0x00bc, (char)0x003f, // Vulgar Fraction One Quarter +// (char)0x00bd, (char)0x003f, // Vulgar Fraction One Half +// (char)0x00be, (char)0x003f, // Vulgar Fraction Three Quarters +// (char)0x00bf, (char)0x003f, // Inverted Question Mark +// (char)0x00c0, (char)0x0041, // Latin Capital Letter A With Grave -> A +// (char)0x00c1, (char)0x0041, // Latin Capital Letter A With Acute -> A +// (char)0x00c2, (char)0x0041, // Latin Capital Letter A With Circumflex -> A +// (char)0x00c3, (char)0x0041, // Latin Capital Letter A With Tilde -> A +// (char)0x00c4, (char)0x0041, // Latin Capital Letter A With Diaeresis -> A +// (char)0x00c5, (char)0x0041, // Latin Capital Letter A With Ring Above -> A +// (char)0x00c6, (char)0x0041, // Latin Capital Ligature Ae -> A +// (char)0x00c7, (char)0x0043, // Latin Capital Letter C With Cedilla -> C +// (char)0x00c8, (char)0x0045, // Latin Capital Letter E With Grave -> E +// (char)0x00c9, (char)0x0045, // Latin Capital Letter E With Acute -> E +// (char)0x00ca, (char)0x0045, // Latin Capital Letter E With Circumflex -> E +// (char)0x00cb, (char)0x0045, // Latin Capital Letter E With Diaeresis -> E +// (char)0x00cc, (char)0x0049, // Latin Capital Letter I With Grave -> I +// (char)0x00cd, (char)0x0049, // Latin Capital Letter I With Acute -> I +// (char)0x00ce, (char)0x0049, // Latin Capital Letter I With Circumflex -> I +// (char)0x00cf, (char)0x0049, // Latin Capital Letter I With Diaeresis -> I +// (char)0x00d0, (char)0x0044, // Latin Capital Letter Eth -> D +// (char)0x00d1, (char)0x004e, // Latin Capital Letter N With Tilde -> N +// (char)0x00d2, (char)0x004f, // Latin Capital Letter O With Grave -> O +// (char)0x00d3, (char)0x004f, // Latin Capital Letter O With Acute -> O 
+// (char)0x00d4, (char)0x004f, // Latin Capital Letter O With Circumflex -> O +// (char)0x00d5, (char)0x004f, // Latin Capital Letter O With Tilde -> O +// (char)0x00d6, (char)0x004f, // Latin Capital Letter O With Diaeresis -> O +// (char)0x00d7, (char)0x003f, // Multiplication Sign +// (char)0x00d8, (char)0x004f, // Latin Capital Letter O With Stroke -> O +// (char)0x00d9, (char)0x0055, // Latin Capital Letter U With Grave -> U +// (char)0x00da, (char)0x0055, // Latin Capital Letter U With Acute -> U +// (char)0x00db, (char)0x0055, // Latin Capital Letter U With Circumflex -> U +// (char)0x00dc, (char)0x0055, // Latin Capital Letter U With Diaeresis -> U +// (char)0x00dd, (char)0x0059, // Latin Capital Letter Y With Acute -> Y +// (char)0x00de, (char)0x003f, // Latin Capital Letter Thorn +// (char)0x00df, (char)0x003f, // Latin Small Letter Sharp S +// (char)0x00e0, (char)0x0061, // Latin Small Letter A With Grave -> a +// (char)0x00e1, (char)0x0061, // Latin Small Letter A With Acute -> a +// (char)0x00e2, (char)0x0061, // Latin Small Letter A With Circumflex -> a +// (char)0x00e3, (char)0x0061, // Latin Small Letter A With Tilde -> a +// (char)0x00e4, (char)0x0061, // Latin Small Letter A With Diaeresis -> a +// (char)0x00e5, (char)0x0061, // Latin Small Letter A With Ring Above -> a +// (char)0x00e6, (char)0x0061, // Latin Small Ligature Ae -> a +// (char)0x00e7, (char)0x0063, // Latin Small Letter C With Cedilla -> c +// (char)0x00e8, (char)0x0065, // Latin Small Letter E With Grave -> e +// (char)0x00e9, (char)0x0065, // Latin Small Letter E With Acute -> e +// (char)0x00ea, (char)0x0065, // Latin Small Letter E With Circumflex -> e +// (char)0x00eb, (char)0x0065, // Latin Small Letter E With Diaeresis -> e +// (char)0x00ec, (char)0x0069, // Latin Small Letter I With Grave -> i +// (char)0x00ed, (char)0x0069, // Latin Small Letter I With Acute -> i +// (char)0x00ee, (char)0x0069, // Latin Small Letter I With Circumflex -> i +// (char)0x00ef, (char)0x0069, 
// Latin Small Letter I With Diaeresis -> i +// (char)0x00f0, (char)0x003f, // Latin Small Letter Eth +// (char)0x00f1, (char)0x006e, // Latin Small Letter N With Tilde -> n +// (char)0x00f2, (char)0x006f, // Latin Small Letter O With Grave -> o +// (char)0x00f3, (char)0x006f, // Latin Small Letter O With Acute -> o +// (char)0x00f4, (char)0x006f, // Latin Small Letter O With Circumflex -> o +// (char)0x00f5, (char)0x006f, // Latin Small Letter O With Tilde -> o +// (char)0x00f6, (char)0x006f, // Latin Small Letter O With Diaeresis -> o +// (char)0x00f7, (char)0x003f, // Division Sign +// (char)0x00f8, (char)0x006f, // Latin Small Letter O With Stroke -> o +// (char)0x00f9, (char)0x0075, // Latin Small Letter U With Grave -> u +// (char)0x00fa, (char)0x0075, // Latin Small Letter U With Acute -> u +// (char)0x00fb, (char)0x0075, // Latin Small Letter U With Circumflex -> u +// (char)0x00fc, (char)0x0075, // Latin Small Letter U With Diaeresis -> u +// (char)0x00fd, (char)0x0079, // Latin Small Letter Y With Acute -> y +// (char)0x00fe, (char)0x003f, // Latin Small Letter Thorn +// (char)0x00ff, (char)0x0079, // Latin Small Letter Y With Diaeresis -> y + (char)0x0100, (char)0x0041, // Latin Capital Letter A With Macron -> A + (char)0x0101, (char)0x0061, // Latin Small Letter A With Macron -> a + (char)0x0102, (char)0x0041, // Latin Capital Letter A With Breve -> A + (char)0x0103, (char)0x0061, // Latin Small Letter A With Breve -> a + (char)0x0104, (char)0x0041, // Latin Capital Letter A With Ogonek -> A + (char)0x0105, (char)0x0061, // Latin Small Letter A With Ogonek -> a + (char)0x0106, (char)0x0043, // Latin Capital Letter C With Acute -> C + (char)0x0107, (char)0x0063, // Latin Small Letter C With Acute -> c + (char)0x0108, (char)0x0043, // Latin Capital Letter C With Circumflex -> C + (char)0x0109, (char)0x0063, // Latin Small Letter C With Circumflex -> c + (char)0x010a, (char)0x0043, // Latin Capital Letter C With Dot Above -> C + (char)0x010b, (char)0x0063, 
// Latin Small Letter C With Dot Above -> c + (char)0x010c, (char)0x0043, // Latin Capital Letter C With Caron -> C + (char)0x010d, (char)0x0063, // Latin Small Letter C With Caron -> c + (char)0x010e, (char)0x0044, // Latin Capital Letter D With Caron -> D + (char)0x010f, (char)0x0064, // Latin Small Letter D With Caron -> d + (char)0x0110, (char)0x0044, // Latin Capital Letter D With Stroke -> D + (char)0x0111, (char)0x0064, // Latin Small Letter D With Stroke -> d + (char)0x0112, (char)0x0045, // Latin Capital Letter E With Macron -> E + (char)0x0113, (char)0x0065, // Latin Small Letter E With Macron -> e + (char)0x0114, (char)0x0045, // Latin Capital Letter E With Breve -> E + (char)0x0115, (char)0x0065, // Latin Small Letter E With Breve -> e + (char)0x0116, (char)0x0045, // Latin Capital Letter E With Dot Above -> E + (char)0x0117, (char)0x0065, // Latin Small Letter E With Dot Above -> e + (char)0x0118, (char)0x0045, // Latin Capital Letter E With Ogonek -> E + (char)0x0119, (char)0x0065, // Latin Small Letter E With Ogonek -> e + (char)0x011a, (char)0x0045, // Latin Capital Letter E With Caron -> E + (char)0x011b, (char)0x0065, // Latin Small Letter E With Caron -> e + (char)0x011c, (char)0x0047, // Latin Capital Letter G With Circumflex -> G + (char)0x011d, (char)0x0067, // Latin Small Letter G With Circumflex -> g + (char)0x011e, (char)0x0047, // Latin Capital Letter G With Breve -> G + (char)0x011f, (char)0x0067, // Latin Small Letter G With Breve -> g + (char)0x0120, (char)0x0047, // Latin Capital Letter G With Dot Above -> G + (char)0x0121, (char)0x0067, // Latin Small Letter G With Dot Above -> g + (char)0x0122, (char)0x0047, // Latin Capital Letter G With Cedilla -> G + (char)0x0123, (char)0x0067, // Latin Small Letter G With Cedilla -> g + (char)0x0124, (char)0x0048, // Latin Capital Letter H With Circumflex -> H + (char)0x0125, (char)0x0068, // Latin Small Letter H With Circumflex -> h + (char)0x0126, (char)0x0048, // Latin Capital Letter H With 
Stroke -> H + (char)0x0127, (char)0x0068, // Latin Small Letter H With Stroke -> h + (char)0x0128, (char)0x0049, // Latin Capital Letter I With Tilde -> I + (char)0x0129, (char)0x0069, // Latin Small Letter I With Tilde -> i + (char)0x012a, (char)0x0049, // Latin Capital Letter I With Macron -> I + (char)0x012b, (char)0x0069, // Latin Small Letter I With Macron -> i + (char)0x012c, (char)0x0049, // Latin Capital Letter I With Breve -> I + (char)0x012d, (char)0x0069, // Latin Small Letter I With Breve -> i + (char)0x012e, (char)0x0049, // Latin Capital Letter I With Ogonek -> I + (char)0x012f, (char)0x0069, // Latin Small Letter I With Ogonek -> i + (char)0x0130, (char)0x0049, // Latin Capital Letter I With Dot Above -> I + (char)0x0131, (char)0x0069, // Latin Small Letter Dotless I -> i + (char)0x0134, (char)0x004a, // Latin Capital Letter J With Circumflex -> J + (char)0x0135, (char)0x006a, // Latin Small Letter J With Circumflex -> j + (char)0x0136, (char)0x004b, // Latin Capital Letter K With Cedilla -> K + (char)0x0137, (char)0x006b, // Latin Small Letter K With Cedilla -> k + (char)0x0139, (char)0x004c, // Latin Capital Letter L With Acute -> L + (char)0x013a, (char)0x006c, // Latin Small Letter L With Acute -> l + (char)0x013b, (char)0x004c, // Latin Capital Letter L With Cedilla -> L + (char)0x013c, (char)0x006c, // Latin Small Letter L With Cedilla -> l + (char)0x013d, (char)0x004c, // Latin Capital Letter L With Caron -> L + (char)0x013e, (char)0x006c, // Latin Small Letter L With Caron -> l + (char)0x0141, (char)0x004c, // Latin Capital Letter L With Stroke -> L + (char)0x0142, (char)0x006c, // Latin Small Letter L With Stroke -> l + (char)0x0143, (char)0x004e, // Latin Capital Letter N With Acute -> N + (char)0x0144, (char)0x006e, // Latin Small Letter N With Acute -> n + (char)0x0145, (char)0x004e, // Latin Capital Letter N With Cedilla -> N + (char)0x0146, (char)0x006e, // Latin Small Letter N With Cedilla -> n + (char)0x0147, (char)0x004e, // Latin 
Capital Letter N With Caron -> N + (char)0x0148, (char)0x006e, // Latin Small Letter N With Caron -> n + (char)0x014c, (char)0x004f, // Latin Capital Letter O With Macron -> O + (char)0x014d, (char)0x006f, // Latin Small Letter O With Macron -> o + (char)0x014e, (char)0x004f, // Latin Capital Letter O With Breve -> O + (char)0x014f, (char)0x006f, // Latin Small Letter O With Breve -> o + (char)0x0150, (char)0x004f, // Latin Capital Letter O With Double Acute -> O + (char)0x0151, (char)0x006f, // Latin Small Letter O With Double Acute -> o + (char)0x0152, (char)0x004f, // Latin Capital Ligature Oe -> O + (char)0x0153, (char)0x006f, // Latin Small Ligature Oe -> o + (char)0x0154, (char)0x0052, // Latin Capital Letter R With Acute -> R + (char)0x0155, (char)0x0072, // Latin Small Letter R With Acute -> r + (char)0x0156, (char)0x0052, // Latin Capital Letter R With Cedilla -> R + (char)0x0157, (char)0x0072, // Latin Small Letter R With Cedilla -> r + (char)0x0158, (char)0x0052, // Latin Capital Letter R With Caron -> R + (char)0x0159, (char)0x0072, // Latin Small Letter R With Caron -> r + (char)0x015a, (char)0x0053, // Latin Capital Letter S With Acute -> S + (char)0x015b, (char)0x0073, // Latin Small Letter S With Acute -> s + (char)0x015c, (char)0x0053, // Latin Capital Letter S With Circumflex -> S + (char)0x015d, (char)0x0073, // Latin Small Letter S With Circumflex -> s + (char)0x015e, (char)0x0053, // Latin Capital Letter S With Cedilla -> S + (char)0x015f, (char)0x0073, // Latin Small Letter S With Cedilla -> s + (char)0x0160, (char)0x0053, // Latin Capital Letter S With Caron -> S + (char)0x0161, (char)0x0073, // Latin Small Letter S With Caron -> s + (char)0x0162, (char)0x0054, // Latin Capital Letter T With Cedilla -> T + (char)0x0163, (char)0x0074, // Latin Small Letter T With Cedilla -> t + (char)0x0164, (char)0x0054, // Latin Capital Letter T With Caron -> T + (char)0x0165, (char)0x0074, // Latin Small Letter T With Caron -> t + (char)0x0166, 
(char)0x0054, // Latin Capital Letter T With Stroke -> T + (char)0x0167, (char)0x0074, // Latin Small Letter T With Stroke -> t + (char)0x0168, (char)0x0055, // Latin Capital Letter U With Tilde -> U + (char)0x0169, (char)0x0075, // Latin Small Letter U With Tilde -> u + (char)0x016a, (char)0x0055, // Latin Capital Letter U With Macron -> U + (char)0x016b, (char)0x0075, // Latin Small Letter U With Macron -> u + (char)0x016c, (char)0x0055, // Latin Capital Letter U With Breve -> U + (char)0x016d, (char)0x0075, // Latin Small Letter U With Breve -> u + (char)0x016e, (char)0x0055, // Latin Capital Letter U With Ring Above -> U + (char)0x016f, (char)0x0075, // Latin Small Letter U With Ring Above -> u + (char)0x0170, (char)0x0055, // Latin Capital Letter U With Double Acute -> U + (char)0x0171, (char)0x0075, // Latin Small Letter U With Double Acute -> u + (char)0x0172, (char)0x0055, // Latin Capital Letter U With Ogonek -> U + (char)0x0173, (char)0x0075, // Latin Small Letter U With Ogonek -> u + (char)0x0174, (char)0x0057, // Latin Capital Letter W With Circumflex -> W + (char)0x0175, (char)0x0077, // Latin Small Letter W With Circumflex -> w + (char)0x0176, (char)0x0059, // Latin Capital Letter Y With Circumflex -> Y + (char)0x0177, (char)0x0079, // Latin Small Letter Y With Circumflex -> y + (char)0x0178, (char)0x0059, // Latin Capital Letter Y With Diaeresis -> Y + (char)0x0179, (char)0x005a, // Latin Capital Letter Z With Acute -> Z + (char)0x017a, (char)0x007a, // Latin Small Letter Z With Acute -> z + (char)0x017b, (char)0x005a, // Latin Capital Letter Z With Dot Above -> Z + (char)0x017c, (char)0x007a, // Latin Small Letter Z With Dot Above -> z + (char)0x017d, (char)0x005a, // Latin Capital Letter Z With Caron -> Z + (char)0x017e, (char)0x007a, // Latin Small Letter Z With Caron -> z + (char)0x0180, (char)0x0062, // Latin Small Letter B With Stroke -> b + (char)0x0189, (char)0x0044, // Latin Capital Letter African D -> D + (char)0x0191, (char)0x0046, // 
Latin Capital Letter F With Hook -> F + (char)0x0192, (char)0x0066, // Latin Small Letter F With Hook -> f + (char)0x0197, (char)0x0049, // Latin Capital Letter I With Stroke -> I + (char)0x019a, (char)0x006c, // Latin Small Letter L With Bar -> l + (char)0x019f, (char)0x004f, // Latin Capital Letter O With Middle Tilde -> O + (char)0x01a0, (char)0x004f, // Latin Capital Letter O With Horn -> O + (char)0x01a1, (char)0x006f, // Latin Small Letter O With Horn -> o + (char)0x01ab, (char)0x0074, // Latin Small Letter T With Palatal Hook -> t + (char)0x01ae, (char)0x0054, // Latin Capital Letter T With Retroflex Hook -> T + (char)0x01af, (char)0x0055, // Latin Capital Letter U With Horn -> U + (char)0x01b0, (char)0x0075, // Latin Small Letter U With Horn -> u + (char)0x01b6, (char)0x007a, // Latin Small Letter Z With Stroke -> z + (char)0x01cd, (char)0x0041, // Latin Capital Letter A With Caron -> A + (char)0x01ce, (char)0x0061, // Latin Small Letter A With Caron -> a + (char)0x01cf, (char)0x0049, // Latin Capital Letter I With Caron -> I + (char)0x01d0, (char)0x0069, // Latin Small Letter I With Caron -> i + (char)0x01d1, (char)0x004f, // Latin Capital Letter O With Caron -> O + (char)0x01d2, (char)0x006f, // Latin Small Letter O With Caron -> o + (char)0x01d3, (char)0x0055, // Latin Capital Letter U With Caron -> U + (char)0x01d4, (char)0x0075, // Latin Small Letter U With Caron -> u + (char)0x01d5, (char)0x0055, // Latin Capital Letter U With Diaeresis And Macron -> U + (char)0x01d6, (char)0x0075, // Latin Small Letter U With Diaeresis And Macron -> u + (char)0x01d7, (char)0x0055, // Latin Capital Letter U With Diaeresis And Acute -> U + (char)0x01d8, (char)0x0075, // Latin Small Letter U With Diaeresis And Acute -> u + (char)0x01d9, (char)0x0055, // Latin Capital Letter U With Diaeresis And Caron -> U + (char)0x01da, (char)0x0075, // Latin Small Letter U With Diaeresis And Caron -> u + (char)0x01db, (char)0x0055, // Latin Capital Letter U With Diaeresis And Grave -> 
U + (char)0x01dc, (char)0x0075, // Latin Small Letter U With Diaeresis And Grave -> u + (char)0x01de, (char)0x0041, // Latin Capital Letter A With Diaeresis And Macron -> A + (char)0x01df, (char)0x0061, // Latin Small Letter A With Diaeresis And Macron -> a + (char)0x01e4, (char)0x0047, // Latin Capital Letter G With Stroke -> G + (char)0x01e5, (char)0x0067, // Latin Small Letter G With Stroke -> g + (char)0x01e6, (char)0x0047, // Latin Capital Letter G With Caron -> G + (char)0x01e7, (char)0x0067, // Latin Small Letter G With Caron -> g + (char)0x01e8, (char)0x004b, // Latin Capital Letter K With Caron -> K + (char)0x01e9, (char)0x006b, // Latin Small Letter K With Caron -> k + (char)0x01ea, (char)0x004f, // Latin Capital Letter O With Ogonek -> O + (char)0x01eb, (char)0x006f, // Latin Small Letter O With Ogonek -> o + (char)0x01ec, (char)0x004f, // Latin Capital Letter O With Ogonek And Macron -> O + (char)0x01ed, (char)0x006f, // Latin Small Letter O With Ogonek And Macron -> o + (char)0x01f0, (char)0x006a, // Latin Small Letter J With Caron -> j + (char)0x0261, (char)0x0067, // Latin Small Letter Script G -> g + (char)0x02b9, (char)0x0027, // Modifier Letter Prime -> ' + (char)0x02ba, (char)0x0022, // Modifier Letter Double Prime -> " + (char)0x02bc, (char)0x0027, // Modifier Letter Apostrophe -> ' + (char)0x02c4, (char)0x005e, // Modifier Letter Up Arrowhead -> ^ + (char)0x02c6, (char)0x005e, // Modifier Letter Circumflex Accent -> ^ + (char)0x02c8, (char)0x0027, // Modifier Letter Vertical Line -> ' + (char)0x02c9, (char)0x003f, // Modifier Letter Macron + (char)0x02ca, (char)0x003f, // Modifier Letter Acute Accent + (char)0x02cb, (char)0x0060, // Modifier Letter Grave Accent -> ` + (char)0x02cd, (char)0x005f, // Modifier Letter Low Macron -> _ + (char)0x02da, (char)0x003f, // Ring Above + (char)0x02dc, (char)0x007e, // Small Tilde -> ~ + (char)0x0300, (char)0x0060, // Combining Grave Accent -> ` + (char)0x0302, (char)0x005e, // Combining Circumflex Accent -> 
^ + (char)0x0303, (char)0x007e, // Combining Tilde -> ~ + (char)0x030e, (char)0x0022, // Combining Double Vertical Line Above -> " + (char)0x0331, (char)0x005f, // Combining Macron Below -> _ + (char)0x0332, (char)0x005f, // Combining Low Line -> _ + (char)0x2000, (char)0x0020, // En Quad + (char)0x2001, (char)0x0020, // Em Quad + (char)0x2002, (char)0x0020, // En Space + (char)0x2003, (char)0x0020, // Em Space + (char)0x2004, (char)0x0020, // Three-Per-Em Space + (char)0x2005, (char)0x0020, // Four-Per-Em Space + (char)0x2006, (char)0x0020, // Six-Per-Em Space + (char)0x2010, (char)0x002d, // Hyphen -> - + (char)0x2011, (char)0x002d, // Non-Breaking Hyphen -> - + (char)0x2013, (char)0x002d, // En Dash -> - + (char)0x2014, (char)0x002d, // Em Dash -> - + (char)0x2018, (char)0x0027, // Left Single Quotation Mark -> ' + (char)0x2019, (char)0x0027, // Right Single Quotation Mark -> ' + (char)0x201a, (char)0x002c, // Single Low-9 Quotation Mark -> , + (char)0x201c, (char)0x0022, // Left Double Quotation Mark -> " + (char)0x201d, (char)0x0022, // Right Double Quotation Mark -> " + (char)0x201e, (char)0x0022, // Double Low-9 Quotation Mark -> " + (char)0x2020, (char)0x003f, // Dagger + (char)0x2021, (char)0x003f, // Double Dagger + (char)0x2022, (char)0x002e, // Bullet -> . + (char)0x2026, (char)0x002e, // Horizontal Ellipsis -> . + (char)0x2030, (char)0x003f, // Per Mille Sign + (char)0x2032, (char)0x0027, // Prime -> ' + (char)0x2035, (char)0x0060, // Reversed Prime -> ` + (char)0x2039, (char)0x003c, // Single Left-Pointing Angle Quotation Mark -> < + (char)0x203a, (char)0x003e, // Single Right-Pointing Angle Quotation Mark -> > + (char)0x2122, (char)0x0054, // Trade Mark Sign -> T + (char)0xff01, (char)0x0021, // Fullwidth Exclamation Mark -> ! 
+ (char)0xff02, (char)0x0022, // Fullwidth Quotation Mark -> " + (char)0xff03, (char)0x0023, // Fullwidth Number Sign -> # + (char)0xff04, (char)0x0024, // Fullwidth Dollar Sign -> $ + (char)0xff05, (char)0x0025, // Fullwidth Percent Sign -> % + (char)0xff06, (char)0x0026, // Fullwidth Ampersand -> & + (char)0xff07, (char)0x0027, // Fullwidth Apostrophe -> ' + (char)0xff08, (char)0x0028, // Fullwidth Left Parenthesis -> ( + (char)0xff09, (char)0x0029, // Fullwidth Right Parenthesis -> ) + (char)0xff0a, (char)0x002a, // Fullwidth Asterisk -> * + (char)0xff0b, (char)0x002b, // Fullwidth Plus Sign -> + + (char)0xff0c, (char)0x002c, // Fullwidth Comma -> , + (char)0xff0d, (char)0x002d, // Fullwidth Hyphen-Minus -> - + (char)0xff0e, (char)0x002e, // Fullwidth Full Stop -> . + (char)0xff0f, (char)0x002f, // Fullwidth Solidus -> / + (char)0xff10, (char)0x0030, // Fullwidth Digit Zero -> 0 + (char)0xff11, (char)0x0031, // Fullwidth Digit One -> 1 + (char)0xff12, (char)0x0032, // Fullwidth Digit Two -> 2 + (char)0xff13, (char)0x0033, // Fullwidth Digit Three -> 3 + (char)0xff14, (char)0x0034, // Fullwidth Digit Four -> 4 + (char)0xff15, (char)0x0035, // Fullwidth Digit Five -> 5 + (char)0xff16, (char)0x0036, // Fullwidth Digit Six -> 6 + (char)0xff17, (char)0x0037, // Fullwidth Digit Seven -> 7 + (char)0xff18, (char)0x0038, // Fullwidth Digit Eight -> 8 + (char)0xff19, (char)0x0039, // Fullwidth Digit Nine -> 9 + (char)0xff1a, (char)0x003a, // Fullwidth Colon -> : + (char)0xff1b, (char)0x003b, // Fullwidth Semicolon -> ; + (char)0xff1c, (char)0x003c, // Fullwidth Less-Than Sign -> < + (char)0xff1d, (char)0x003d, // Fullwidth Equals Sign -> = + (char)0xff1e, (char)0x003e, // Fullwidth Greater-Than Sign -> > + (char)0xff1f, (char)0x003f, // Fullwidth Question Mark + (char)0xff20, (char)0x0040, // Fullwidth Commercial At -> @ + (char)0xff21, (char)0x0041, // Fullwidth Latin Capital Letter A -> A + (char)0xff22, (char)0x0042, // Fullwidth Latin Capital Letter B -> B + 
(char)0xff23, (char)0x0043, // Fullwidth Latin Capital Letter C -> C + (char)0xff24, (char)0x0044, // Fullwidth Latin Capital Letter D -> D + (char)0xff25, (char)0x0045, // Fullwidth Latin Capital Letter E -> E + (char)0xff26, (char)0x0046, // Fullwidth Latin Capital Letter F -> F + (char)0xff27, (char)0x0047, // Fullwidth Latin Capital Letter G -> G + (char)0xff28, (char)0x0048, // Fullwidth Latin Capital Letter H -> H + (char)0xff29, (char)0x0049, // Fullwidth Latin Capital Letter I -> I + (char)0xff2a, (char)0x004a, // Fullwidth Latin Capital Letter J -> J + (char)0xff2b, (char)0x004b, // Fullwidth Latin Capital Letter K -> K + (char)0xff2c, (char)0x004c, // Fullwidth Latin Capital Letter L -> L + (char)0xff2d, (char)0x004d, // Fullwidth Latin Capital Letter M -> M + (char)0xff2e, (char)0x004e, // Fullwidth Latin Capital Letter N -> N + (char)0xff2f, (char)0x004f, // Fullwidth Latin Capital Letter O -> O + (char)0xff30, (char)0x0050, // Fullwidth Latin Capital Letter P -> P + (char)0xff31, (char)0x0051, // Fullwidth Latin Capital Letter Q -> Q + (char)0xff32, (char)0x0052, // Fullwidth Latin Capital Letter R -> R + (char)0xff33, (char)0x0053, // Fullwidth Latin Capital Letter S -> S + (char)0xff34, (char)0x0054, // Fullwidth Latin Capital Letter T -> T + (char)0xff35, (char)0x0055, // Fullwidth Latin Capital Letter U -> U + (char)0xff36, (char)0x0056, // Fullwidth Latin Capital Letter V -> V + (char)0xff37, (char)0x0057, // Fullwidth Latin Capital Letter W -> W + (char)0xff38, (char)0x0058, // Fullwidth Latin Capital Letter X -> X + (char)0xff39, (char)0x0059, // Fullwidth Latin Capital Letter Y -> Y + (char)0xff3a, (char)0x005a, // Fullwidth Latin Capital Letter Z -> Z + (char)0xff3b, (char)0x005b, // Fullwidth Left Square Bracket -> [ + (char)0xff3c, (char)0x005c, // Fullwidth Reverse Solidus -> \ + (char)0xff3d, (char)0x005d, // Fullwidth Right Square Bracket -> ] + (char)0xff3e, (char)0x005e, // Fullwidth Circumflex Accent -> ^ + (char)0xff3f, (char)0x005f, 
// Fullwidth Low Line -> _ + (char)0xff40, (char)0x0060, // Fullwidth Grave Accent -> ` + (char)0xff41, (char)0x0061, // Fullwidth Latin Small Letter A -> a + (char)0xff42, (char)0x0062, // Fullwidth Latin Small Letter B -> b + (char)0xff43, (char)0x0063, // Fullwidth Latin Small Letter C -> c + (char)0xff44, (char)0x0064, // Fullwidth Latin Small Letter D -> d + (char)0xff45, (char)0x0065, // Fullwidth Latin Small Letter E -> e + (char)0xff46, (char)0x0066, // Fullwidth Latin Small Letter F -> f + (char)0xff47, (char)0x0067, // Fullwidth Latin Small Letter G -> g + (char)0xff48, (char)0x0068, // Fullwidth Latin Small Letter H -> h + (char)0xff49, (char)0x0069, // Fullwidth Latin Small Letter I -> i + (char)0xff4a, (char)0x006a, // Fullwidth Latin Small Letter J -> j + (char)0xff4b, (char)0x006b, // Fullwidth Latin Small Letter K -> k + (char)0xff4c, (char)0x006c, // Fullwidth Latin Small Letter L -> l + (char)0xff4d, (char)0x006d, // Fullwidth Latin Small Letter M -> m + (char)0xff4e, (char)0x006e, // Fullwidth Latin Small Letter N -> n + (char)0xff4f, (char)0x006f, // Fullwidth Latin Small Letter O -> o + (char)0xff50, (char)0x0070, // Fullwidth Latin Small Letter P -> p + (char)0xff51, (char)0x0071, // Fullwidth Latin Small Letter Q -> q + (char)0xff52, (char)0x0072, // Fullwidth Latin Small Letter R -> r + (char)0xff53, (char)0x0073, // Fullwidth Latin Small Letter S -> s + (char)0xff54, (char)0x0074, // Fullwidth Latin Small Letter T -> t + (char)0xff55, (char)0x0075, // Fullwidth Latin Small Letter U -> u + (char)0xff56, (char)0x0076, // Fullwidth Latin Small Letter V -> v + (char)0xff57, (char)0x0077, // Fullwidth Latin Small Letter W -> w + (char)0xff58, (char)0x0078, // Fullwidth Latin Small Letter X -> x + (char)0xff59, (char)0x0079, // Fullwidth Latin Small Letter Y -> y + (char)0xff5a, (char)0x007a, // Fullwidth Latin Small Letter Z -> z + (char)0xff5b, (char)0x007b, // Fullwidth Left Curly Bracket -> { + (char)0xff5c, (char)0x007c, // Fullwidth 
Vertical Line -> | + (char)0xff5d, (char)0x007d, // Fullwidth Right Curly Bracket -> } + (char)0xff5e, (char)0x007e // Fullwidth Tilde -> ~ + }; + } +} diff --git a/src/mscorlib/shared/System/Text/UTF7Encoding.cs b/src/mscorlib/shared/System/Text/UTF7Encoding.cs new file mode 100644 index 0000000..0ac3b66 --- /dev/null +++ b/src/mscorlib/shared/System/Text/UTF7Encoding.cs @@ -0,0 +1,1041 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +// +// Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused. +// + +using System; +using System.Runtime.Serialization; +using System.Diagnostics; +using System.Diagnostics.Contracts; + +namespace System.Text +{ + public class UTF7Encoding : Encoding + { + private const String base64Chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + // 0123456789111111111122222222223333333333444444444455555555556666 + // 012345678901234567890123456789012345678901234567890123 + + // These are the characters that can be directly encoded in UTF7. + private const String directChars = + "\t\n\r '(),-./0123456789:?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + // These are the characters that can be optionally directly encoded in UTF7. + private const String optionalChars = + "!\"#$%&*;<=>@[]^_`{|}"; + + // Used by Encoding.UTF7 for lazy initialization + // The initialization code will not be run until a static member of the class is referenced + internal static readonly UTF7Encoding s_default = new UTF7Encoding(); + + // The set of base 64 characters. + private byte[] base64Bytes; + // The decoded bits for every base64 values. This array has a size of 128 elements. + // The index is the code point value of the base 64 characters. The value is -1 if + // the code point is not a valid base 64 character. 
// Otherwise, the value is the 6-bit base 64 digit, 0 ~ 63.
        private sbyte[] base64Values;

        // Lookup table with 128 entries: true when a code point below 0x80 may be written
        // to the UTF7 stream as-is instead of going through the base 64 path.
        private bool[] directEncode;

        [OptionalField(VersionAdded = 2)]
        private bool m_allowOptionals;

        private const int UTF7_CODEPAGE = 65000;


        // Creates a UTF7 encoding that does not directly encode the optional characters.
        public UTF7Encoding() : this(false)
        {
        }

        // Creates a UTF7 encoding. allowOptionals selects whether the characters listed in
        // optionalChars are also marked as directly encodable.
        public UTF7Encoding(bool allowOptionals) : base(UTF7_CODEPAGE)
        {
            // The tables depend on this flag, so record it before building them.
            m_allowOptionals = allowOptionals;
            MakeTables();
        }

        // (Re)builds base64Bytes, base64Values and directEncode from the character constants
        // and the current value of m_allowOptionals.
        private void MakeTables()
        {
            base64Bytes = new byte[64];
            base64Values = new sbyte[128];

            // Everything is "not a base 64 character" until proven otherwise.
            for (int codePoint = 0; codePoint < base64Values.Length; codePoint++)
            {
                base64Values[codePoint] = -1;
            }

            // Fill the forward (digit -> byte) and reverse (byte -> digit) tables together.
            for (int digit = 0; digit < base64Bytes.Length; digit++)
            {
                byte encoded = (byte)base64Chars[digit];
                base64Bytes[digit] = encoded;
                base64Values[encoded] = (sbyte)digit;
            }

            directEncode = new bool[128];
            foreach (char direct in directChars)
            {
                directEncode[direct] = true;
            }

            if (m_allowOptionals)
            {
                foreach (char optional in optionalChars)
                {
                    directEncode[optional] = true;
                }
            }
        }

        // Encoding expects these to be set, although nothing can actually fall back in UTF7:
        // surrogates are encoded individually and never checked for a match, so the encoder
        // fallback is irrelevant, and the decoder uses its own UTF7-specific fallback.
        internal override void SetDefaultFallbacks()
        {
            encoderFallback = new EncoderReplacementFallback(string.Empty);
            decoderFallback = new DecoderUTF7Fallback();
        }


        [OnDeserializing]
        private void OnDeserializing(StreamingContext ctx)
        {
            // Let the base class initialize the optional fields.
            base.OnDeserializing();
        }

        [OnDeserialized]
        private void OnDeserialized(StreamingContext ctx)
        {
            base.OnDeserialized();

            if (m_deserializedFromEverett)
            {
                // Optionals are allowed exactly when the first optional char is directly encodable.
                m_allowOptionals = directEncode[optionalChars[0]];
            }

            MakeTables();
        }



        // Two UTF7 encodings are equal when they agree on allowOptionals and on both fallbacks.
        public override bool Equals(Object value)
        {
            UTF7Encoding other = value as UTF7Encoding;
            if (other == null)
            {
                return false;
            }

            return m_allowOptionals == other.m_allowOptionals
                && EncoderFallback.Equals(other.EncoderFallback)
                && DecoderFallback.Equals(other.DecoderFallback);
        }

        // Compared to all the other encodings, variations of UTF7 are unlikely,
        // so the hash is built from the code page and the two fallbacks only.

        public override int GetHashCode()
        {
            int hash = CodePage;
            hash += EncoderFallback.GetHashCode();
            hash += DecoderFallback.GetHashCode();
            return hash;
        }

        // The following methods are copied from EncodingNLS.cs.
        // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
        // These should be kept in sync for the following classes:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Returns the number of bytes required to encode a range of characters in
        // a character array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetByteCount(char[] chars, int index, int count)
        {
            // Argument validation (index is reported before count when both are negative).
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (chars.Length - index < count)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }
            Contract.EndContractBlock();

            // Nothing to count; also sidesteps pinning an empty array.
            if (count == 0)
            {
                return 0;
            }

            // Defer to the pointer-based worker, with no encoder.
            fixed (char* charPtr = chars)
            {
                return GetByteCount(charPtr + index, count, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetByteCount(string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }
            Contract.EndContractBlock();

            // Defer to the pointer-based worker over the whole string, with no encoder.
            fixed (char* charPtr = s)
            {
                return GetByteCount(charPtr, s.Length, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetByteCount(char* chars, int count)
        {
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // No encoder state for the public entry point.
            return GetByteCount(chars, count, null);
        }

        // Parent method is safe.
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Encodes charCount characters of s, starting at charIndex, into bytes starting at
        // byteIndex, and returns whatever the pointer-based GetBytes worker returns.
        public override unsafe int GetBytes(string s, int charIndex, int charCount,
                                              byte[] bytes, int byteIndex)
        {
            // Null checks: s is reported before bytes.
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            // Range checks: charIndex is reported before charCount.
            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (s.Length - charIndex < charCount)
            {
                throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // Room left in the caller's array, measured before any substitution below.
            int byteCount = bytes.Length - byteIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in
            // (byteCount is 0 in that case, so nothing can be written to it).
            byte[] pinned = bytes.Length == 0 ? new byte[1] : bytes;

            fixed (char* charPtr = s)
            fixed (byte* bytePtr = &pinned[0])
            {
                return GetBytes(charPtr + charIndex, charCount, bytePtr + byteIndex, byteCount, null);
            }
        }

        // Encodes a range of characters in a character array into a range of bytes
        // in a byte array. An exception occurs if the byte array is not large
        // enough to hold the complete encoding of the characters. The
        // GetByteCount method can be used to determine the exact number of
        // bytes that will be produced for a given range of characters.
        // Alternatively, the GetMaxByteCount method can be used to
        // determine the maximum number of bytes that will be produced for a given
        // number of characters, regardless of the actual character values.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                               byte[] bytes, int byteIndex)
        {
            // Null checks: chars is reported before bytes.
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            // Range checks: charIndex is reported before charCount.
            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (chars.Length - charIndex < charCount)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // Nothing to encode; also avoids pinning an empty char array.
            if (charCount == 0)
            {
                return 0;
            }

            // byteCount is the space available after byteIndex, not the size of the array.
            int byteCount = bytes.Length - byteIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in.
            byte[] pinned = bytes.Length == 0 ? new byte[1] : bytes;

            fixed (char* charPtr = chars)
            fixed (byte* bytePtr = &pinned[0])
            {
                return GetBytes(charPtr + charIndex, charCount, bytePtr + byteIndex, byteCount, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
        {
            // Null checks: bytes is reported before chars.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            // Range checks: charCount is reported before byteCount.
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // No encoder state for the public entry point.
            return GetBytes(chars, charCount, bytes, byteCount, null);
        }

        // Returns the number of characters produced by decoding a range of bytes
        // in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetCharCount(byte[] bytes, int index, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            // Range checks: index is reported before count.
            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (bytes.Length - index < count)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }
            Contract.EndContractBlock();

            // No input means no output; also avoids pinning a zero-length array.
            if (count == 0)
            {
                return 0;
            }

            // Defer to the pointer-based worker, with no decoder.
            fixed (byte* bytePtr = bytes)
            {
                return GetCharCount(bytePtr + index, count, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetCharCount(byte* bytes, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // No decoder state for the public entry point.
            return GetCharCount(bytes, count, null);
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                              char[] chars, int charIndex)
        {
            // Null checks: bytes is reported before chars.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            // Range checks: byteIndex is reported before byteCount.
            if (byteIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (bytes.Length - byteIndex < byteCount)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // No input means no output; also avoids pinning an empty byte array.
            if (byteCount == 0)
            {
                return 0;
            }

            // charCount is the space available after charIndex, not the size of the array.
            int charCount = chars.Length - charIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in.
            char[] pinned = chars.Length == 0 ? new char[1] : chars;

            fixed (byte* bytePtr = bytes)
            fixed (char* charPtr = &pinned[0])
            {
                return GetChars(bytePtr + byteIndex, byteCount, charPtr + charIndex, charCount, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
        {
            // Null checks: bytes is reported before chars.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            // Range checks: charCount is reported before byteCount.
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // No decoder state for the public entry point.
            return GetChars(bytes, byteCount, chars, charCount, null);
        }

        // Returns a string containing the decoded representation of a range of
        // bytes in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe String GetString(byte[] bytes, int index, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (bytes.Length - index < count) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // Avoid problems with empty input buffer + if (count == 0) return String.Empty; + + fixed (byte* pBytes = bytes) + return String.CreateStringFromEncoding( + pBytes + index, count, this); + } + + // + // End of standard methods copied from EncodingNLS.cs + // + + internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) + { + Debug.Assert(chars != null, "[UTF7Encoding.GetByteCount]chars!=null"); + Debug.Assert(count >= 0, "[UTF7Encoding.GetByteCount]count >=0"); + + // Just call GetBytes with bytes == null + return GetBytes(chars, count, null, 0, baseEncoder); + } + + internal override unsafe int GetBytes(char* chars, int charCount, + byte* bytes, int byteCount, EncoderNLS baseEncoder) + { + Debug.Assert(byteCount >= 0, "[UTF7Encoding.GetBytes]byteCount >=0"); + Debug.Assert(chars != null, "[UTF7Encoding.GetBytes]chars!=null"); + Debug.Assert(charCount >= 0, "[UTF7Encoding.GetBytes]charCount >=0"); + + // Get encoder info + UTF7Encoding.Encoder encoder = (UTF7Encoding.Encoder)baseEncoder; + + // Default bits & count + int bits = 0; + int bitCount = -1; + + // prepare our helpers + Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( + this, encoder, bytes, byteCount, chars, charCount); + + if (encoder != null) + { + bits = encoder.bits; + bitCount = encoder.bitCount; + + // May have 
had too many left over + while (bitCount >= 6) + { + bitCount -= 6; + // If we fail we'll never really have enough room + if (!buffer.AddByte(base64Bytes[(bits >> bitCount) & 0x3F])) + ThrowBytesOverflow(encoder, buffer.Count == 0); + } + } + + while (buffer.MoreData) + { + char currentChar = buffer.GetNextChar(); + + if (currentChar < 0x80 && directEncode[currentChar]) + { + if (bitCount >= 0) + { + if (bitCount > 0) + { + // Try to add the next byte + if (!buffer.AddByte(base64Bytes[bits << 6 - bitCount & 0x3F])) + break; // Stop here, didn't throw + + bitCount = 0; + } + + // Need to get emit '-' and our char, 2 bytes total + if (!buffer.AddByte((byte)'-')) + break; // Stop here, didn't throw + + bitCount = -1; + } + + // Need to emit our char + if (!buffer.AddByte((byte)currentChar)) + break; // Stop here, didn't throw + } + else if (bitCount < 0 && currentChar == '+') + { + if (!buffer.AddByte((byte)'+', (byte)'-')) + break; // Stop here, didn't throw + } + else + { + if (bitCount < 0) + { + // Need to emit a + and 12 bits (3 bytes) + // Only 12 of the 16 bits will be emitted this time, the other 4 wait 'til next time + if (!buffer.AddByte((byte)'+')) + break; // Stop here, didn't throw + + // We're now in bit mode, but haven't stored data yet + bitCount = 0; + } + + // Add our bits + bits = bits << 16 | currentChar; + bitCount += 16; + + while (bitCount >= 6) + { + bitCount -= 6; + if (!buffer.AddByte(base64Bytes[(bits >> bitCount) & 0x3F])) + { + bitCount += 6; // We didn't use these bits + currentChar = buffer.GetNextChar(); // We're processing this char still, but AddByte + // --'d it when we ran out of space + break; // Stop here, not enough room for bytes + } + } + + if (bitCount >= 6) + break; // Didn't have room to encode enough bits + } + } + + // Now if we have bits left over we have to encode them. 
+ // MustFlush may have been cleared by encoding.ThrowBytesOverflow earlier if converting + if (bitCount >= 0 && (encoder == null || encoder.MustFlush)) + { + // Do we have bits we have to stick in? + if (bitCount > 0) + { + if (buffer.AddByte(base64Bytes[(bits << (6 - bitCount)) & 0x3F])) + { + // Emitted spare bits, 0 bits left + bitCount = 0; + } + } + + // If converting and failed bitCount above, then we'll fail this too + if (buffer.AddByte((byte)'-')) + { + // turned off bit mode'; + bits = 0; + bitCount = -1; + } + else + // If not successful, convert will maintain state for next time, also + // AddByte will have decremented our char count, however we need it to remain the same + buffer.GetNextChar(); + } + + // Do we have an encoder we're allowed to use? + // bytes == null if counting, so don't use encoder then + if (bytes != null && encoder != null) + { + // We already cleared bits & bitcount for mustflush case + encoder.bits = bits; + encoder.bitCount = bitCount; + encoder.m_charsUsed = buffer.CharsUsed; + } + + return buffer.Count; + } + + internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) + { + Debug.Assert(count >= 0, "[UTF7Encoding.GetCharCount]count >=0"); + Debug.Assert(bytes != null, "[UTF7Encoding.GetCharCount]bytes!=null"); + + // Just call GetChars with null char* to do counting + return GetChars(bytes, count, null, 0, baseDecoder); + } + + internal override unsafe int GetChars(byte* bytes, int byteCount, + char* chars, int charCount, DecoderNLS baseDecoder) + { + Debug.Assert(byteCount >= 0, "[UTF7Encoding.GetChars]byteCount >=0"); + Debug.Assert(bytes != null, "[UTF7Encoding.GetChars]bytes!=null"); + Debug.Assert(charCount >= 0, "[UTF7Encoding.GetChars]charCount >=0"); + + // Might use a decoder + UTF7Encoding.Decoder decoder = (UTF7Encoding.Decoder)baseDecoder; + + // Get our output buffer info. 
+ Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( + this, decoder, chars, charCount, bytes, byteCount); + + // Get decoder info + int bits = 0; + int bitCount = -1; + bool firstByte = false; + if (decoder != null) + { + bits = decoder.bits; + bitCount = decoder.bitCount; + firstByte = decoder.firstByte; + + Debug.Assert(firstByte == false || decoder.bitCount <= 0, + "[UTF7Encoding.GetChars]If remembered bits, then first byte flag shouldn't be set"); + } + + // We may have had bits in the decoder that we couldn't output last time, so do so now + if (bitCount >= 16) + { + // Check our decoder buffer + if (!buffer.AddChar((char)((bits >> (bitCount - 16)) & 0xFFFF))) + ThrowCharsOverflow(decoder, true); // Always throw, they need at least 1 char even in Convert + + // Used this one, clean up extra bits + bitCount -= 16; + } + + // Loop through the input + while (buffer.MoreData) + { + byte currentByte = buffer.GetNextByte(); + int c; + + if (bitCount >= 0) + { + // + // Modified base 64 encoding. + // + sbyte v; + if (currentByte < 0x80 && ((v = base64Values[currentByte]) >= 0)) + { + firstByte = false; + bits = (bits << 6) | ((byte)v); + bitCount += 6; + if (bitCount >= 16) + { + c = (bits >> (bitCount - 16)) & 0xFFFF; + bitCount -= 16; + } + // If not enough bits just continue + else continue; + } + else + { + // If it wasn't a base 64 byte, everything's going to turn off base 64 mode + bitCount = -1; + + if (currentByte != '-') + { + // >= 0x80 (because of 1st if statemtn) + // We need this check since the base64Values[b] check below need b <= 0x7f. + // This is not a valid base 64 byte. Terminate the shifted-sequence and + // emit this byte. 
+ + // not in base 64 table + // According to the RFC 1642 and the example code of UTF-7 + // in Unicode 2.0, we should just zero-extend the invalid UTF7 byte + + // Chars won't be updated unless this works, try to fallback + if (!buffer.Fallback(currentByte)) + break; // Stop here, didn't throw + + // Used that byte, we're done with it + continue; + } + + // + // The encoding for '+' is "+-". + // + if (firstByte) c = '+'; + // We just turn it off if not emitting a +, so we're done. + else continue; + } + // + // End of modified base 64 encoding block. + // + } + else if (currentByte == '+') + { + // + // Found the start of a modified base 64 encoding block or a plus sign. + // + bitCount = 0; + firstByte = true; + continue; + } + else + { + // Normal character + if (currentByte >= 0x80) + { + // Try to fallback + if (!buffer.Fallback(currentByte)) + break; // Stop here, didn't throw + + // Done falling back + continue; + } + + // Use the normal character + c = currentByte; + } + + if (c >= 0) + { + // Check our buffer + if (!buffer.AddChar((char)c)) + { + // No room. If it was a plain char we'll try again later. + // Note, we'll consume this byte and stick it in decoder, even if we can't output it + if (bitCount >= 0) // Can we rememmber this byte (char) + { + buffer.AdjustBytes(+1); // Need to readd the byte that AddChar subtracted when it failed + bitCount += 16; // We'll still need that char we have in our bits + } + break; // didn't throw, stop + } + } + } + + // Stick stuff in the decoder if we can (chars == null if counting, so don't store decoder) + if (chars != null && decoder != null) + { + // MustFlush? 
// (Could've been cleared by ThrowCharsOverflow if Convert & didn't reach end of buffer)
                if (decoder.MustFlush)
                {
                    // RFC doesn't specify what would happen if we have non-0 leftover bits, we just drop them
                    decoder.bits = 0;
                    decoder.bitCount = -1;
                    decoder.firstByte = false;
                }
                else
                {
                    decoder.bits = bits;
                    decoder.bitCount = bitCount;
                    decoder.firstByte = firstByte;
                }
                decoder.m_bytesUsed = buffer.BytesUsed;
            }
            // else ignore any hanging bits.

            // Return our count
            return buffer.Count;
        }


        /// <summary>Creates a new UTF-7 <see cref="UTF7Encoding.Decoder"/> bound to this encoding.</summary>
        public override System.Text.Decoder GetDecoder() => new UTF7Encoding.Decoder(this);


        /// <summary>Creates a new UTF-7 <see cref="UTF7Encoding.Encoder"/> bound to this encoding.</summary>
        public override System.Text.Encoder GetEncoder() => new UTF7Encoding.Encoder(this);


        /// <summary>
        /// Returns an upper bound on the number of bytes needed to encode
        /// <paramref name="charCount"/> characters, computed as <c>charCount * 3 + 2</c>.
        /// </summary>
        /// <param name="charCount">Number of characters to encode; must be non-negative.</param>
        /// <returns>The maximum byte count.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="charCount"/> is negative, or the computed bound exceeds 0x7fffffff.
        /// </exception>
        public override int GetMaxByteCount(int charCount)
        {
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // Suppose that every char can not be direct-encoded, we know that
            // a byte can encode 6 bits of the Unicode character.  And we will
            // also need two extra bytes for the shift-in ('+') and shift-out ('-') mark.
            // Therefore, the max byte should be:
            // byteCount = 2 + Math.Ceiling((double)charCount * 16 / 6);
            // That is always <= 2 + 3 * charCount;
            // Longest case is alternating encoded, direct, encoded data for 5 + 1 + 5... bytes per char.
            // UTF7 doesn't have left over surrogates, but if no input we may need an output - to turn off
            // encoding if MustFlush is true.

            // It's easiest to think of this as 2 bytes to turn on/off the base64 mode, then 3 bytes per char.
            // 3 bytes is 18 bits of encoding, which is more than we need, but if it's direct encoded then 3
            // bytes allows us to turn off and then back on base64 mode if necessary.

            // Note that UTF7 encodes surrogates individually and isn't worried about mismatches, so all
            // code points are encodable in UTF7.

            // Widen to long first so the multiplication itself cannot overflow.
            long byteCount = (long)charCount * 3 + 2;

            // check for overflow
            if (byteCount > 0x7fffffff)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
            }

            return (int)byteCount;
        }


        /// <summary>
        /// Returns an upper bound on the number of characters produced by decoding
        /// <paramref name="byteCount"/> bytes: one char per byte, with a minimum of 1.
        /// </summary>
        /// <param name="byteCount">Number of bytes to decode; must be non-negative.</param>
        /// <returns><paramref name="byteCount"/>, or 1 when <paramref name="byteCount"/> is 0.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="byteCount"/> is negative.</exception>
        public override int GetMaxCharCount(int byteCount)
        {
            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // Worst case is 1 char per byte.  Minimum 1 for left over bits in case decoder is being flushed
            // Also note that we ignore extra bits (per spec), so UTF7 doesn't have unknown in this direction.
            return byteCount == 0 ? 1 : byteCount;
        }

        // Of all the amazing things... This MUST be Decoder so that our com name
        // for System.Text.Decoder doesn't change
        private sealed class Decoder : DecoderNLS, ISerializable
        {
            // Decoder state carried between calls; read and written by UTF7Encoding.GetChars.
            /*private*/
            internal int bits;
            /*private*/
            internal int bitCount;
            /*private*/
            internal bool firstByte;

            public Decoder(UTF7Encoding encoding) : base(encoding)
            {
                // base calls reset
            }

            // ISerializable implementation, get data for this object.
            // Serialization is not supported; always throws.
            void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // Returns the decoder to its initial state: no pending bits, bitCount of -1,
            // firstByte cleared, and any existing fallback buffer reset.
            public override void Reset()
            {
                this.bits = 0;
                this.bitCount = -1;
                this.firstByte = false;
                m_fallbackBuffer?.Reset();
            }

            // Anything left in our decoder?
            // NOTE: This forces the last -, which some encoder might not encode.  If we
            // don't see it we don't think we're done reading.
            internal override bool HasState => this.bitCount != -1;
        }

        // Of all the amazing things... This MUST be Encoder so that our com name
        // for System.Text.Encoder doesn't change
        private sealed class Encoder : EncoderNLS, ISerializable
        {
            // Encoder state carried between calls.
            /*private*/
            internal int bits;
            /*private*/
            internal int bitCount;

            public Encoder(UTF7Encoding encoding) : base(encoding)
            {
                // base calls reset
            }

            // ISerializable implementation.
            // Serialization is not supported; always throws.
            void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
            {
                throw new PlatformNotSupportedException();
            }

            // Returns the encoder to its initial state: bitCount of -1, no pending bits,
            // and any existing fallback buffer reset.
            public override void Reset()
            {
                this.bitCount = -1;
                this.bits = 0;
                m_fallbackBuffer?.Reset();
            }

            // Anything left in our encoder?
            internal override bool HasState => this.bits != 0 || this.bitCount != -1;
        }

        // Preexisting UTF7 behavior for bad bytes was just to spit out the byte as the next char
        // and turn off base64 mode if it was in that mode.  We still exit the mode, but now we fallback.
        private sealed class DecoderUTF7Fallback : DecoderFallback
        {
            // Construction.  This fallback carries no state of its own; the buffer it creates
            // hands each bad byte back as the char with the same value.
            public DecoderUTF7Fallback()
            {
            }

            public override DecoderFallbackBuffer CreateFallbackBuffer()
            {
                return new DecoderUTF7FallbackBuffer(this);
            }

            // Maximum number of characters that this instance of this fallback could return:
            // 1 char per bad byte.
            public override int MaxCharCount => 1;

            // All instances are interchangeable, so any other DecoderUTF7Fallback is equal.
            // (A null or differently-typed value yields false.)
            public override bool Equals(Object value)
            {
                return value is DecoderUTF7Fallback;
            }

            // Constant hash code, consistent with Equals treating all instances as equal.
            public override int GetHashCode() => 984;
        }

        private sealed class DecoderUTF7FallbackBuffer : DecoderFallbackBuffer
        {
            // The single fallback char (the bad byte zero-extended to a char)
            private char cFallback = (char)0;

            // Chars still to be returned; negative while no fallback is in progress
            private int iCount = -1;

            // Total chars in the current fallback (set to 1 by Fallback)
            private int iSize;

            // Construction.  The fallback argument is not used.
            public DecoderUTF7FallbackBuffer(DecoderUTF7Fallback fallback)
            {
            }

            // Fallback Methods

            // Starts a fallback for a single unknown byte.  Returns false (nothing to emit)
            // when that byte is 0, true otherwise.  The index argument is not used.
            public override bool Fallback(byte[] bytesUnknown, int index)
            {
                // We expect no previous fallback in our buffer
                Debug.Assert(iCount < 0, "[DecoderUTF7FallbackBuffer.Fallback] Can't have recursive fallbacks");
                Debug.Assert(bytesUnknown.Length == 1, "[DecoderUTF7FallbackBuffer.Fallback] Only possible fallback case should be 1 unknown byte");

                // Go ahead and get our fallback
                cFallback = (char)bytesUnknown[0];

                // Any of the fallback characters can be handled except for 0
                if (cFallback == 0)
                {
                    return false;
                }

                iCount = iSize = 1;

                return true;
            }

            // Returns the pending fallback char, or (char)0 once it has been consumed.
            // The counter is decremented on every call, whether or not a char was returned.
            public override char GetNextChar()
            {
                if (iCount-- > 0)
                {
                    return cFallback;
                }

                // Note: this means that 0 in UTF7 stream will never be emitted.
                return (char)0;
            }

            // Steps back one char within the current fallback, if there is one.
            public override bool MovePrevious()
            {
                if (iCount >= 0)
                {
                    iCount++;
                }

                // return true if we were allowed to do this
                return (iCount >= 0 && iCount <= iSize);
            }

            // Return # of chars left in this fallback
            public override int Remaining => (iCount > 0) ? iCount : 0;

            // Clear the buffer
            public override unsafe void Reset()
            {
                iCount = -1;
                byteStart = null;
            }

            // This version just counts the fallback and doesn't actually copy anything.
            // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
            // array, and we might need the index, hence the byte*
            // Throws ArgumentException unless exactly one byte is supplied.
            internal unsafe override int InternalFallback(byte[] bytes, byte* pBytes)
            {
                // We expect no previous fallback in our buffer
                Debug.Assert(iCount < 0, "[DecoderUTF7FallbackBuffer.InternalFallback] Can't have recursive fallbacks");
                if (bytes.Length != 1)
                {
                    throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
                }

                // Can't fallback a byte 0, so return for that case, 1 otherwise.
                return bytes[0] == 0 ? 0 : 1;
            }
        }
    }
}
// End of patch (git format-patch signature trailer): -- 2.7.4