1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 ////////////////////////////////////////////////////////////////////////////
8 // Purpose: This Class defines behaviors specific to a writing system.
9 // A writing system is the collection of scripts and
10 // orthographic rules required to represent a language as text.
13 ////////////////////////////////////////////////////////////////////////////
15 using System.Diagnostics;
16 using System.Runtime.Serialization;
19 namespace System.Globalization
21 public partial class TextInfo : ICloneable, IDeserializationCallback
23 private enum Tristate : byte
30 private string _listSeparator;
31 private bool _isReadOnly = false;
33 /* _cultureName is the name of the creating culture.
34 _cultureData is the data that backs this class.
35 _textInfoName is the actual name of the textInfo (from cultureData.STEXTINFO)
36 In the desktop, when we call the sorting dll, it doesn't
37 know how to resolve custom locale names to sort ids so we have to have already resolved this.
40 private readonly string _cultureName; // Name of the culture that created this text info
41 private readonly CultureData _cultureData; // Data record for the culture that made us, not for this textinfo
42 private readonly string _textInfoName; // Name of the text info we're using (ie: _cultureData.STEXTINFO)
44 private Tristate _isAsciiCasingSameAsInvariant = Tristate.NotInitialized;
46 // _invariantMode is cached for perf reasons, as accessing the instance field is faster than accessing the static property GlobalizationMode.Invariant
47 private readonly bool _invariantMode = GlobalizationMode.Invariant;
49 // Invariant text info
// Shared TextInfo for the invariant culture. When s_Invariant is found to be
// null, a new instance is built from CultureData.Invariant and stored in
// that static field.
50 internal static TextInfo Invariant
54 if (s_Invariant == null)
55 s_Invariant = new TextInfo(CultureData.Invariant);
59 internal volatile static TextInfo s_Invariant;
61 //////////////////////////////////////////////////////////////////////////
63 //// TextInfo Constructors
65 //// Implements CultureInfo.TextInfo.
67 //////////////////////////////////////////////////////////////////////////
// Builds a TextInfo backed by the given culture data record. The culture
// name and the text-info name are both read from that record before
// FinishInitialization() runs.
68 internal TextInfo(CultureData cultureData)
70 // This is our primary data source, we don't need most of the rest of this
71 _cultureData = cultureData;
72 _cultureName = _cultureData.CultureName;
73 _textInfoName = _cultureData.STEXTINFO;
// NOTE(review): FinishInitialization is not defined in this part of the file;
// presumably it lives in another part of this partial class - confirm.
75 FinishInitialization();
// Explicit IDeserializationCallback implementation; it throws
// PlatformNotSupportedException instead of performing any post-deserialization work.
78 void IDeserializationCallback.OnDeserialization(Object sender)
80 throw new PlatformNotSupportedException();
84 // Internal ordinal comparison functions
// Returns a case-insensitive hash code for s by delegating to
// GetCaseInsensitiveHashCode on the invariant TextInfo.
87 internal static int GetHashCodeOrdinalIgnoreCase(string s)
89 // This is the same as a case-insensitive hash for Invariant
90 // (not necessarily true for sorting, but OK for casing & then we apply normal hash code rules)
91 return Invariant.GetCaseInsensitiveHashCode(s);
// The four code-page properties below are virtual and each simply forwards
// to the matching IDEFAULT*CODEPAGE value on the backing culture data.

// ANSI code page (from _cultureData.IDEFAULTANSICODEPAGE).
94 public virtual int ANSICodePage => _cultureData.IDEFAULTANSICODEPAGE;
// OEM code page (from _cultureData.IDEFAULTOEMCODEPAGE).
96 public virtual int OEMCodePage => _cultureData.IDEFAULTOEMCODEPAGE;
// Macintosh code page (from _cultureData.IDEFAULTMACCODEPAGE).
98 public virtual int MacCodePage => _cultureData.IDEFAULTMACCODEPAGE;
// EBCDIC code page (from _cultureData.IDEFAULTEBCDICCODEPAGE).
100 public virtual int EBCDICCodePage => _cultureData.IDEFAULTEBCDICCODEPAGE;
102 // Just use the LCID from our text info name
// (resolved on every access through CultureInfo.GetCultureInfo(_textInfoName)).
103 public int LCID => CultureInfo.GetCultureInfo(_textInfoName).LCID;
// Note: this is the text-info name (_textInfoName), not _cultureName, the name
// of the culture that created this instance.
105 public string CultureName => _textInfoName;
// Reports the flag that SetReadOnlyState writes.
107 public bool IsReadOnly => _isReadOnly;
109 //////////////////////////////////////////////////////////////////////////
113 //// Is the implementation of ICloneable.
115 //////////////////////////////////////////////////////////////////////////
// Creates a shallow copy with MemberwiseClone() and marks the copy as
// writable, whatever the read-only state of this instance.
116 public virtual object Clone()
118 object o = MemberwiseClone();
119 ((TextInfo)o).SetReadOnlyState(false);
123 ////////////////////////////////////////////////////////////////////////
127 // Create a cloned readonly instance or return the input one if it is
130 ////////////////////////////////////////////////////////////////////////
// Returns a read-only view of textInfo.
//   - Throws ArgumentNullException when textInfo is null.
//   - Returns textInfo itself when it is already read-only.
//   - Otherwise returns a shallow copy (MemberwiseClone) flagged read-only;
//     the flag is set on the copy, so the argument itself stays writable.
131 public static TextInfo ReadOnly(TextInfo textInfo)
133 if (textInfo == null) { throw new ArgumentNullException(nameof(textInfo)); }
134 if (textInfo.IsReadOnly) { return textInfo; }
136 TextInfo clonedTextInfo = (TextInfo)(textInfo.MemberwiseClone());
137 clonedTextInfo.SetReadOnlyState(true);
139 return clonedTextInfo;
// Raises InvalidOperationException with the SR.InvalidOperation_ReadOnly
// message. Going by its name, this is the guard that rejects writes to a
// read-only instance.
142 private void VerifyWritable()
146 throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
// Sets the _isReadOnly flag reported by IsReadOnly. Clone() passes false and
// ReadOnly() passes true on the copies they create.
150 internal void SetReadOnlyState(bool readOnly)
152 _isReadOnly = readOnly;
156 ////////////////////////////////////////////////////////////////////////
160 // Returns the string used to separate items in a list.
162 ////////////////////////////////////////////////////////////////////////
// Gets or sets the list separator string.
// Getter: caches _cultureData.SLIST in _listSeparator the first time the
// field is found to be null, then returns the cached value.
// Setter: stores the supplied value in _listSeparator; its error path raises
// ArgumentNullException (SR.ArgumentNull_String), presumably for a null value.
163 public virtual string ListSeparator
167 if (_listSeparator == null)
169 _listSeparator = _cultureData.SLIST;
171 return _listSeparator;
178 throw new ArgumentNullException(nameof(value), SR.ArgumentNull_String);
181 _listSeparator = value;
185 ////////////////////////////////////////////////////////////////////////
189 // Converts the character or string to lower case. Certain locales
190 // have different casing semantics from the file systems in Win32.
192 ////////////////////////////////////////////////////////////////////////
// Lower-cases a single character.
193 public unsafe virtual char ToLower(char c)
// Fast path: in invariant mode, or for a character that passes IsAscii when
// this culture's ASCII casing matches the invariant culture's, use the
// ASCII-only helper.
195 if (_invariantMode || (IsAscii(c) && IsAsciiCasingSameAsInvariant))
197 return ToLowerAsciiInvariant(c);
// Otherwise defer to ChangeCase.
200 return ChangeCase(c, toUpper: false);
// Lower-cases a string. Throws ArgumentNullException when str is null. The
// result comes from either the ASCII-only helper or
// ChangeCase(str, toUpper: false).
203 public unsafe virtual string ToLower(string str)
205 if (str == null) { throw new ArgumentNullException(nameof(str)); }
209 return ToLowerAsciiInvariant(str);
212 return ChangeCase(str, toUpper: false);
// ASCII-only lower-casing of a string: only 'A'..'Z' are changed. When a
// new string is needed it has the same length as s.
215 private unsafe string ToLowerAsciiInvariant(string s)
222 fixed (char* pSource = s)
// One unsigned comparison that is true only for 'A'..'Z': values below 'A'
// wrap around to a large unsigned number.
227 if ((uint)(pSource[i] - 'A') <= (uint)('Z' - 'A'))
239 string result = string.FastAllocateString(s.Length);
240 fixed (char* pResult = result)
// Characters before index i are copied through unchanged.
242 for (int j = 0; j < i; j++)
244 pResult[j] = pSource[j];
// Setting bit 0x20 turns an ASCII uppercase letter into its lowercase form.
247 pResult[i] = (char)(pSource[i] | 0x20);
// Per-character helper; it leaves anything outside 'A'..'Z' untouched.
252 pResult[i] = ToLowerAsciiInvariant(pSource[i]);
// ASCII-only upper-casing of a string: only 'a'..'z' are changed. When a
// new string is needed it has the same length as s.
261 private unsafe string ToUpperAsciiInvariant(string s)
268 fixed (char* pSource = s)
// One unsigned comparison that is true only for 'a'..'z': values below 'a'
// wrap around to a large unsigned number.
273 if ((uint)(pSource[i] - 'a') <= (uint)('z' - 'a'))
285 string result = string.FastAllocateString(s.Length);
286 fixed (char* pResult = result)
// Characters before index i are copied through unchanged.
288 for (int j = 0; j < i; j++)
290 pResult[j] = pSource[j];
// Clearing bit 0x20 turns an ASCII lowercase letter into its uppercase form.
293 pResult[i] = (char)(pSource[i] & ~0x20);
// Per-character helper; it leaves anything outside 'a'..'z' untouched.
298 pResult[i] = ToUpperAsciiInvariant(pSource[i]);
// Lower-cases c only when it is an ASCII uppercase letter.
307 private static char ToLowerAsciiInvariant(char c)
// Unsigned range test: true only for 'A'..'Z'.
309 if ((uint)(c - 'A') <= (uint)('Z' - 'A'))
// 'A'..'Z' and 'a'..'z' differ only in bit 0x20; setting it gives lowercase.
311 c = (char)(c | 0x20);
316 ////////////////////////////////////////////////////////////////////////
320 // Converts the character or string to upper case. Certain locales
321 // have different casing semantics from the file systems in Win32.
323 ////////////////////////////////////////////////////////////////////////
// Upper-cases a single character.
324 public unsafe virtual char ToUpper(char c)
// Fast path: in invariant mode, or for a character that passes IsAscii when
// this culture's ASCII casing matches the invariant culture's, use the
// ASCII-only helper.
326 if (_invariantMode || (IsAscii(c) && IsAsciiCasingSameAsInvariant))
328 return ToUpperAsciiInvariant(c);
// Otherwise defer to ChangeCase.
331 return ChangeCase(c, toUpper: true);
// Upper-cases a string. Throws ArgumentNullException when str is null. The
// result comes from either the ASCII-only helper or
// ChangeCase(str, toUpper: true).
334 public unsafe virtual string ToUpper(string str)
336 if (str == null) { throw new ArgumentNullException(nameof(str)); }
340 return ToUpperAsciiInvariant(str);
343 return ChangeCase(str, toUpper: true);
// Upper-cases c only when it is an ASCII lowercase letter.
346 internal static char ToUpperAsciiInvariant(char c)
// Unsigned range test: true only for 'a'..'z'.
348 if ((uint)(c - 'a') <= (uint)('z' - 'a'))
// 'a'..'z' and 'A'..'Z' differ only in bit 0x20; clearing it gives uppercase.
350 c = (char)(c & ~0x20);
355 private static bool IsAscii(char c)
// Reports whether this text info treats the ASCII letters as case variants
// of each other. The answer is computed once, on first use, and cached in
// _isAsciiCasingSameAsInvariant.
360 private bool IsAsciiCasingSameAsInvariant
364 if (_isAsciiCasingSameAsInvariant == Tristate.NotInitialized)
// Compare the lowercase and uppercase ASCII alphabets, ignoring case, with
// the CompareInfo of the culture named by _textInfoName; a result of 0
// (equal) is recorded as Tristate.True, anything else as Tristate.False.
366 _isAsciiCasingSameAsInvariant = CultureInfo.GetCultureInfo(_textInfoName).CompareInfo.Compare("abcdefghijklmnopqrstuvwxyz",
367 "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
368 CompareOptions.IgnoreCase) == 0 ? Tristate.True : Tristate.False;
370 return _isAsciiCasingSameAsInvariant == Tristate.True;
376 // Returns true if the dominant direction of text and UI such as the relative position of buttons and scroll bars is right-to-left.
378 public bool IsRightToLeft => _cultureData.IsRightToLeft;
380 ////////////////////////////////////////////////////////////////////////
384 // Implements Object.Equals(). Returns a boolean indicating whether
385 // or not obj is a TextInfo with the same CultureName as the current instance.
387 ////////////////////////////////////////////////////////////////////////
// Equality is defined on the CultureName string: the comparison below is
// between this instance's CultureName and the other TextInfo's.
388 public override bool Equals(Object obj)
// 'as' yields null when obj is null or is not a TextInfo.
390 TextInfo that = obj as TextInfo;
394 return CultureName.Equals(that.CultureName);
400 ////////////////////////////////////////////////////////////////////////
404 // Implements Object.GetHashCode(). Returns the hash code for the
405 // TextInfo. The hash code is guaranteed to be the same for TextInfo A
406 // and B where A.Equals(B) is true.
408 ////////////////////////////////////////////////////////////////////////
// Hashes the same CultureName string that Equals compares, so instances with
// equal CultureName values produce equal hash codes.
409 public override int GetHashCode()
411 return CultureName.GetHashCode();
414 ////////////////////////////////////////////////////////////////////////
418 // Implements Object.ToString(). Returns a string describing the
421 ////////////////////////////////////////////////////////////////////////
// Returns "TextInfo - " followed by the culture name from the backing culture
// data (_cultureData.CultureName, not the CultureName property).
422 public override string ToString()
424 return "TextInfo - " + _cultureData.CultureName;
430 // Titlecasing refers to a casing practice wherein the first letter of a word is an uppercase letter
431 // and the rest of the letters are lowercase. The choice of which words to titlecase in headings
432 // and titles is dependent on language and local conventions. For example, "The Merry Wives of Windsor"
433 // is the appropriate titlecasing of that play's name in English, with the word "of" not titlecased.
434 // In German, however, the title is "Die lustigen Weiber von Windsor," and both "lustigen" and "von"
435 // are not titlecased. In French even fewer words are titlecased: "Les joyeuses commeres de Windsor."
437 // Moreover, the determination of what actually constitutes a word is language dependent, and this can
438 // influence which letter or letters of a "word" are uppercased when titlecasing strings. For example
439 // "l'arbre" is considered two words in French, whereas "can't" is considered one word in English.
// Converts str to title case and returns the result as a new string.
// Throws ArgumentNullException for str.
//
// The input is scanned one Unicode character (one or two UTF-16 code units,
// reported through charLen) at a time. A letter starts a word: its first
// character is title-cased by AddTitlecaseLetter and the remainder of the
// word is appended either lower-cased or unchanged. Non-letters are appended
// as-is by AddNonLetter.
441 public unsafe string ToTitleCase(string str)
445 throw new ArgumentNullException(nameof(str));
452 StringBuilder result = new StringBuilder();
// Lower-cased copy of the whole input, created lazily (at most once) and
// sliced per word.
453 string lowercaseData = null;
454 // Store if the current culture is Dutch (special case)
// Note: the "nl-" prefix test matches names such as "nl-NL" but not a bare "nl".
455 bool isDutchCulture = CultureName.StartsWith("nl-", StringComparison.OrdinalIgnoreCase);
457 for (int i = 0; i < str.Length; i++)
459 UnicodeCategory charType;
462 charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);
463 if (char.CheckLetter(charType))
465 // Special case to check for Dutch specific titlecasing with "IJ" characters
466 // at the beginning of a word
467 if (isDutchCulture && i < str.Length - 1 && (str[i] == 'i' || str[i] == 'I') && (str[i+1] == 'j' || str[i+1] == 'J'))
474 // Do the titlecasing for the first character of the word.
475 i = AddTitlecaseLetter(ref result, ref str, i, charLen) + 1;
479 // Convert the characters until the end of this word
// Start of the span that is later appended from either lowercaseData or str.
482 int lowercaseStart = i;
485 // Use hasLowerCase flag to prevent from lowercasing acronyms (like "URT", "USA", etc)
486 // This is in line with Word 2000 behavior of titlecasing.
488 bool hasLowerCase = (charType == UnicodeCategory.LowercaseLetter);
489 // Use a loop to find all of the other letters following this letter.
490 while (i < str.Length)
492 charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);
493 if (IsLetterCategory(charType))
495 if (charType == UnicodeCategory.LowercaseLetter)
// An apostrophe gets its own branch; the span collected so far is flushed
// here, from the lower-cased copy or from the original text.
501 else if (str[i] == '\'')
506 if (lowercaseData == null)
508 lowercaseData = ToLower(str);
510 result.Append(lowercaseData, lowercaseStart, i - lowercaseStart);
514 result.Append(str, lowercaseStart, i - lowercaseStart);
519 else if (!IsWordSeparator(charType))
521 // This category is considered to be part of the word.
522 // This is any category that is marked as false in c_wordSeparatorMask.
527 // A word separator. Break out of the loop.
// Length of the remaining span of the word that still has to be appended.
532 int count = i - lowercaseStart;
538 if (lowercaseData == null)
540 lowercaseData = ToLower(str);
542 result.Append(lowercaseData, lowercaseStart, count);
546 result.Append(str, lowercaseStart, count);
552 // not a letter, just append it
553 i = AddNonLetter(ref result, ref str, i, charLen);
558 // not a letter, just append it
559 i = AddNonLetter(ref result, ref str, i, charLen);
562 return result.ToString();
// Appends the character at inputIndex to result unchanged. charLen is the
// number of UTF-16 code units it occupies and is asserted to be 1 or 2.
// ToTitleCase assigns the return value back to its loop index.
565 private static int AddNonLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
567 Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddNonLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");
// Two code units: append the first, step inputIndex forward, append the second.
571 result.Append(input[inputIndex++]);
572 result.Append(input[inputIndex]);
// One code unit: append it as-is.
576 result.Append(input[inputIndex]);
// Appends the title-case form of the character at inputIndex to result.
// charLen is the number of UTF-16 code units it occupies and is asserted to
// be 1 or 2. ToTitleCase uses the return value (plus one) as its next index.
581 private int AddTitlecaseLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
583 Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddTitlecaseLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");
585 // for surrogate pairs do a simple ToUpper operation on the substring
589 result.Append(ToUpper(input.Substring(inputIndex, charLen)));
// Single code unit: the digraph characters listed below map to a dedicated
// title-case character; every other character is simply upper-cased.
594 switch (input[inputIndex])
597 // For AppCompat, the Titlecase Case Mapping data from NDP 2.0 is used below.
598 case (char) 0x01C4: // DZ with Caron -> Dz with Caron
599 case (char) 0x01C5: // Dz with Caron -> Dz with Caron
600 case (char) 0x01C6: // dz with Caron -> Dz with Caron
601 result.Append((char) 0x01C5);
603 case (char) 0x01C7: // LJ -> Lj
604 case (char) 0x01C8: // Lj -> Lj
605 case (char) 0x01C9: // lj -> Lj
606 result.Append((char) 0x01C8);
608 case (char) 0x01CA: // NJ -> Nj
609 case (char) 0x01CB: // Nj -> Nj
610 case (char) 0x01CC: // nj -> Nj
611 result.Append((char) 0x01CB);
613 case (char) 0x01F1: // DZ -> Dz
614 case (char) 0x01F2: // Dz -> Dz
615 case (char) 0x01F3: // dz -> Dz
616 result.Append((char) 0x01F2);
// Any other character: culture-aware upper-casing through ToUpper(char).
619 result.Append(ToUpper(input[inputIndex]));
627 // Used in ToTitleCase():
628 // When we find a starting letter, the following bit mask decides whether a category should be
629 // considered a word separator or not.
631 private const int c_wordSeparatorMask =
632 /* false */ (0 << 0) | // UppercaseLetter = 0,
633 /* false */ (0 << 1) | // LowercaseLetter = 1,
634 /* false */ (0 << 2) | // TitlecaseLetter = 2,
635 /* false */ (0 << 3) | // ModifierLetter = 3,
636 /* false */ (0 << 4) | // OtherLetter = 4,
637 /* false */ (0 << 5) | // NonSpacingMark = 5,
638 /* false */ (0 << 6) | // SpacingCombiningMark = 6,
639 /* false */ (0 << 7) | // EnclosingMark = 7,
640 /* false */ (0 << 8) | // DecimalDigitNumber = 8,
641 /* false */ (0 << 9) | // LetterNumber = 9,
642 /* false */ (0 << 10) | // OtherNumber = 10,
643 /* true */ (1 << 11) | // SpaceSeparator = 11,
644 /* true */ (1 << 12) | // LineSeparator = 12,
645 /* true */ (1 << 13) | // ParagraphSeparator = 13,
646 /* true */ (1 << 14) | // Control = 14,
647 /* true */ (1 << 15) | // Format = 15,
648 /* false */ (0 << 16) | // Surrogate = 16,
649 /* false */ (0 << 17) | // PrivateUse = 17,
650 /* true */ (1 << 18) | // ConnectorPunctuation = 18,
651 /* true */ (1 << 19) | // DashPunctuation = 19,
652 /* true */ (1 << 20) | // OpenPunctuation = 20,
653 /* true */ (1 << 21) | // ClosePunctuation = 21,
654 /* true */ (1 << 22) | // InitialQuotePunctuation = 22,
655 /* true */ (1 << 23) | // FinalQuotePunctuation = 23,
656 /* true */ (1 << 24) | // OtherPunctuation = 24,
657 /* true */ (1 << 25) | // MathSymbol = 25,
658 /* true */ (1 << 26) | // CurrencySymbol = 26,
659 /* true */ (1 << 27) | // ModifierSymbol = 27,
660 /* true */ (1 << 28) | // OtherSymbol = 28,
661 /* false */ (0 << 29); // OtherNotAssigned = 29;
// Returns true when the bit for the given category is set in
// c_wordSeparatorMask. The category's integer value is used directly as the
// bit position.
663 private static bool IsWordSeparator(UnicodeCategory category)
665 return (c_wordSeparatorMask & (1 << (int) category)) != 0;
// Returns true for the five letter categories: uppercase, lowercase,
// titlecase, modifier and other letter.
668 private static bool IsLetterCategory(UnicodeCategory uc)
670 return (uc == UnicodeCategory.UppercaseLetter
671 || uc == UnicodeCategory.LowercaseLetter
672 || uc == UnicodeCategory.TitlecaseLetter
673 || uc == UnicodeCategory.ModifierLetter
674 || uc == UnicodeCategory.OtherLetter);
678 // Get case-insensitive hash code for the specified string.
// Computes a hash code for str in which ASCII letters hash the same in
// either case. Throws ArgumentNullException for str. The ASCII loop hands the
// whole string to GetCaseInsensitiveHashCodeSlow on its fallback path.
680 internal unsafe int GetCaseInsensitiveHashCode(string str)
685 throw new ArgumentNullException(nameof(str));
688 // This code assumes that ASCII casing is safe for whatever context is passed in.
689 // this is true today, because we only ever call these methods on Invariant. It would be ideal to refactor
690 // these methods so they were correct by construction and we could only ever use Invariant.
695 // Note: We assume that str contains only ASCII characters until
696 // we hit a non-ASCII character to optimize the common case.
697 for (int i = 0; i < str.Length; i++)
// Fallback: restart with the slow path over the full string.
702 return GetCaseInsensitiveHashCodeSlow(str);
705 // If we have a lowercase character, ANDing off 0x20
706 // will make it an uppercase character.
// NOTE(review): this range test relies on unsigned wrap-around for c < 'a';
// c is assigned a uint on the next line, but its declaration is not visible
// here - confirm it is declared uint.
707 if ((c - 'a') <= ('z' - 'a'))
709 c = (uint)((int)c & ~0x20);
// hash = hash * 33, XORed with the (upper-cased) character.
712 hash = ((hash << 5) + hash) ^ c;
718 private unsafe int GetCaseInsensitiveHashCodeSlow(string str)
720 Debug.Assert(str != null);
722 string upper = ToUpper(str);
727 for (int i = 0; i < upper.Length; i++)
730 hash = ((hash << 5) + hash) ^ c;