1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 ////////////////////////////////////////////////////////////////////////////
8 // Purpose: This Class defines behaviors specific to a writing system.
9 // A writing system is the collection of scripts and
10 // orthographic rules required to represent a language as text.
13 ////////////////////////////////////////////////////////////////////////////
15 using System.Diagnostics;
16 using System.Runtime.Serialization;
19 namespace System.Globalization
21 public partial class TextInfo : ICloneable, IDeserializationCallback
// Three-state flag stored as a byte. The member list is not visible here; other code in this
// file references Tristate.NotInitialized, Tristate.True and Tristate.False.
23 private enum Tristate : byte
// Cached list separator: filled from _cultureData.SLIST on first read of ListSeparator,
// or assigned directly by the ListSeparator setter.
30 private string _listSeparator;
// Read-only flag surfaced by IsReadOnly and written by SetReadOnlyState.
31 private bool _isReadOnly = false;
33 /* _cultureName is the name of the creating culture.
34 _cultureData is the data that backs this class.
35 _textInfoName is the actual name of the textInfo (from cultureData.STEXTINFO)
36 In the desktop, when we call the sorting dll, it doesn't
37 know how to resolve custom locale names to sort ids, so we have to have already resolved this.
40 private readonly string _cultureName; // Name of the culture that created this text info
41 private readonly CultureData _cultureData; // Data record for the culture that made us, not for this textinfo
42 private readonly string _textInfoName; // Name of the text info we're using (ie: _cultureData.STEXTINFO)
// Lazily computed cache behind the IsAsciiCasingSameAsInvariant property; starts as NotInitialized.
44 private Tristate _isAsciiCasingSameAsInvariant = Tristate.NotInitialized;
46 // _invariantMode caches GlobalizationMode.Invariant in an instance field for performance: reading the instance field is faster than reading the static property.
47 private readonly bool _invariantMode = GlobalizationMode.Invariant;
49 // Invariant text info
// Creates the shared instance from CultureData.Invariant when s_Invariant is observed to be null.
// NOTE(review): no lock is visible around the null check, so concurrent first callers could each
// construct an instance - confirm that is acceptable.
50 internal static TextInfo Invariant
54 if (s_Invariant == null)
55 s_Invariant = new TextInfo(CultureData.Invariant);
// Backing field for Invariant; declared volatile.
59 internal volatile static TextInfo s_Invariant;
61 //////////////////////////////////////////////////////////////////////////
63 //// TextInfo Constructors
65 //// Implements CultureInfo.TextInfo.
67 //////////////////////////////////////////////////////////////////////////
// Builds a TextInfo backed by the given culture data record.
// Captures the culture name (cultureData.CultureName) and the text info name (cultureData.STEXTINFO),
// then calls FinishInitialization(), which is defined outside the visible part of this file.
// NOTE(review): cultureData is dereferenced without a visible null check - presumably callers never pass null.
68 internal TextInfo(CultureData cultureData)
70 // This is our primary data source, we don't need most of the rest of this
71 _cultureData = cultureData;
72 _cultureName = _cultureData.CultureName;
73 _textInfoName = _cultureData.STEXTINFO;
75 FinishInitialization();
// Explicit IDeserializationCallback implementation. The visible body throws
// PlatformNotSupportedException; the sender argument is not used in the visible code.
78 void IDeserializationCallback.OnDeserialization(Object sender)
80 throw new PlatformNotSupportedException();
84 // Internal ordinal comparison functions
// Returns the case-insensitive hash code of s as computed by the Invariant text info
// (Invariant.GetCaseInsensitiveHashCode).
87 internal static int GetHashCodeOrdinalIgnoreCase(string s)
89 // This is the same as a case-insensitive hash for Invariant
90 // (not necessarily true for sorting, but OK for casing & then we apply normal hash code rules)
91 return Invariant.GetCaseInsensitiveHashCode(s);
94 // Currently we don't have native functions to do this, so we do it the hard way
// Case-insensitive ordinal forward search for value within source[startIndex, startIndex + count).
// The range check below rejects a negative startIndex or count, a count larger than the source,
// and a startIndex that leaves fewer than count characters.
// NOTE(review): the statement executed when the range check fails is not visible here - presumably it returns -1.
// Otherwise the work is delegated to the invariant culture's CompareInfo.IndexOfOrdinal with ignoreCase: true.
95 internal static int IndexOfStringOrdinalIgnoreCase(string source, string value, int startIndex, int count)
97 if (count > source.Length || count < 0 || startIndex < 0 || startIndex > source.Length - count)
102 return CultureInfo.InvariantCulture.CompareInfo.IndexOfOrdinal(source, value, startIndex, count, ignoreCase: true);
105 // Currently we don't have native functions to do this, so we do it the hard way
// Case-insensitive ordinal backward search for value within source.
// The range check below rejects a negative startIndex or count, a count larger than the source,
// a startIndex past the last character, and a window (startIndex - count + 1) that would start before index 0.
// NOTE(review): the statement executed when the range check fails is not visible here - presumably it returns -1.
// Otherwise the work is delegated to the invariant culture's CompareInfo.LastIndexOfOrdinal with ignoreCase: true.
106 internal static int LastIndexOfStringOrdinalIgnoreCase(string source, string value, int startIndex, int count)
108 if (count > source.Length || count < 0 || startIndex < 0 || startIndex > source.Length - 1 || (startIndex - count + 1 < 0))
113 return CultureInfo.InvariantCulture.CompareInfo.LastIndexOfOrdinal(source, value, startIndex, count, ignoreCase: true);
// Code page identifiers; each property returns the corresponding IDEFAULT*CODEPAGE value
// of the backing culture data without further processing.
116 public virtual int ANSICodePage => _cultureData.IDEFAULTANSICODEPAGE;
118 public virtual int OEMCodePage => _cultureData.IDEFAULTOEMCODEPAGE;
120 public virtual int MacCodePage => _cultureData.IDEFAULTMACCODEPAGE;
122 public virtual int EBCDICCodePage => _cultureData.IDEFAULTEBCDICCODEPAGE;
124 // Just use the LCID from our text info name
// Resolved through CultureInfo.GetCultureInfo(_textInfoName) on every access; no caching is visible here.
125 public int LCID => CultureInfo.GetCultureInfo(_textInfoName).LCID;
// Note: this returns the text info name (_textInfoName), not _cultureName.
127 public string CultureName => _textInfoName;
// True when this instance has been marked read-only via SetReadOnlyState(true).
129 public bool IsReadOnly => _isReadOnly;
131 //////////////////////////////////////////////////////////////////////////
135 //// Is the implementation of ICloneable.
137 //////////////////////////////////////////////////////////////////////////
// Makes a member-wise copy of this instance and clears the copy's read-only flag,
// so the clone is writable even when the source is read-only.
// NOTE(review): the return statement is not visible here - presumably returns o.
138 public virtual object Clone()
140 object o = MemberwiseClone();
141 ((TextInfo)o).SetReadOnlyState(false);
145 ////////////////////////////////////////////////////////////////////////
149 // Create a cloned readonly instance or return the input one if it is
152 ////////////////////////////////////////////////////////////////////////
// Returns a read-only view of textInfo.
//   textInfo: the instance to wrap; null throws ArgumentNullException.
//   returns:  textInfo itself when it is already read-only; otherwise a member-wise clone
//             whose read-only flag has been set. The input instance is not modified.
153 public static TextInfo ReadOnly(TextInfo textInfo)
155 if (textInfo == null) { throw new ArgumentNullException(nameof(textInfo)); }
156 if (textInfo.IsReadOnly) { return textInfo; }
158 TextInfo clonedTextInfo = (TextInfo)(textInfo.MemberwiseClone());
159 clonedTextInfo.SetReadOnlyState(true);
161 return clonedTextInfo;
// Throws InvalidOperationException with the SR.InvalidOperation_ReadOnly message.
// NOTE(review): the guarding condition is not visible here - presumably the throw happens only when _isReadOnly is true.
164 private void VerifyWritable()
168 throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
// Sets or clears the read-only flag reported by IsReadOnly. Used by Clone (false) and ReadOnly (true).
172 internal void SetReadOnlyState(bool readOnly)
174 _isReadOnly = readOnly;
178 ////////////////////////////////////////////////////////////////////////
182 // Returns the string used to separate items in a list.
184 ////////////////////////////////////////////////////////////////////////
// Getter: lazily loads the separator from _cultureData.SLIST on first use and caches it in _listSeparator.
// Setter: stores the supplied value; an ArgumentNullException (SR.ArgumentNull_String) is thrown from the setter path.
// NOTE(review): the null test guarding the throw, and any writability check, are not visible here - confirm.
185 public virtual string ListSeparator
189 if (_listSeparator == null)
191 _listSeparator = _cultureData.SLIST;
193 return _listSeparator;
200 throw new ArgumentNullException(nameof(value), SR.ArgumentNull_String);
203 _listSeparator = value;
207 ////////////////////////////////////////////////////////////////////////
211 // Converts the character or string to lower case. Certain locales
212 // have different casing semantics from the file systems in Win32.
214 ////////////////////////////////////////////////////////////////////////
// Lower-cases a single character.
// Fast path: in invariant mode, or when c is ASCII and this text info's ASCII casing matches Invariant,
// the ASCII-only helper is used. Otherwise the work goes to ChangeCase (defined outside the visible
// part of this file) with toUpper: false.
215 public unsafe virtual char ToLower(char c)
217 if (_invariantMode || (IsAscii(c) && IsAsciiCasingSameAsInvariant))
219 return ToLowerAsciiInvariant(c);
222 return ChangeCase(c, toUpper: false);
// Lower-cases a string. Throws ArgumentNullException when str is null.
// Two paths are visible: the ASCII-only helper ToLowerAsciiInvariant(str), and ChangeCase(str, toUpper: false).
// NOTE(review): the condition selecting the ASCII path is not visible here - presumably it tests _invariantMode.
225 public unsafe virtual string ToLower(string str)
227 if (str == null) { throw new ArgumentNullException(nameof(str)); }
231 return ToLowerAsciiInvariant(str);
234 return ChangeCase(str, toUpper: false);
// Lower-cases the ASCII letters 'A'..'Z' of s using pointer access.
// NOTE(review): several loop headers and early returns are not visible here; the comments below
// describe only the visible statements.
237 private unsafe string ToLowerAsciiInvariant(string s)
244 fixed (char* pSource = s)
// Single unsigned comparison that is true exactly when pSource[i] is in 'A'..'Z'
// (values below 'A' wrap to large unsigned numbers and fail the test).
249 if ((uint)(pSource[i] - 'A') <= (uint)('Z' - 'A'))
// A result string of the same length is allocated and filled in place through pResult.
261 string result = string.FastAllocateString(s.Length);
262 fixed (char* pResult = result)
// Copy the prefix [0, i) unchanged.
264 for (int j = 0; j < i; j++)
266 pResult[j] = pSource[j];
// Setting bit 0x20 maps an ASCII uppercase letter to its lowercase form.
269 pResult[i] = (char)(pSource[i] | 0x20);
// Per-character helper; it leaves anything outside 'A'..'Z' unchanged.
274 pResult[i] = ToLowerAsciiInvariant(pSource[i]);
// Upper-cases the ASCII letters 'a'..'z' of s using pointer access; mirror image of ToLowerAsciiInvariant(string).
// NOTE(review): several loop headers and early returns are not visible here; the comments below
// describe only the visible statements.
283 private unsafe string ToUpperAsciiInvariant(string s)
290 fixed (char* pSource = s)
// Single unsigned comparison that is true exactly when pSource[i] is in 'a'..'z'.
295 if ((uint)(pSource[i] - 'a') <= (uint)('z' - 'a'))
// A result string of the same length is allocated and filled in place through pResult.
307 string result = string.FastAllocateString(s.Length);
308 fixed (char* pResult = result)
// Copy the prefix [0, i) unchanged.
310 for (int j = 0; j < i; j++)
312 pResult[j] = pSource[j];
// Clearing bit 0x20 maps an ASCII lowercase letter to its uppercase form.
315 pResult[i] = (char)(pSource[i] & ~0x20);
// Per-character helper; it leaves anything outside 'a'..'z' unchanged.
320 pResult[i] = ToUpperAsciiInvariant(pSource[i]);
// Lower-cases c when it is an ASCII uppercase letter ('A'..'Z') by setting bit 0x20.
// The range test is a single unsigned comparison: values below 'A' wrap around and fail it.
// NOTE(review): the return statement is not visible here - presumably returns c.
329 private static char ToLowerAsciiInvariant(char c)
331 if ((uint)(c - 'A') <= (uint)('Z' - 'A'))
333 c = (char)(c | 0x20);
338 ////////////////////////////////////////////////////////////////////////
342 // Converts the character or string to upper case. Certain locales
343 // have different casing semantics from the file systems in Win32.
345 ////////////////////////////////////////////////////////////////////////
// Upper-cases a single character.
// Fast path: in invariant mode, or when c is ASCII and this text info's ASCII casing matches Invariant,
// the ASCII-only helper is used. Otherwise the work goes to ChangeCase (defined outside the visible
// part of this file) with toUpper: true.
346 public unsafe virtual char ToUpper(char c)
348 if (_invariantMode || (IsAscii(c) && IsAsciiCasingSameAsInvariant))
350 return ToUpperAsciiInvariant(c);
353 return ChangeCase(c, toUpper: true);
// Upper-cases a string. Throws ArgumentNullException when str is null.
// Two paths are visible: the ASCII-only helper ToUpperAsciiInvariant(str), and ChangeCase(str, toUpper: true).
// NOTE(review): the condition selecting the ASCII path is not visible here - presumably it tests _invariantMode.
356 public unsafe virtual string ToUpper(string str)
358 if (str == null) { throw new ArgumentNullException(nameof(str)); }
362 return ToUpperAsciiInvariant(str);
365 return ChangeCase(str, toUpper: true);
// Upper-cases c when it is an ASCII lowercase letter ('a'..'z') by clearing bit 0x20.
// The range test is a single unsigned comparison: values below 'a' wrap around and fail it.
// NOTE(review): the return statement is not visible here - presumably returns c.
368 internal static char ToUpperAsciiInvariant(char c)
370 if ((uint)(c - 'a') <= (uint)('z' - 'a'))
372 c = (char)(c & ~0x20);
// Predicate used by ToLower(char) and ToUpper(char) to gate the ASCII fast path.
// NOTE(review): the body is not visible here - presumably tests c < 0x80; confirm.
377 private static bool IsAscii(char c)
// Lazily determines whether the ASCII fast path may be used for this text info.
// On first access, the culture named by _textInfoName compares the lowercase alphabet with the
// uppercase alphabet under CompareOptions.IgnoreCase; a result of 0 (equal) is cached as Tristate.True,
// anything else as Tristate.False. Later accesses read the cached value.
382 private bool IsAsciiCasingSameAsInvariant
386 if (_isAsciiCasingSameAsInvariant == Tristate.NotInitialized)
388 _isAsciiCasingSameAsInvariant = CultureInfo.GetCultureInfo(_textInfoName).CompareInfo.Compare("abcdefghijklmnopqrstuvwxyz",
389 "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
390 CompareOptions.IgnoreCase) == 0 ? Tristate.True : Tristate.False;
392 return _isAsciiCasingSameAsInvariant == Tristate.True;
398 // Returns true if the dominant direction of text and UI such as the relative position of buttons and scroll bars
// (The remainder of the sentence above is not visible here.) The value is read directly from _cultureData.IsRightToLeft.
400 public bool IsRightToLeft => _cultureData.IsRightToLeft;
402 ////////////////////////////////////////////////////////////////////////
406 // Implements Object.Equals(). Returns a boolean indicating whether
407 // or not object refers to a TextInfo with the same CultureName as the current instance.
409 ////////////////////////////////////////////////////////////////////////
// Equality is based on CultureName: obj is converted with "as TextInfo" and the two CultureName strings are compared.
// NOTE(review): the null check on "that" and the fallback return are not visible here - presumably
// a non-TextInfo or null argument yields false.
410 public override bool Equals(Object obj)
412 TextInfo that = obj as TextInfo;
416 return CultureName.Equals(that.CultureName);
422 ////////////////////////////////////////////////////////////////////////
426 // Implements Object.GetHashCode(). Returns the hash code of
427 // CultureName. The hash code is guaranteed to be the same for TextInfo A
428 // and B where A.Equals(B) is true.
430 ////////////////////////////////////////////////////////////////////////
// Hash of CultureName, the same string that Equals compares, so equal instances hash alike.
431 public override int GetHashCode()
433 return CultureName.GetHashCode();
436 ////////////////////////////////////////////////////////////////////////
440 // Implements Object.ToString(). Returns a string describing the
443 ////////////////////////////////////////////////////////////////////////
// Returns "TextInfo - " followed by the culture name of the backing culture data.
// Note: this uses _cultureData.CultureName, whereas the CultureName property returns _textInfoName.
444 public override string ToString()
446 return "TextInfo - " + _cultureData.CultureName;
452 // Titlecasing refers to a casing practice wherein the first letter of a word is an uppercase letter
453 // and the rest of the letters are lowercase. The choice of which words to titlecase in headings
454 // and titles is dependent on language and local conventions. For example, "The Merry Wives of Windsor"
455 // is the appropriate titlecasing of that play's name in English, with the word "of" not titlecased.
456 // In German, however, the title is "Die lustigen Weiber von Windsor," and both "lustigen" and "von"
457 // are not titlecased. In French even fewer words are titlecased: "Les joyeuses commeres de Windsor."
459 // Moreover, the determination of what actually constitutes a word is language dependent, and this can
460 // influence which letter or letters of a "word" are uppercased when titlecasing strings. For example
461 // "l'arbre" is considered two words in French, whereas "can't" is considered one word in English.
// Converts str to title case: the first letter of each word goes through AddTitlecaseLetter and the
// rest of the word is appended either lower-cased or unchanged. Returns the built string.
// Throws ArgumentNullException for str (the null test itself is not visible here).
// NOTE(review): many short statements (braces, index updates, breaks, the charLen declaration) are not
// visible here; the comments below describe only the visible statements.
463 public unsafe string ToTitleCase(string str)
467 throw new ArgumentNullException(nameof(str));
474 StringBuilder result = new StringBuilder();
// Lower-cased copy of the whole input, created lazily the first time a word needs lowercasing.
475 string lowercaseData = null;
476 // Store if the current culture is Dutch (special case)
// NOTE(review): the prefix tested is "nl-", so a CultureName of exactly "nl" is not treated as Dutch - confirm intended.
477 bool isDutchCulture = CultureName.StartsWith("nl-", StringComparison.OrdinalIgnoreCase);
479 for (int i = 0; i < str.Length; i++)
481 UnicodeCategory charType;
// charLen receives the number of UTF-16 code units of the character at i (asserted to be 1 or 2 in the helpers).
484 charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);
485 if (char.CheckLetter(charType))
487 // Special case to check for Dutch specific titlecasing with "IJ" characters
488 // at the beginning of a word
489 if (isDutchCulture && i < str.Length - 1 && (str[i] == 'i' || str[i] == 'I') && (str[i+1] == 'j' || str[i+1] == 'J'))
496 // Do the titlecasing for the first character of the word.
// AddTitlecaseLetter returns the index of the last code unit it consumed, hence the + 1.
497 i = AddTitlecaseLetter(ref result, ref str, i, charLen) + 1;
501 // Convert the characters until the end of this word
// Start of the pending run that has been scanned but not yet appended to result.
504 int lowercaseStart = i;
507 // Use hasLowerCase flag to prevent from lowercasing acronyms (like "URT", "USA", etc)
508 // This is in line with Word 2000 behavior of titlecasing.
510 bool hasLowerCase = (charType == UnicodeCategory.LowercaseLetter);
511 // Use a loop to find all of the other letters following this letter.
512 while (i < str.Length)
514 charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);
515 if (IsLetterCategory(charType))
517 if (charType == UnicodeCategory.LowercaseLetter)
// Apostrophe inside a word: the pending run is flushed, either from the lower-cased copy or from
// the original text. NOTE(review): the condition choosing between the two appends is not visible
// here - presumably hasLowerCase.
523 else if (str[i] == '\'')
528 if (lowercaseData == null)
530 lowercaseData = ToLower(str);
532 result.Append(lowercaseData, lowercaseStart, i - lowercaseStart);
536 result.Append(str, lowercaseStart, i - lowercaseStart);
541 else if (!IsWordSeparator(charType))
543 // This category is considered to be part of the word.
544 // This is any category whose bit is not set in c_wordSeparatorMask.
549 // A word separator. Break out of the loop.
// Length of the pending run at the end of the word; flushed the same way as in the apostrophe case.
554 int count = i - lowercaseStart;
560 if (lowercaseData == null)
562 lowercaseData = ToLower(str);
564 result.Append(lowercaseData, lowercaseStart, count);
568 result.Append(str, lowercaseStart, count);
574 // not a letter, just append it
575 i = AddNonLetter(ref result, ref str, i, charLen);
580 // not a letter, just append it
581 i = AddNonLetter(ref result, ref str, i, charLen);
584 return result.ToString();
// Appends the character at inputIndex to result unchanged. charLen is asserted to be 1 or 2.
// Two paths are visible: one appends two code units (advancing inputIndex past the first), the other appends one.
// NOTE(review): the branch condition and the return statement are not visible here - presumably the
// two-unit path is taken when charLen == 2 and the method returns the updated inputIndex.
587 private static int AddNonLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
589 Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddNonLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");
593 result.Append(input[inputIndex++]);
594 result.Append(input[inputIndex]);
598 result.Append(input[inputIndex]);
// Appends the title-case form of the character at inputIndex to result. charLen is asserted to be 1 or 2.
// Surrogate pairs are upper-cased as a substring; four digraph families map to their dedicated
// title-case code points; every other character goes through ToUpper(char).
// NOTE(review): the branch conditions, break statements and return statement are not visible here.
603 private int AddTitlecaseLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
605 Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddTitlecaseLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");
607 // for surrogate pairs do a simple ToUpper operation on the substring
611 result.Append(ToUpper(input.Substring(inputIndex, charLen)));
616 switch (input[inputIndex])
619 // For AppCompat, the Titlecase Case Mapping data from NDP 2.0 is used below.
620 case (char) 0x01C4: // DZ with Caron -> Dz with Caron
621 case (char) 0x01C5: // Dz with Caron -> Dz with Caron
622 case (char) 0x01C6: // dz with Caron -> Dz with Caron
623 result.Append((char) 0x01C5);
625 case (char) 0x01C7: // LJ -> Lj
626 case (char) 0x01C8: // Lj -> Lj
627 case (char) 0x01C9: // lj -> Lj
628 result.Append((char) 0x01C8);
630 case (char) 0x01CA: // NJ -> Nj
631 case (char) 0x01CB: // Nj -> Nj
632 case (char) 0x01CC: // nj -> Nj
633 result.Append((char) 0x01CB);
635 case (char) 0x01F1: // DZ -> Dz
636 case (char) 0x01F2: // Dz -> Dz
637 case (char) 0x01F3: // dz -> Dz
638 result.Append((char) 0x01F2);
// Fallback for every character that is not one of the digraphs above.
641 result.Append(ToUpper(input[inputIndex]));
649 // Used in ToTitleCase():
650 // When we find a starting letter, the following bit mask decides if a category should be
651 // considered as word separator or not.
// Bit n is set when the UnicodeCategory with numeric value n is treated as a word separator.
// Tested by IsWordSeparator via (1 << (int) category).
653 private const int c_wordSeparatorMask =
654 /* false */ (0 << 0) | // UppercaseLetter = 0,
655 /* false */ (0 << 1) | // LowercaseLetter = 1,
656 /* false */ (0 << 2) | // TitlecaseLetter = 2,
657 /* false */ (0 << 3) | // ModifierLetter = 3,
658 /* false */ (0 << 4) | // OtherLetter = 4,
659 /* false */ (0 << 5) | // NonSpacingMark = 5,
660 /* false */ (0 << 6) | // SpacingCombiningMark = 6,
661 /* false */ (0 << 7) | // EnclosingMark = 7,
662 /* false */ (0 << 8) | // DecimalDigitNumber = 8,
663 /* false */ (0 << 9) | // LetterNumber = 9,
664 /* false */ (0 << 10) | // OtherNumber = 10,
665 /* true */ (1 << 11) | // SpaceSeparator = 11,
666 /* true */ (1 << 12) | // LineSeparator = 12,
667 /* true */ (1 << 13) | // ParagraphSeparator = 13,
668 /* true */ (1 << 14) | // Control = 14,
669 /* true */ (1 << 15) | // Format = 15,
670 /* false */ (0 << 16) | // Surrogate = 16,
671 /* false */ (0 << 17) | // PrivateUse = 17,
672 /* true */ (1 << 18) | // ConnectorPunctuation = 18,
673 /* true */ (1 << 19) | // DashPunctuation = 19,
674 /* true */ (1 << 20) | // OpenPunctuation = 20,
675 /* true */ (1 << 21) | // ClosePunctuation = 21,
676 /* true */ (1 << 22) | // InitialQuotePunctuation = 22,
677 /* true */ (1 << 23) | // FinalQuotePunctuation = 23,
678 /* true */ (1 << 24) | // OtherPunctuation = 24,
679 /* true */ (1 << 25) | // MathSymbol = 25,
680 /* true */ (1 << 26) | // CurrencySymbol = 26,
681 /* true */ (1 << 27) | // ModifierSymbol = 27,
682 /* true */ (1 << 28) | // OtherSymbol = 28,
683 /* false */ (0 << 29); // OtherNotAssigned = 29;
// Returns true when the bit for category is set in c_wordSeparatorMask,
// i.e. the category ends a word during title-casing.
685 private static bool IsWordSeparator(UnicodeCategory category)
687 return (c_wordSeparatorMask & (1 << (int) category)) != 0;
// Returns true for the five letter categories: uppercase, lowercase, titlecase, modifier and other letter.
690 private static bool IsLetterCategory(UnicodeCategory uc)
692 return (uc == UnicodeCategory.UppercaseLetter
693 || uc == UnicodeCategory.LowercaseLetter
694 || uc == UnicodeCategory.TitlecaseLetter
695 || uc == UnicodeCategory.ModifierLetter
696 || uc == UnicodeCategory.OtherLetter);
700 // Get case-insensitive hash code for the specified string.
// Computes a case-insensitive hash of str by folding ASCII lowercase letters to uppercase on the fly
// and falling back to GetCaseInsensitiveHashCodeSlow for the whole string when needed.
// Throws ArgumentNullException for str (the null test itself is not visible here).
// NOTE(review): the hash seed, the declaration of c, the non-ASCII test and the final return are not visible here.
702 internal unsafe int GetCaseInsensitiveHashCode(string str)
707 throw new ArgumentNullException(nameof(str));
710 // This code assumes that ASCII casing is safe for whatever context is passed in.
711 // this is true today, because we only ever call these methods on Invariant. It would be ideal to refactor
712 // these methods so they were correct by construction and we could only ever use Invariant.
717 // Note: We assume that str contains only ASCII characters until
718 // we hit a non-ASCII character to optimize the common case.
719 for (int i = 0; i < str.Length; i++)
// Slow path: restarts over the entire string rather than continuing from index i.
724 return GetCaseInsensitiveHashCodeSlow(str);
727 // If we have a lowercase character, ANDing off 0x20
728 // will make it an uppercase character.
// NOTE(review): this range test is only correct if c is unsigned (values below 'a' must wrap around);
// the (uint) cast on the next statement suggests c is a uint - confirm against the declaration.
729 if ((c - 'a') <= ('z' - 'a'))
731 c = (uint)((int)c & ~0x20);
// hash = hash * 33 XOR c.
734 hash = ((hash << 5) + hash) ^ c;
740 private unsafe int GetCaseInsensitiveHashCodeSlow(string str)
742 Debug.Assert(str != null);
744 string upper = ToUpper(str);
749 for (int i = 0; i < upper.Length; i++)
752 hash = ((hash << 5) + hash) ^ c;