1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtCore module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
42 // Don't define it while compiling this module, or USERS of Qt will
43 // not be able to link.
44 #ifdef QT_NO_CAST_FROM_ASCII
45 # undef QT_NO_CAST_FROM_ASCII
47 #ifdef QT_NO_CAST_TO_ASCII
48 # undef QT_NO_CAST_TO_ASCII
52 #include "qdatastream.h"
54 #include "qunicodetables_p.h"
55 #include "qunicodetables.cpp"
// Expands to an int with only bit x set. The classification functions in this
// file OR several FLAG(category) values into a mask and test a character's
// category against that mask with a single AND.
59 #define FLAG(x) (1 << (x))
63 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
65 \ingroup string-processing
67 This class is only useful to construct a QChar with 8-bit character.
69 \sa QChar, QLatin1String, QString
73 \fn const char QLatin1Char::toLatin1() const
75 Converts a Latin-1 character to an 8-bit ASCII representation of the character.
79 \fn const ushort QLatin1Char::unicode() const
81 Converts a Latin-1 character to a 16-bit-encoded Unicode representation
86 \fn QLatin1Char::QLatin1Char(char c)
88 Constructs a Latin-1 character for \a c. This constructor should be
89 used when the encoding of the input character is known to be Latin-1.
94 \brief The QChar class provides a 16-bit Unicode character.
96 \ingroup string-processing
99 In Qt, Unicode characters are 16-bit entities without any markup
100 or structure. This class represents such an entity. It is
101 lightweight, so it can be used everywhere. Most compilers treat
102 it like an \c{unsigned short}.
104 QChar provides a full complement of testing/classification
105 functions, converting to and from other formats, converting from
106 composed to decomposed Unicode, and trying to compare and
107 case-convert if you ask it to.
109 The classification functions include functions like those in the
110 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
111 operating on the full range of Unicode characters, not just for the ASCII
112 range. They all return true if the character is a certain type of character;
113 otherwise they return false. These classification functions are
114 isNull() (returns true if the character is '\\0'), isPrint()
115 (true if the character is any sort of printable character,
116 including whitespace), isPunct() (any sort of punctuation),
117 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
118 sort of numeric character, not just 0-9), isLetterOrNumber(), and
119 isDigit() (decimal digits). All of these are wrappers around
120 category(), which returns the Unicode-defined category of each
121 character. Some of these also calculate the derived properties
122 (i.e. isSpace() returns true if the character is of category
123 Separator_* or an exceptional code point from Other_Control category).
125 QChar also provides direction(), which indicates the "natural"
126 writing direction of this character. The joining() function
127 indicates how the character joins with its neighbors (needed
128 mostly for Arabic) and finally hasMirrored(), which indicates
129 whether the character needs to be mirrored when it is printed in
130 its "unnatural" writing direction.
132 Composed Unicode characters (like \a ring) can be converted to
133 decomposed Unicode ("a" followed by "ring above") by using decomposition().
135 In Unicode, comparison is not necessarily possible and case
136 conversion is very difficult at best. Unicode, covering the
137 "entire" world, also includes most of the world's case and
138 sorting problems. operator==() and friends will do comparison
139 based purely on the numeric Unicode value (code point) of the
140 characters, and toUpper() and toLower() will do case changes when
141 the character has a well-defined uppercase/lowercase equivalent.
142 For locale-dependent comparisons, use QString::localeAwareCompare().
144 The conversion functions include unicode() (to a scalar),
145 toLatin1() (to scalar, but converts all non-Latin-1 characters to
146 0), row() (gives the Unicode row), cell() (gives the Unicode
147 cell), digitValue() (gives the integer value of any of the
148 numerous digit characters), and a host of constructors.
150 QChar provides constructors and cast operators that make it easy
151 to convert to and from traditional 8-bit \c{char}s. If you
152 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
153 explained in the QString documentation, you will need to
154 explicitly call fromLatin1(), or use QLatin1Char,
155 to construct a QChar from an 8-bit \c char, and you will need to
156 call toLatin1() to get the 8-bit value back.
158 For more information see
159 \l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
161 \sa Unicode, QString, QLatin1Char
165 \enum QChar::UnicodeVersion
167 Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
168 introduced a certain character.
170 \value Unicode_1_1 Version 1.1
171 \value Unicode_2_0 Version 2.0
172 \value Unicode_2_1_2 Version 2.1.2
173 \value Unicode_3_0 Version 3.0
174 \value Unicode_3_1 Version 3.1
175 \value Unicode_3_2 Version 3.2
176 \value Unicode_4_0 Version 4.0
177 \value Unicode_4_1 Version 4.1
178 \value Unicode_5_0 Version 5.0
179 \value Unicode_5_1 Version 5.1
180 \value Unicode_5_2 Version 5.2
181 \value Unicode_6_0 Version 6.0
182 \value Unicode_6_1 Version 6.1
183 \value Unicode_Unassigned The value is not assigned to any character
184 in version 6.1 of Unicode.
186 \sa unicodeVersion(), currentUnicodeVersion()
190 \enum QChar::Category
192 This enum maps the Unicode character categories.
194 The following categories are normative in Unicode:
196 \value Mark_NonSpacing Unicode class name Mn
198 \value Mark_SpacingCombining Unicode class name Mc
200 \value Mark_Enclosing Unicode class name Me
202 \value Number_DecimalDigit Unicode class name Nd
204 \value Number_Letter Unicode class name Nl
206 \value Number_Other Unicode class name No
208 \value Separator_Space Unicode class name Zs
210 \value Separator_Line Unicode class name Zl
212 \value Separator_Paragraph Unicode class name Zp
214 \value Other_Control Unicode class name Cc
216 \value Other_Format Unicode class name Cf
218 \value Other_Surrogate Unicode class name Cs
220 \value Other_PrivateUse Unicode class name Co
222 \value Other_NotAssigned Unicode class name Cn
225 The following categories are informative in Unicode:
227 \value Letter_Uppercase Unicode class name Lu
229 \value Letter_Lowercase Unicode class name Ll
231 \value Letter_Titlecase Unicode class name Lt
233 \value Letter_Modifier Unicode class name Lm
235 \value Letter_Other Unicode class name Lo
237 \value Punctuation_Connector Unicode class name Pc
239 \value Punctuation_Dash Unicode class name Pd
241 \value Punctuation_Open Unicode class name Ps
243 \value Punctuation_Close Unicode class name Pe
245 \value Punctuation_InitialQuote Unicode class name Pi
247 \value Punctuation_FinalQuote Unicode class name Pf
249 \value Punctuation_Other Unicode class name Po
251 \value Symbol_Math Unicode class name Sm
253 \value Symbol_Currency Unicode class name Sc
255 \value Symbol_Modifier Unicode class name Sk
257 \value Symbol_Other Unicode class name So
263 \enum QChar::Direction
265 This enum type defines the Unicode direction attributes. See the
266 \l{http://www.unicode.org/}{Unicode Standard} for a description
269 In order to conform to C/C++ naming conventions "Dir" is prepended
270 to the codes used in the Unicode Standard.
296 \enum QChar::Decomposition
298 This enum type defines the Unicode decomposition attributes. See
299 the \l{http://www.unicode.org/}{Unicode Standard} for a
300 description of the values.
302 \value NoDecomposition
327 This enum type defines the Unicode joining attributes. See the
328 \l{http://www.unicode.org/}{Unicode Standard} for a description
340 \enum QChar::CombiningClass
344 This enum type defines names for some of the Unicode combining
345 classes. See the \l{http://www.unicode.org/}{Unicode Standard}
346 for a description of the values.
348 \value Combining_Above
349 \value Combining_AboveAttached
350 \value Combining_AboveLeft
351 \value Combining_AboveLeftAttached
352 \value Combining_AboveRight
353 \value Combining_AboveRightAttached
354 \value Combining_Below
355 \value Combining_BelowAttached
356 \value Combining_BelowLeft
357 \value Combining_BelowLeftAttached
358 \value Combining_BelowRight
359 \value Combining_BelowRightAttached
360 \value Combining_DoubleAbove
361 \value Combining_DoubleBelow
362 \value Combining_IotaSubscript
363 \value Combining_Left
364 \value Combining_LeftAttached
365 \value Combining_Right
366 \value Combining_RightAttached
370 \enum QChar::SpecialCharacter
372 \value Null A QChar with this value isNull().
373 \value Tabulation Character tabulation.
375 \value CarriageReturn
377 \value Nbsp Non-breaking space.
378 \value ReplacementCharacter The character shown when a font has no glyph
379 for a certain codepoint. A special question mark character is often
380 used. Codecs use this codepoint when input data cannot be
381 represented in Unicode.
382 \value ObjectReplacementCharacter Used to represent an object such as an
383 image when such objects cannot be presented.
385 \value ByteOrderSwapped
386 \value ParagraphSeparator
388 \value LastValidCodePoint
392 \fn void QChar::setCell(uchar cell)
397 \fn void QChar::setRow(uchar row)
404 Constructs a null QChar ('\\0').
410 \fn QChar::QChar(QLatin1Char ch)
412 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
416 \fn QChar::QChar(SpecialCharacter ch)
418 Constructs a QChar for the predefined character value \a ch.
422 \fn QChar::QChar(char ch)
424 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
428 \fn QChar::QChar(uchar ch)
430 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
434 \fn QChar::QChar(uchar cell, uchar row)
436 Constructs a QChar for Unicode cell \a cell in row \a row.
442 \fn QChar::QChar(ushort code)
444 Constructs a QChar for the character with Unicode code point \a code.
448 \fn QChar::QChar(short code)
450 Constructs a QChar for the character with Unicode code point \a code.
454 \fn QChar::QChar(uint code)
456 Constructs a QChar for the character with Unicode code point \a code.
460 \fn QChar::QChar(int code)
462 Constructs a QChar for the character with Unicode code point \a code.
466 \fn bool QChar::isNull() const
468 Returns true if the character is the Unicode character 0x0000
469 ('\\0'); otherwise returns false.
473 \fn uchar QChar::cell() const
475 Returns the cell (least significant byte) of the Unicode character.
481 \fn uchar QChar::row() const
483 Returns the row (most significant byte) of the Unicode character.
489 \fn bool QChar::isPrint() const
491 Returns true if the character is a printable character; otherwise
492 returns false. This is any character not of category Other_*.
494 Note that this gives no indication of whether the character is
495 available in a particular font.
502 Returns true if the UCS-4-encoded character specified by \a ucs4 is
503 a printable character; otherwise returns false.
504 This is any character not of category Other_*.
506 Note that this gives no indication of whether the character is
507 available in a particular font.
// Overload taking a UCS-4 code point. Builds a mask of Other_* categories and
// returns true only when the category looked up via qGetProp(ucs4) is NOT in
// that mask (note the negation in the return statement).
509 bool QChar::isPrint(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
511 if (ucs4 > LastValidCodePoint)
513 const int test = FLAG(Other_Control) |
515 FLAG(Other_Surrogate) |
516 FLAG(Other_PrivateUse) |
517 FLAG(Other_NotAssigned);
518 return !(FLAG(qGetProp(ucs4)->category) & test);
522 \fn bool QChar::isSpace() const
524 Returns true if the character is a separator character
525 (Separator_* categories or certain code points from Other_Control category);
526 otherwise returns false.
530 \fn bool QChar::isSpace(uint ucs4)
534 Returns true if the UCS-4-encoded character specified by \a ucs4 is
535 a separator character (Separator_* categories or certain code points
536 from Other_Control category); otherwise returns false.
// Tests whether the category looked up via qGetProp(ucs4) is one of the three
// Separator_* categories. The int result of the mask test converts to bool.
// NOTE(review): presumably the inline isSpace(uint) handles the special
// Other_Control code points before calling this helper - confirm in the header.
542 bool QT_FASTCALL QChar::isSpace_helper(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
544 if (ucs4 > LastValidCodePoint)
546 const int test = FLAG(Separator_Space) |
547 FLAG(Separator_Line) |
548 FLAG(Separator_Paragraph);
549 return FLAG(qGetProp(ucs4)->category) & test;
553 \fn bool QChar::isMark() const
555 Returns true if the character is a mark (Mark_* categories);
556 otherwise returns false.
558 See QChar::Category for more information regarding marks.
565 Returns true if the UCS-4-encoded character specified by \a ucs4 is
566 a mark (Mark_* categories); otherwise returns false.
// Overload taking a UCS-4 code point. Returns true when the category looked up
// via qGetProp(ucs4) is one of the three Mark_* categories in the mask.
568 bool QChar::isMark(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
570 if (ucs4 > LastValidCodePoint)
572 const int test = FLAG(Mark_NonSpacing) |
573 FLAG(Mark_SpacingCombining) |
574 FLAG(Mark_Enclosing);
575 return FLAG(qGetProp(ucs4)->category) & test;
579 \fn bool QChar::isPunct() const
581 Returns true if the character is a punctuation mark (Punctuation_*
582 categories); otherwise returns false.
589 Returns true if the UCS-4-encoded character specified by \a ucs4 is
590 a punctuation mark (Punctuation_* categories); otherwise returns false.
// Overload taking a UCS-4 code point. Returns true when the category looked up
// via qGetProp(ucs4) is one of the seven Punctuation_* categories in the mask.
592 bool QChar::isPunct(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
594 if (ucs4 > LastValidCodePoint)
596 const int test = FLAG(Punctuation_Connector) |
597 FLAG(Punctuation_Dash) |
598 FLAG(Punctuation_Open) |
599 FLAG(Punctuation_Close) |
600 FLAG(Punctuation_InitialQuote) |
601 FLAG(Punctuation_FinalQuote) |
602 FLAG(Punctuation_Other);
603 return FLAG(qGetProp(ucs4)->category) & test;
607 \fn bool QChar::isSymbol() const
609 Returns true if the character is a symbol (Symbol_* categories);
610 otherwise returns false.
617 Returns true if the UCS-4-encoded character specified by \a ucs4 is
618 a symbol (Symbol_* categories); otherwise returns false.
// Overload taking a UCS-4 code point. Returns true when the category looked up
// via qGetProp(ucs4) is one of the Symbol_* categories in the mask.
// The last term of the mask expression is on a line not visible in this listing.
620 bool QChar::isSymbol(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
622 if (ucs4 > LastValidCodePoint)
624 const int test = FLAG(Symbol_Math) |
625 FLAG(Symbol_Currency) |
626 FLAG(Symbol_Modifier) |
628 return FLAG(qGetProp(ucs4)->category) & test;
632 \fn bool QChar::isLetter() const
634 Returns true if the character is a letter (Letter_* categories);
635 otherwise returns false.
639 \fn bool QChar::isLetter(uint ucs4)
643 Returns true if the UCS-4-encoded character specified by \a ucs4 is
644 a letter (Letter_* categories); otherwise returns false.
// Tests whether the category looked up via qGetProp(ucs4) is one of the
// Letter_* categories in the mask. The last term of the mask expression is on
// a line not visible in this listing.
650 bool QT_FASTCALL QChar::isLetter_helper(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
652 if (ucs4 > LastValidCodePoint)
654 const int test = FLAG(Letter_Uppercase) |
655 FLAG(Letter_Lowercase) |
656 FLAG(Letter_Titlecase) |
657 FLAG(Letter_Modifier) |
659 return FLAG(qGetProp(ucs4)->category) & test;
663 \fn bool QChar::isNumber() const
665 Returns true if the character is a number (Number_* categories,
666 not just 0-9); otherwise returns false.
672 \fn bool QChar::isNumber(uint ucs4)
676 Returns true if the UCS-4-encoded character specified by \a ucs4 is
677 a number (Number_* categories, not just 0-9); otherwise returns false.
// Tests whether the category looked up via qGetProp(ucs4) is one of the
// Number_* categories in the mask. The last term of the mask expression is on
// a line not visible in this listing.
685 bool QT_FASTCALL QChar::isNumber_helper(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
687 if (ucs4 > LastValidCodePoint)
689 const int test = FLAG(Number_DecimalDigit) |
690 FLAG(Number_Letter) |
692 return FLAG(qGetProp(ucs4)->category) & test;
696 \fn bool QChar::isLetterOrNumber() const
698 Returns true if the character is a letter or number (Letter_* or
699 Number_* categories); otherwise returns false.
703 \fn bool QChar::isLetterOrNumber(uint ucs4)
707 Returns true if the UCS-4-encoded character specified by \a ucs4 is
708 a letter or number (Letter_* or Number_* categories); otherwise returns false.
// Tests whether the category looked up via qGetProp(ucs4) is in a combined
// mask of Letter_* and Number_* categories. Two terms of the mask expression
// (after Letter_Modifier and after Number_Letter) are on lines not visible in
// this listing.
714 bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
716 if (ucs4 > LastValidCodePoint)
718 const int test = FLAG(Letter_Uppercase) |
719 FLAG(Letter_Lowercase) |
720 FLAG(Letter_Titlecase) |
721 FLAG(Letter_Modifier) |
723 FLAG(Number_DecimalDigit) |
724 FLAG(Number_Letter) |
726 return FLAG(qGetProp(ucs4)->category) & test;
730 \fn bool QChar::isDigit() const
732 Returns true if the character is a decimal digit
733 (Number_DecimalDigit); otherwise returns false.
739 \fn bool QChar::isDigit(uint ucs4)
743 Returns true if the UCS-4-encoded character specified by \a ucs4 is
744 a decimal digit (Number_DecimalDigit); otherwise returns false.
750 \fn bool QChar::isNonCharacter() const
753 Returns true if the QChar is a non-character; false otherwise.
755 Unicode has a certain number of code points that are classified
756 as "non-characters:" that is, they can be used for internal purposes
757 in applications but cannot be used for text interchange.
758 Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
759 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
763 \fn bool QChar::isHighSurrogate() const
765 Returns true if the QChar is the high part of a UTF16 surrogate
766 (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise.
770 \fn bool QChar::isLowSurrogate() const
772 Returns true if the QChar is the low part of a UTF16 surrogate
773 (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise.
777 \fn bool QChar::isSurrogate() const
780 Returns true if the QChar contains a code point that is in either
781 the high or the low part of the UTF-16 surrogate range
782 (i.e. if its code point is in range [0xd800..0xdfff]); false otherwise.
786 \fn static bool QChar::isNonCharacter(uint ucs4)
790 Returns true if the UCS-4-encoded character specified by \a ucs4
791 is a non-character; false otherwise.
793 Unicode has a certain number of code points that are classified
794 as "non-characters:" that is, they can be used for internal purposes
795 in applications but cannot be used for text interchange.
796 Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
797 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
801 \fn static bool QChar::isHighSurrogate(uint ucs4)
804 Returns true if the UCS-4-encoded character specified by \a ucs4
805 is the high part of a UTF16 surrogate
806 (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise.
810 \fn static bool QChar::isLowSurrogate(uint ucs4)
813 Returns true if the UCS-4-encoded character specified by \a ucs4
814 is the low part of a UTF16 surrogate
815 (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise.
819 \fn static bool QChar::isSurrogate(uint ucs4)
823 Returns true if the UCS-4-encoded character specified by \a ucs4
824 contains a code point that is in either the high or the low part of the
825 UTF-16 surrogate range (i.e. if its code point is in range [0xd800..0xdfff]);
830 \fn static bool QChar::requiresSurrogates(uint ucs4)
832 Returns true if the UCS-4-encoded character specified by \a ucs4
833 can be split into the high and low parts of a UTF16 surrogate
834 (i.e. if its code point is greater than or equal to 0x10000);
839 \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
841 Converts a UTF16 surrogate pair with the given \a high and \a low values
842 to its UCS-4-encoded code point.
846 \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
849 Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
853 \fn static ushort QChar::highSurrogate(uint ucs4)
855 Returns the high surrogate part of a UCS-4-encoded code point.
856 The returned result is undefined if \a ucs4 is smaller than 0x10000.
860 \fn static ushort QChar::lowSurrogate(uint ucs4)
862 Returns the low surrogate part of a UCS-4-encoded code point.
863 The returned result is undefined if \a ucs4 is smaller than 0x10000.
867 \fn int QChar::digitValue() const
869 Returns the numeric value of the digit, or -1 if the character is not a digit.
874 Returns the numeric value of the digit specified by the UCS-4-encoded
875 character, \a ucs4, or -1 if the character is not a digit.
// Overload taking a UCS-4 code point. For code points that pass the guard,
// returns the digitValue field of the property record from qGetProp(ucs4).
877 int QChar::digitValue(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
879 if (ucs4 > LastValidCodePoint)
881 return qGetProp(ucs4)->digitValue;
885 \fn QChar::Category QChar::category() const
887 Returns the character's category.
892 Returns the category of the UCS-4-encoded character specified by \a ucs4.
// Overload taking a UCS-4 code point. Returns the category field of the
// property record from qGetProp(ucs4), cast to QChar::Category.
894 QChar::Category QChar::category(uint ucs4)
// Code points above LastValidCodePoint are reported as Other_NotAssigned
// without consulting the property tables.
896 if (ucs4 > LastValidCodePoint)
897 return QChar::Other_NotAssigned;
898 return (QChar::Category) qGetProp(ucs4)->category;
902 \fn QChar::Direction QChar::direction() const
904 Returns the character's direction.
909 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
// Overload taking a UCS-4 code point. For code points that pass the guard,
// returns the direction field of the property record, cast to QChar::Direction.
911 QChar::Direction QChar::direction(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
913 if (ucs4 > LastValidCodePoint)
915 return (QChar::Direction) qGetProp(ucs4)->direction;
919 \fn QChar::Joining QChar::joining() const
921 Returns information about the joining properties of the character
922 (needed for certain languages such as Arabic).
927 Returns information about the joining properties of the UCS-4-encoded
928 character specified by \a ucs4 (needed for certain languages such as Arabic).
// Overload taking a UCS-4 code point. Returns the joining field of the
// property record from qGetProp(ucs4), cast to QChar::Joining.
930 QChar::Joining QChar::joining(uint ucs4)
// Code points above LastValidCodePoint are reported as OtherJoining without
// consulting the property tables.
932 if (ucs4 > LastValidCodePoint)
933 return QChar::OtherJoining;
934 return (QChar::Joining) qGetProp(ucs4)->joining;
938 \fn bool QChar::hasMirrored() const
940 Returns true if the character should be reversed if the text
941 direction is reversed; otherwise returns false.
943 A bit faster equivalent of (ch.mirroredChar() != ch).
952 Returns true if the UCS-4-encoded character specified by \a ucs4
953 should be reversed if the text direction is reversed; otherwise returns false.
955 A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
// Overload taking a UCS-4 code point. For code points that pass the guard,
// returns true when the property record's mirrorDiff is non-zero, i.e. when
// mirroredChar(ucs4) would differ from ucs4.
959 bool QChar::hasMirrored(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
961 if (ucs4 > LastValidCodePoint)
963 return qGetProp(ucs4)->mirrorDiff != 0;
967 \fn bool QChar::isLower() const
969 Returns true if the character is a lowercase letter, i.e.
970 category() is Letter_Lowercase.
972 \sa isUpper(), toLower(), toUpper()
976 \fn bool QChar::isUpper() const
978 Returns true if the character is an uppercase letter, i.e.
979 category() is Letter_Uppercase.
981 \sa isLower(), toUpper(), toLower()
985 \fn bool QChar::isTitleCase() const
987 Returns true if the character is a titlecase letter, i.e.
988 category() is Letter_Titlecase.
990 \sa isLower(), toUpper(), toLower(), toTitleCase()
994 \fn QChar QChar::mirroredChar() const
996 Returns the mirrored character if this character is a mirrored
997 character; otherwise returns the character itself.
1004 Returns the mirrored character if the UCS-4-encoded character specified
1005 by \a ucs4 is a mirrored character; otherwise returns the character itself.
// Overload taking a UCS-4 code point. For code points that pass the guard,
// the mirrored code point is computed by adding the property record's
// mirrorDiff offset to ucs4 (a zero offset yields ucs4 itself).
1009 uint QChar::mirroredChar(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1011 if (ucs4 > LastValidCodePoint)
1013 return ucs4 + qGetProp(ucs4)->mirrorDiff;
1017 // constants for Hangul (de)composition, see UAX #15
// The enum's opening and closing lines and the Hangul_LCount, Hangul_VCount
// and Hangul_TCount enumerators are not visible in this listing.
// Hangul_SBase is the first code point of the syllable range tested in
// decompositionHelper() and decompositionTag(); the L/V/T bases are the
// values added to the computed indices when a syllable is decomposed.
1019 Hangul_SBase = 0xac00,
1020 Hangul_LBase = 0x1100,
1021 Hangul_VBase = 0x1161,
1022 Hangul_TBase = 0x11a7,
// Derived counts: NCount is VCount * TCount and SCount is LCount * NCount;
// SCount is the size of the syllable range starting at Hangul_SBase.
1026 Hangul_NCount = Hangul_VCount * Hangul_TCount,
1027 Hangul_SCount = Hangul_LCount * Hangul_NCount
// Looks up the decomposition of ucs4.
// Out-parameters: *length receives the number of 16-bit units and *tag the
// decomposition tag. The table path returns a pointer to the units; the return
// statements of the Hangul and "no decomposition" paths are not visible in
// this listing.
1030 // buffer has to have a length of 3. It's needed for Hangul decomposition
1031 static const unsigned short * QT_FASTCALL decompositionHelper
1032 (uint ucs4, int *length, int *tag, unsigned short *buffer)
// Hangul syllables are decomposed arithmetically into the caller's buffer
// instead of being looked up in the table.
1034 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1035 // compute Hangul syllable decomposition as per UAX #15
1036 const uint SIndex = ucs4 - Hangul_SBase;
1037 buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
1038 buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1039 buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
// A T value equal to Hangul_TBase (T index 0) is not counted in the length,
// so only the first two units are reported.
1040 *length = buffer[2] == Hangul_TBase ? 2 : 3;
1041 *tag = QChar::Canonical;
// Table path: an index of 0xffff marks "no decomposition".
1045 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1046 if (index == 0xffff) {
1048 *tag = QChar::NoDecomposition;
// Entry layout as read below: the first 16-bit word holds the tag in its low
// byte and the length in its high byte; the decomposition units follow it.
1052 const unsigned short *decomposition = uc_decomposition_map+index;
1053 *tag = (*decomposition) & 0xff;
1054 *length = (*decomposition) >> 8;
1055 return decomposition+1;
1059 Decomposes a character into its constituent parts. Returns an empty string
1060 if no decomposition exists.
// Member overload: forwards this character's code unit (ucs) to the
// decomposition(uint) overload and returns its result.
1062 QString QChar::decomposition() const
1064 return QChar::decomposition(ucs);
1069 Decomposes the UCS-4-encoded character specified by \a ucs4 into its
1070 constituent parts. Returns an empty string if no decomposition exists.
// Overload taking a UCS-4 code point. Obtains the decomposition units from
// decompositionHelper() and wraps them in a QString of `length` characters.
// The declarations of `length` and `tag` are on lines not visible in this listing.
1072 QString QChar::decomposition(uint ucs4)
// Scratch space for the Hangul path; decompositionHelper() requires 3 units.
1074 unsigned short buffer[3];
1077 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
// The 16-bit units are reinterpreted as QChar to build the string.
1078 return QString(reinterpret_cast<const QChar *>(d), length);
1082 \fn QChar::Decomposition QChar::decompositionTag() const
1084 Returns the tag defining the composition of the character. Returns
1085 QChar::NoDecomposition if no decomposition exists.
1090 Returns the tag defining the composition of the UCS-4-encoded character
1091 specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
// Overload taking a UCS-4 code point. Returns only the decomposition tag,
// without producing the decomposition itself.
1093 QChar::Decomposition QChar::decompositionTag(uint ucs4)
// Code points in the Hangul syllable range are always reported as Canonical.
1095 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1096 return QChar::Canonical;
// An index of 0xffff marks "no decomposition"; otherwise the tag is the low
// byte of the first word of the table entry.
1097 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1098 if (index == 0xffff)
1099 return QChar::NoDecomposition;
1100 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1104 \fn unsigned char QChar::combiningClass() const
1106 Returns the combining class for the character as defined in the
1107 Unicode standard. This is mainly useful as a positioning hint for
1108 marks attached to a base character.
1110 The Qt text rendering engine uses this information to correctly
1111 position non-spacing marks around a base character.
1116 Returns the combining class for the UCS-4-encoded character specified by
1117 \a ucs4, as defined in the Unicode standard.
// Overload taking a UCS-4 code point. For code points that pass the guard,
// returns the combiningClass field of the property record as unsigned char.
1119 unsigned char QChar::combiningClass(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1121 if (ucs4 > LastValidCodePoint)
1123 return (unsigned char) qGetProp(ucs4)->combiningClass;
1127 \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1129 Returns the Unicode version that introduced this character.
1134 Returns the Unicode version that introduced the character specified in
1135 its UCS-4-encoded form as \a ucs4.
// Overload taking a UCS-4 code point. Returns the unicodeVersion field of the
// property record from qGetProp(ucs4), cast to QChar::UnicodeVersion.
1137 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
// Code points above LastValidCodePoint are reported as Unicode_Unassigned
// without consulting the property tables.
1139 if (ucs4 > LastValidCodePoint)
1140 return QChar::Unicode_Unassigned;
1141 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1145 Returns the most recent supported Unicode version.
// Returns the UNICODE_DATA_VERSION constant.
// NOTE(review): presumably defined alongside the generated Unicode tables
// included at the top of this file - confirm.
1147 QChar::UnicodeVersion QChar::currentUnicodeVersion()
1149 return UNICODE_DATA_VERSION;
// Lowercase mapping for a code unit or code point of type T.
// The caller must pass a value that is valid for qGetProp(); no range check
// is done here.
1153 template <typename T>
1154 static inline T toLowerCase_helper(T uc)
1156 const QUnicodeTables::Properties *p = qGetProp(uc);
// Special mapping: lowerCaseDiff is an offset into specialCaseMap rather than
// a delta. Only entries whose first word is 1 are applied (the mapped unit is
// the next word); any other entry leaves uc unchanged.
// NOTE(review): the first word is presumably the mapping length - confirm
// against the table generator.
1157 if (p->lowerCaseSpecial) {
1158 const ushort *specialCase = specialCaseMap + p->lowerCaseDiff;
1159 return (*specialCase == 1) ? specialCase[1] : uc;
// Regular mapping: lowerCaseDiff is a delta added to uc.
1161 return uc + p->lowerCaseDiff;
// Uppercase mapping for a code unit or code point of type T.
// The caller must pass a value that is valid for qGetProp(); no range check
// is done here.
1164 template <typename T>
1165 static inline T toUpperCase_helper(T uc)
1167 const QUnicodeTables::Properties *p = qGetProp(uc);
// Special mapping: upperCaseDiff is an offset into specialCaseMap rather than
// a delta. Only entries whose first word is 1 are applied (the mapped unit is
// the next word); any other entry leaves uc unchanged.
// NOTE(review): the first word is presumably the mapping length - confirm
// against the table generator.
1168 if (p->upperCaseSpecial) {
1169 const ushort *specialCase = specialCaseMap + p->upperCaseDiff;
1170 return (*specialCase == 1) ? specialCase[1] : uc;
// Regular mapping: upperCaseDiff is a delta added to uc.
1172 return uc + p->upperCaseDiff;
// Titlecase mapping for a code unit or code point of type T.
// The caller must pass a value that is valid for qGetProp(); no range check
// is done here.
1175 template <typename T>
1176 static inline T toTitleCase_helper(T uc)
1178 const QUnicodeTables::Properties *p = qGetProp(uc);
// Special mapping: titleCaseDiff is an offset into specialCaseMap rather than
// a delta. Only entries whose first word is 1 are applied (the mapped unit is
// the next word); any other entry leaves uc unchanged.
// NOTE(review): the first word is presumably the mapping length - confirm
// against the table generator.
1179 if (p->titleCaseSpecial) {
1180 const ushort *specialCase = specialCaseMap + p->titleCaseDiff;
1181 return (*specialCase == 1) ? specialCase[1] : uc;
// Regular mapping: titleCaseDiff is a delta added to uc.
1183 return uc + p->titleCaseDiff;
// Case-folding mapping for a code unit or code point of type T.
// The caller must pass a value that is valid for qGetProp(); no range check
// is done here.
1186 template <typename T>
1187 static inline T toCaseFolded_helper(T uc)
1189 const QUnicodeTables::Properties *p = qGetProp(uc);
// Special mapping: caseFoldDiff is an offset into specialCaseMap rather than
// a delta. Only entries whose first word is 1 are applied (the mapped unit is
// the next word); any other entry leaves uc unchanged.
// NOTE(review): the first word is presumably the mapping length - confirm
// against the table generator.
1190 if (p->caseFoldSpecial) {
1191 const ushort *specialCase = specialCaseMap + p->caseFoldDiff;
1192 return (*specialCase == 1) ? specialCase[1] : uc;
// Regular mapping: caseFoldDiff is a delta added to uc.
1194 return uc + p->caseFoldDiff;
1198 \fn QChar QChar::toLower() const
1200 Returns the lowercase equivalent if the character is uppercase or titlecase;
1201 otherwise returns the character itself.
1206 Returns the lowercase equivalent of the UCS-4-encoded character specified
1207 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1208 the character itself.
// Overload taking a UCS-4 code point. Code points that pass the guard are
// mapped by toLowerCase_helper<uint>().
1210 uint QChar::toLower(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1212 if (ucs4 > LastValidCodePoint)
1214 return toLowerCase_helper<uint>(ucs4);
1218 \fn QChar QChar::toUpper() const
1220 Returns the uppercase equivalent if the character is lowercase or titlecase;
1221 otherwise returns the character itself.
1226 Returns the uppercase equivalent of the UCS-4-encoded character specified
1227 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1228 the character itself.
// Overload taking a UCS-4 code point. Code points that pass the guard are
// mapped by toUpperCase_helper<uint>().
1230 uint QChar::toUpper(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1232 if (ucs4 > LastValidCodePoint)
1234 return toUpperCase_helper<uint>(ucs4);
1238 \fn QChar QChar::toTitleCase() const
1240 Returns the title case equivalent if the character is lowercase or uppercase;
1241 otherwise returns the character itself.
1246 Returns the title case equivalent of the UCS-4-encoded character specified
1247 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1248 the character itself.
// Overload taking a UCS-4 code point. Code points that pass the guard are
// mapped by toTitleCase_helper<uint>().
1250 uint QChar::toTitleCase(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1252 if (ucs4 > LastValidCodePoint)
1254 return toTitleCase_helper<uint>(ucs4);
// Case-folds the character at position ch of a UTF-16 buffer that begins at
// start. The declaration of the local `c` is on a line not visible in this
// listing.
1257 static inline uint foldCase(const ushort *ch, const ushort *start)
// If c is a low surrogate and the preceding unit (only read when ch > start)
// is a high surrogate, the pair is combined into one UCS-4 code point so that
// the folding applies to the full character.
1260 if (QChar(c).isLowSurrogate() && ch > start && QChar(*(ch - 1)).isHighSurrogate())
1261 c = QChar::surrogateToUcs4(*(ch - 1), c);
1262 return toCaseFolded_helper<uint>(c);
// Case-folds ch, using `last` as the previously seen unit to detect surrogate
// pairs. The declaration of the local `c` and one further statement (before
// the return) are on lines not visible in this listing.
1265 static inline uint foldCase(uint ch, uint &last)
// A low surrogate following a high surrogate in `last` is combined into one
// UCS-4 code point before folding.
1268 if (QChar(c).isLowSurrogate() && QChar(last).isHighSurrogate())
1269 c = QChar::surrogateToUcs4(last, c);
1271 return toCaseFolded_helper<uint>(c);
// Case-folds a single 16-bit code unit via toCaseFolded_helper<ushort>();
// no surrogate-pair handling is done in this overload.
1274 static inline ushort foldCase(ushort ch)
1276 return toCaseFolded_helper<ushort>(ch);
1280 \fn QChar QChar::toCaseFolded() const
1282 Returns the case folded equivalent of the character.
1283 For most Unicode characters this is the same as toLower().
1288 Returns the case folded equivalent of the UCS-4-encoded character specified
1289 by \a ucs4. For most Unicode characters this is the same as toLower().
// Overload taking a UCS-4 code point. Code points that pass the guard are
// mapped by toCaseFolded_helper<uint>().
1291 uint QChar::toCaseFolded(uint ucs4)
// Guard for code points above LastValidCodePoint; the statement it controls
// is not visible in this listing.
1293 if (ucs4 > LastValidCodePoint)
1295 return toCaseFolded_helper<uint>(ucs4);
1299 \fn char QChar::toLatin1() const
1301 Returns the Latin-1 character equivalent to the QChar, or 0. This
1302 is mainly useful for non-internationalized software.
1304 \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1305 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1311 \fn QChar QChar::fromLatin1(char c)
1313 Converts the Latin-1 character \a c to its equivalent QChar. This
1314 is mainly useful for non-internationalized software.
1316 An alternative is to use QLatin1Char.
1318 \sa toLatin1(), unicode()
1322 \fn char QChar::toAscii() const
1325 Returns the Latin-1 character value of the QChar, or 0 if the character is not
1328 The main purpose of this function is to preserve ASCII characters used
1329 in C strings. This is mainly useful for developers of non-internationalized software.
1332 \note It is not possible to distinguish a non-Latin-1 character from an ASCII 0
1333 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1335 \note This function does not check whether the character value is inside
1336 the valid range of US-ASCII.
1338 \sa toLatin1(), unicode()
1342 \fn QChar QChar::fromAscii(char)
1345 Converts the ASCII character \a c to its equivalent QChar. This
1346 is mainly useful for non-internationalized software.
1348 An alternative is to use QLatin1Char.
1350 \sa fromLatin1(), unicode()
1353 #ifndef QT_NO_DATASTREAM
1357 Writes the char \a chr to the stream \a out.
1359 \sa {Serializing Qt Data Types}
1361 QDataStream &operator<<(QDataStream &out, QChar chr)
1363 out << quint16(chr.unicode());
1370 Reads a char from the stream \a in into char \a chr.
1372 \sa {Serializing Qt Data Types}
1374 QDataStream &operator>>(QDataStream &in, QChar &chr)
1378 chr.unicode() = ushort(u);
1381 #endif // QT_NO_DATASTREAM
1384 \fn ushort & QChar::unicode()
1386 Returns a reference to the numeric Unicode value of the QChar.
1390 \fn ushort QChar::unicode() const
1392 Returns the numeric Unicode value of the QChar.
1395 /*****************************************************************************
1396 Documentation of QChar related functions
1397 *****************************************************************************/
1400 \fn bool operator==(QChar c1, QChar c2)
1404 Returns true if \a c1 and \a c2 are the same Unicode character;
1405 otherwise returns false.
1409 \fn bool operator!=(QChar c1, QChar c2)
1413 Returns true if \a c1 and \a c2 are not the same Unicode
1414 character; otherwise returns false.
1418 \fn bool operator<=(QChar c1, QChar c2)
1422 Returns true if the numeric Unicode value of \a c1 is less than
1423 or equal to that of \a c2; otherwise returns false.
1427 \fn bool operator>=(QChar c1, QChar c2)
1431 Returns true if the numeric Unicode value of \a c1 is greater than
1432 or equal to that of \a c2; otherwise returns false.
1436 \fn bool operator<(QChar c1, QChar c2)
1440 Returns true if the numeric Unicode value of \a c1 is less than
1441 that of \a c2; otherwise returns false.
1445 \fn bool operator>(QChar c1, QChar c2)
1449 Returns true if the numeric Unicode value of \a c1 is greater than
1450 that of \a c2; otherwise returns false.
1454 // ---------------------------------------------------------------------------
// Replaces characters of the string with their decompositions, in place.
// The scan runs backwards over the UTF-16 buffer, from the end of the string
// down to index `from`. When `canonical` is set, only decompositions tagged
// QChar::Canonical are considered.
// NOTE(review): several statements of this function (the declarations of
// `length`, `tag` and `s`, and the statements guarded by the `if`s below) are
// not visible in this excerpt; `s` is presumably the string behind `str`.
1457 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
// Scratch storage handed to decompositionHelper().
1461 unsigned short buffer[3];
// Raw UTF-16 view of the string; `uc` starts one past the last code unit.
1465 const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1466 const unsigned short *uc = utf16 + s.length();
1467 while (uc != utf16 + from) {
// Step back one code unit and read it.
1468 uint ucs4 = *(--uc);
// A low surrogate preceded (inside the buffer) by a high surrogate is
// combined with it into a single UCS-4 code point.
1469 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1470 ushort high = *(uc - 1);
1471 if (QChar(high).isHighSurrogate()) {
1473 ucs4 = QChar::surrogateToUcs4(high, ucs4);
// Test for characters introduced after the requested Unicode version.
1477 if (QChar::unicodeVersion(ucs4) > version)
// Look up the decomposition; `length` and `tag` are filled in by the helper.
1480 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
// Test for "no decomposition" or "not canonical while canonical was requested".
1481 if (!d || (canonical && tag != QChar::Canonical))
// Remember the position as an index: the pointers do not survive replace().
1484 int pos = uc - utf16;
// Swap the one or two code units of the character for its decomposition.
1485 s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1486 // since the replace invalidates the pointers and we do decomposition recursive
1487 utf16 = reinterpret_cast<unsigned short *>(s.data());
// Resume just past the inserted text so that it is scanned (and decomposed) too.
1488 uc = utf16 + pos + length;
1498 inline bool operator<(ushort u1, const UCS2Pair &ligature)
1499 { return u1 < ligature.u1; }
1500 inline bool operator<(const UCS2Pair &ligature, ushort u1)
1501 { return ligature.u1 < u1; }
// Ligature table entry used when the first character needs a surrogate pair.
// NOTE(review): the member declarations are not visible in this excerpt; the
// code below reads members p1 and p2, each exposing u1/u2 code units that are
// combined with QChar::surrogateToUcs4().
1503 struct UCS2SurrogatePair {
1508 inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
1509 { return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); }
1510 inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
1511 { return QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; }
1513 static uint inline ligatureHelper(uint u1, uint u2)
1515 if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) {
1516 // compute Hangul syllable composition as per UAX #15
1518 const uint LIndex = u1 - Hangul_LBase;
1519 if (LIndex < Hangul_LCount) {
1520 const uint VIndex = u2 - Hangul_VBase;
1521 if (VIndex < Hangul_VCount)
1522 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1525 const uint SIndex = u1 - Hangul_SBase;
1526 if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1527 const uint TIndex = u2 - Hangul_TBase;
1528 if (TIndex <= Hangul_TCount)
1533 const unsigned short index = GET_LIGATURE_INDEX(u2);
1534 if (index == 0xffff)
1536 const unsigned short *ligatures = uc_ligature_map+index;
1537 ushort length = *ligatures++;
1538 if (QChar::requiresSurrogates(u1)) {
1539 const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1540 const UCS2SurrogatePair *r = qBinaryFind(data, data + length, u1);
1541 if (r != data + length)
1542 return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2);
1544 const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1545 const UCS2Pair *r = qBinaryFind(data, data + length, ushort(u1));
1546 if (r != data + length)
// Composes characters of the string in place, starting at index `from`:
// a character is merged into the most recent starter whenever
// ligatureHelper() yields a composition for the pair.
// NOTE(review): several statements of this function (the declarations of `s`,
// `pos`, `i` and `next`, the removal of the consumed character, and the
// bodies of some branches) are not visible in this excerpt.
1553 static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
// Guard: an invalid start index, or fewer than two code units left to look at.
1557 if (from < 0 || s.length() - from < 2)
1560 int starter = 0; // starter position
1561 uint stcode = 0; // starter code point
// Combining class of the previously examined character.
1563 int lastCombining = 0;
1566 while (pos < s.length()) {
// Read the code unit at pos; a high surrogate followed by a low surrogate
// is combined into one UCS-4 code point.
1568 uint uc = s.at(pos).unicode();
1569 if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1570 ushort low = s.at(pos+1).unicode();
1571 if (QChar(low).isLowSurrogate()) {
1572 uc = QChar::surrogateToUcs4(uc, low);
// Characters newer than the requested Unicode version reset the composition
// state so that neither test in the condition further down can succeed.
1577 const QUnicodeTables::Properties *p = qGetProp(uc);
1578 if (p->unicodeVersion > version) {
1580 next = -1; // to prevent i == next
1581 lastCombining = 255; // to prevent combining > lastCombining
// Composition with the starter is attempted when the character sits at
// index `next`, or when its combining class is higher than that of the
// previous character.
1586 int combining = p->combiningClass;
1587 if (i == next || combining > lastCombining) {
1588 Q_ASSERT(starter >= from);
1589 // allowed to form ligature with S
1590 uint ligature = ligatureHelper(stcode, uc);
// Write the composed character over the starter, as a surrogate pair when
// it lies outside the BMP.
1593 QChar *d = s.data();
1594 // ligatureHelper() never changes planes
1595 if (QChar::requiresSurrogates(ligature)) {
1596 d[starter] = QChar::highSurrogate(ligature);
1597 d[starter + 1] = QChar::lowSurrogate(ligature);
1600 d[starter] = ligature;
// Combining class 0 (presumably: this character becomes the new starter).
1606 if (combining == 0) {
// Remember the class for the comparison on the next iteration.
1611 lastCombining = combining;
1618 static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
1621 const int l = s.length()-1;
1629 u1 = s.at(pos).unicode();
1630 if (QChar(u1).isHighSurrogate()) {
1631 ushort low = s.at(p2).unicode();
1632 if (QChar(low).isLowSurrogate()) {
1633 u1 = QChar::surrogateToUcs4(u1, low);
1642 u2 = s.at(p2).unicode();
1643 if (QChar(u2).isHighSurrogate() && p2 < l) {
1644 ushort low = s.at(p2+1).unicode();
1645 if (QChar(low).isLowSurrogate()) {
1646 u2 = QChar::surrogateToUcs4(u2, low);
1653 const QUnicodeTables::Properties *p = qGetProp(u2);
1654 if (p->unicodeVersion <= version)
1655 c2 = p->combiningClass;
1663 const QUnicodeTables::Properties *p = qGetProp(u1);
1664 if (p->unicodeVersion <= version)
1665 c1 = p->combiningClass;
1669 QChar *uc = s.data();
1671 // exchange characters
1672 if (!QChar::requiresSurrogates(u2)) {
1675 uc[p++] = QChar::highSurrogate(u2);
1676 uc[p++] = QChar::lowSurrogate(u2);
1678 if (!QChar::requiresSurrogates(u1)) {
1681 uc[p++] = QChar::highSurrogate(u1);
1682 uc[p++] = QChar::lowSurrogate(u1);
1686 if (pos > 0 && s.at(pos).isLowSurrogate())
1690 if (QChar::requiresSurrogates(u1))
1696 if (QChar::requiresSurrogates(u1))