src/corelib/tools/qchar.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 // Don't define it while compiling this module, or USERS of Qt will
  43 // not be able to link.
  44 #ifdef QT_NO_CAST_FROM_ASCII
  45 #  undef QT_NO_CAST_FROM_ASCII
  46 #endif
  47 #ifdef QT_NO_CAST_TO_ASCII
  48 #  undef QT_NO_CAST_TO_ASCII
  49 #endif
  50 #include "qchar.h"
  51
  52 #include "qdatastream.h"
  53 #include "qtextcodec.h"
  54
  55 #include "qunicodetables_p.h"
  56 #include "qunicodetables.cpp"
  57
  58 QT_BEGIN_NAMESPACE
  59
  60 #ifndef QT_NO_CODEC_FOR_C_STRINGS
  61 #  ifdef QT_NO_TEXTCODEC
  62 #    define QT_NO_CODEC_FOR_C_STRINGS
  63 #  endif
  64 #endif
  65
  66 #define FLAG(x) (1 << (x))
  67
  68 /*!
  69     \class QLatin1Char
  70     \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
  71
  72     \ingroup string-processing
  73
  74     This class is only useful to avoid the codec for C strings business
  75     in the QChar(ch) constructor. You can avoid it by writing QChar(ch, 0).
  76
  77     \sa QChar, QLatin1String, QString
  78 */
  79
  80 /*!
  81     \fn const char QLatin1Char::toLatin1() const
  82
  83     Converts a Latin-1 character to an 8-bit ASCII representation of the character.
  84 */
  85
  86 /*!
  87     \fn const ushort QLatin1Char::unicode() const
  88
   89     Converts a Latin-1 character to a 16-bit-encoded Unicode representation
  90     of the character.
  91 */
  92
  93 /*!
  94     \fn QLatin1Char::QLatin1Char(char c)
  95
  96     Constructs a Latin-1 character for \a c. This constructor should be
  97     used when the encoding of the input character is known to be Latin-1.
  98 */
  99
 100 /*!
 101     \class QChar
 102     \brief The QChar class provides a 16-bit Unicode character.
 103
 104     \ingroup string-processing
 105     \reentrant
 106
 107     In Qt, Unicode characters are 16-bit entities without any markup
 108     or structure. This class represents such an entity. It is
 109     lightweight, so it can be used everywhere. Most compilers treat
  110     it like an \c{unsigned short}.
 111
 112     QChar provides a full complement of testing/classification
 113     functions, converting to and from other formats, converting from
 114     composed to decomposed Unicode, and trying to compare and
 115     case-convert if you ask it to.
 116
 117     The classification functions include functions like those in the
 118     standard C++ header \<cctype\> (formerly \<ctype.h\>), but
 119     operating on the full range of Unicode characters. They all
 120     return true if the character is a certain type of character;
 121     otherwise they return false. These classification functions are
 122     isNull() (returns true if the character is '\\0'), isPrint()
 123     (true if the character is any sort of printable character,
  124     including whitespace), isPunct() (any sort of punctuation),
 125     isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
 126     sort of numeric character, not just 0-9), isLetterOrNumber(), and
 127     isDigit() (decimal digits). All of these are wrappers around
  128     category(), which returns the Unicode-defined category of each
 129     character.
 130
 131     QChar also provides direction(), which indicates the "natural"
 132     writing direction of this character. The joining() function
  133     indicates how the character joins with its neighbors (needed
 134     mostly for Arabic) and finally hasMirrored(), which indicates
 135     whether the character needs to be mirrored when it is printed in
  136     its "unnatural" writing direction.
 137
 138     Composed Unicode characters (like \a ring) can be converted to
 139     decomposed Unicode ("a" followed by "ring above") by using decomposition().
 140
 141     In Unicode, comparison is not necessarily possible and case
 142     conversion is very difficult at best. Unicode, covering the
 143     "entire" world, also includes most of the world's case and
 144     sorting problems. operator==() and friends will do comparison
 145     based purely on the numeric Unicode value (code point) of the
 146     characters, and toUpper() and toLower() will do case changes when
 147     the character has a well-defined uppercase/lowercase equivalent.
 148     For locale-dependent comparisons, use QString::localeAwareCompare().
 149
 150     The conversion functions include unicode() (to a scalar),
 151     toLatin1() (to scalar, but converts all non-Latin-1 characters to
 152     0), row() (gives the Unicode row), cell() (gives the Unicode
 153     cell), digitValue() (gives the integer value of any of the
 154     numerous digit characters), and a host of constructors.
 155
 156     QChar provides constructors and cast operators that make it easy
 157     to convert to and from traditional 8-bit \c{char}s. If you
 158     defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
 159     explained in the QString documentation, you will need to
 160     explicitly call fromAscii() or fromLatin1(), or use QLatin1Char,
 161     to construct a QChar from an 8-bit \c char, and you will need to
 162     call toAscii() or toLatin1() to get the 8-bit value back.
 163
 164     \sa Unicode, QString, QLatin1Char
 165 */
 166
 167 /*!
 168     \enum QChar::UnicodeVersion
 169
 170     Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
 171     introduced a certain character.
 172
 173     \value Unicode_1_1  Version 1.1
 174     \value Unicode_2_0  Version 2.0
 175     \value Unicode_2_1_2  Version 2.1.2
 176     \value Unicode_3_0  Version 3.0
 177     \value Unicode_3_1  Version 3.1
 178     \value Unicode_3_2  Version 3.2
 179     \value Unicode_4_0  Version 4.0
 180     \value Unicode_4_1  Version 4.1
 181     \value Unicode_5_0  Version 5.0
 182     \value Unicode_Unassigned  The value is not assigned to any character
 183         in version 5.0 of Unicode.
 184
 185     \sa unicodeVersion(), currentUnicodeVersion()
 186 */
 187
 188 /*!
 189     \enum QChar::Category
 190
 191     This enum maps the Unicode character categories.
 192
  193     The following categories are normative in Unicode:
 194
 195     \value Mark_NonSpacing  Unicode class name Mn
 196
 197     \value Mark_SpacingCombining  Unicode class name Mc
 198
 199     \value Mark_Enclosing  Unicode class name Me
 200
 201     \value Number_DecimalDigit  Unicode class name Nd
 202
 203     \value Number_Letter  Unicode class name Nl
 204
 205     \value Number_Other  Unicode class name No
 206
 207     \value Separator_Space  Unicode class name Zs
 208
 209     \value Separator_Line  Unicode class name Zl
 210
 211     \value Separator_Paragraph  Unicode class name Zp
 212
 213     \value Other_Control  Unicode class name Cc
 214
 215     \value Other_Format  Unicode class name Cf
 216
 217     \value Other_Surrogate  Unicode class name Cs
 218
 219     \value Other_PrivateUse  Unicode class name Co
 220
 221     \value Other_NotAssigned  Unicode class name Cn
 222
 223
 224     The following categories are informative in Unicode:
 225
 226     \value Letter_Uppercase  Unicode class name Lu
 227
 228     \value Letter_Lowercase  Unicode class name Ll
 229
 230     \value Letter_Titlecase  Unicode class name Lt
 231
 232     \value Letter_Modifier  Unicode class name Lm
 233
 234     \value Letter_Other Unicode class name Lo
 235
 236     \value Punctuation_Connector  Unicode class name Pc
 237
 238     \value Punctuation_Dash  Unicode class name Pd
 239
 240     \value Punctuation_Open  Unicode class name Ps
 241
 242     \value Punctuation_Close  Unicode class name Pe
 243
 244     \value Punctuation_InitialQuote  Unicode class name Pi
 245
 246     \value Punctuation_FinalQuote  Unicode class name Pf
 247
 248     \value Punctuation_Other  Unicode class name Po
 249
 250     \value Symbol_Math  Unicode class name Sm
 251
 252     \value Symbol_Currency  Unicode class name Sc
 253
 254     \value Symbol_Modifier  Unicode class name Sk
 255
 256     \value Symbol_Other  Unicode class name So
 257
 258     \sa category()
 259 */
 260
 261 /*!
 262     \enum QChar::Direction
 263
 264     This enum type defines the Unicode direction attributes. See the
 265     \l{http://www.unicode.org/}{Unicode Standard} for a description
 266     of the values.
 267
 268     In order to conform to C/C++ naming conventions "Dir" is prepended
 269     to the codes used in the Unicode Standard.
 270
 271     \value DirAL
 272     \value DirAN
 273     \value DirB
 274     \value DirBN
 275     \value DirCS
 276     \value DirEN
 277     \value DirES
 278     \value DirET
 279     \value DirL
 280     \value DirLRE
 281     \value DirLRO
 282     \value DirNSM
 283     \value DirON
 284     \value DirPDF
 285     \value DirR
 286     \value DirRLE
 287     \value DirRLO
 288     \value DirS
 289     \value DirWS
 290
 291     \sa direction()
 292 */
 293
 294 /*!
 295     \enum QChar::Decomposition
 296
 297     This enum type defines the Unicode decomposition attributes. See
 298     the \l{http://www.unicode.org/}{Unicode Standard} for a
 299     description of the values.
 300
 301     \value NoDecomposition
 302     \value Canonical
 303     \value Circle
 304     \value Compat
 305     \value Final
 306     \value Font
 307     \value Fraction
 308     \value Initial
 309     \value Isolated
 310     \value Medial
 311     \value Narrow
 312     \value NoBreak
 313     \value Small
 314     \value Square
 315     \value Sub
 316     \value Super
 317     \value Vertical
 318     \value Wide
 319
 320     \sa decomposition()
 321 */
 322
 323 /*!
 324     \enum QChar::Joining
 325
 326     This enum type defines the Unicode joining attributes. See the
 327     \l{http://www.unicode.org/}{Unicode Standard} for a description
 328     of the values.
 329
 330     \value Center
 331     \value Dual
 332     \value OtherJoining
 333     \value Right
 334
 335     \sa joining()
 336 */
 337
 338 /*!
 339     \enum QChar::CombiningClass
 340
 341     \internal
 342
 343     This enum type defines names for some of the Unicode combining
 344     classes. See the \l{http://www.unicode.org/}{Unicode Standard}
 345     for a description of the values.
 346
 347     \value Combining_Above
 348     \value Combining_AboveAttached
 349     \value Combining_AboveLeft
 350     \value Combining_AboveLeftAttached
 351     \value Combining_AboveRight
 352     \value Combining_AboveRightAttached
 353     \value Combining_Below
 354     \value Combining_BelowAttached
 355     \value Combining_BelowLeft
 356     \value Combining_BelowLeftAttached
 357     \value Combining_BelowRight
 358     \value Combining_BelowRightAttached
 359     \value Combining_DoubleAbove
 360     \value Combining_DoubleBelow
 361     \value Combining_IotaSubscript
 362     \value Combining_Left
 363     \value Combining_LeftAttached
 364     \value Combining_Right
 365     \value Combining_RightAttached
 366 */
 367
 368 /*!
 369     \enum QChar::SpecialCharacter
 370
 371     \value Null A QChar with this value isNull().
 372     \value Nbsp Non-breaking space.
 373     \value ReplacementCharacter The character shown when a font has no glyph
 374            for a certain codepoint. A special question mark character is often
 375            used. Codecs use this codepoint when input data cannot be
 376            represented in Unicode.
 377     \value ObjectReplacementCharacter Used to represent an object such as an
 378            image when such objects cannot be presented.
 379     \value ByteOrderMark
 380     \value ByteOrderSwapped
 381     \value ParagraphSeparator
 382     \value LineSeparator
 383 */
 384
 385 /*!
 386     \fn void QChar::setCell(uchar cell)
 387     \internal
 388 */
 389
 390 /*!
 391     \fn void QChar::setRow(uchar row)
 392     \internal
 393 */
 394
 395 /*!
 396     \fn QChar::QChar()
 397
 398     Constructs a null QChar ('\\0').
 399
 400     \sa isNull()
 401 */
 402
 403 /*!
 404     \fn QChar::QChar(QLatin1Char ch)
 405
 406     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 407 */
 408
 409 /*!
 410     \fn QChar::QChar(SpecialCharacter ch)
 411
 412     Constructs a QChar for the predefined character value \a ch.
 413 */
 414
 415 /*!
 416     \fn QChar::QChar(char ch)
 417
 418     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 419 */
 420
 421 /*!
 422     \fn QChar::QChar(uchar ch)
 423
 424     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 425 */
 426
 427 /*!
 428     \fn QChar::QChar(uchar cell, uchar row)
 429
 430     Constructs a QChar for Unicode cell \a cell in row \a row.
 431
 432     \sa cell(), row()
 433 */
 434
 435 /*!
 436     \fn QChar::QChar(ushort code)
 437
 438     Constructs a QChar for the character with Unicode code point \a code.
 439 */
 440
 441 /*!
 442     \fn QChar::QChar(short code)
 443
 444     Constructs a QChar for the character with Unicode code point \a code.
 445 */
 446
 447 /*!
 448     \fn QChar::QChar(uint code)
 449
 450     Constructs a QChar for the character with Unicode code point \a code.
 451 */
 452
 453 /*!
 454     \fn QChar::QChar(int code)
 455
 456     Constructs a QChar for the character with Unicode code point \a code.
 457 */
 458
 459 /*!
 460     \fn bool QChar::isNull() const
 461
 462     Returns true if the character is the Unicode character 0x0000
 463     ('\\0'); otherwise returns false.
 464 */
 465
 466 /*!
 467     \fn uchar QChar::cell() const
 468
 469     Returns the cell (least significant byte) of the Unicode character.
 470
 471     \sa row()
 472 */
 473
 474 /*!
 475     \fn uchar QChar::row() const
 476
 477     Returns the row (most significant byte) of the Unicode character.
 478
 479     \sa cell()
 480 */
 481
 482 /*!
 483     Returns true if the character is a printable character; otherwise
 484     returns false. This is any character not of category Cc or Cn.
 485
 486     Note that this gives no indication of whether the character is
 487     available in a particular font.
 488 */
 489 bool QChar::isPrint() const
 490 {
 491     const int test = FLAG(Other_Control) |
 492                      FLAG(Other_NotAssigned);
 493     return !(FLAG(qGetProp(ucs)->category) & test);
 494 }
 495
 496 /*!
 497     \fn bool QChar::isSpace() const
 498
 499     Returns true if the character is a separator character
 500     (Separator_* categories or certain code points from Other_Control category);
 501     otherwise returns false.
 502 */
 503
 504 /*!
 505     \internal
 506     \overload
 507 */
 508 bool QChar::isSpace(ushort ucs2)
 509 {
 510     const int test = FLAG(Separator_Space) |
 511                      FLAG(Separator_Line) |
 512                      FLAG(Separator_Paragraph);
 513     return FLAG(qGetProp(ucs2)->category) & test;
 514 }
 515
 516 /*!
 517     Returns true if the character is a mark (Mark_* categories);
 518     otherwise returns false.
 519
 520     See QChar::Category for more information regarding marks.
 521 */
 522 bool QChar::isMark() const
 523 {
 524     const int test = FLAG(Mark_NonSpacing) |
 525                      FLAG(Mark_SpacingCombining) |
 526                      FLAG(Mark_Enclosing);
 527     return FLAG(qGetProp(ucs)->category) & test;
 528 }
 529
 530 /*!
 531     Returns true if the character is a punctuation mark (Punctuation_*
 532     categories); otherwise returns false.
 533 */
 534 bool QChar::isPunct() const
 535 {
 536     const int test = FLAG(Punctuation_Connector) |
 537                      FLAG(Punctuation_Dash) |
 538                      FLAG(Punctuation_Open) |
 539                      FLAG(Punctuation_Close) |
 540                      FLAG(Punctuation_InitialQuote) |
 541                      FLAG(Punctuation_FinalQuote) |
 542                      FLAG(Punctuation_Other);
 543     return FLAG(qGetProp(ucs)->category) & test;
 544 }
 545
 546 /*!
 547     \fn bool QChar::isLetter() const
 548
 549     Returns true if the character is a letter (Letter_* categories);
 550     otherwise returns false.
 551 */
 552
 553 /*!
 554     \internal
 555     \overload
 556 */
 557 bool QChar::isLetter(ushort ucs2)
 558 {
 559     const int test = FLAG(Letter_Uppercase) |
 560                      FLAG(Letter_Lowercase) |
 561                      FLAG(Letter_Titlecase) |
 562                      FLAG(Letter_Modifier) |
 563                      FLAG(Letter_Other);
 564     return FLAG(qGetProp(ucs2)->category) & test;
 565 }
 566
 567 /*!
 568     Returns true if the character is a number (Number_* categories,
 569     not just 0-9); otherwise returns false.
 570
 571     \sa isDigit()
 572 */
 573 bool QChar::isNumber() const
 574 {
 575     const int test = FLAG(Number_DecimalDigit) |
 576                      FLAG(Number_Letter) |
 577                      FLAG(Number_Other);
 578     return FLAG(qGetProp(ucs)->category) & test;
 579 }
 580
 581 /*!
 582     \fn bool QChar::isLetterOrNumber() const
 583
 584     Returns true if the character is a letter or number (Letter_* or
 585     Number_* categories); otherwise returns false.
 586 */
 587
 588 /*!
 589     \internal
 590     \overload
 591 */
 592 bool QChar::isLetterOrNumber(ushort ucs2)
 593 {
 594     const int test = FLAG(Letter_Uppercase) |
 595                      FLAG(Letter_Lowercase) |
 596                      FLAG(Letter_Titlecase) |
 597                      FLAG(Letter_Modifier) |
 598                      FLAG(Letter_Other) |
 599                      FLAG(Number_DecimalDigit) |
 600                      FLAG(Number_Letter) |
 601                      FLAG(Number_Other);
 602     return FLAG(qGetProp(ucs2)->category) & test;
 603 }
 604
 605 /*!
 606     \fn bool QChar::isDigit() const
 607
 608     Returns true if the character is a decimal digit
 609     (Number_DecimalDigit); otherwise returns false.
 610 */
 611
 612 /*!
 613     \internal
 614     \overload
 615 */
 616 bool QChar::isDigit(ushort ucs2)
 617 {
 618     return (qGetProp(ucs2)->category == Number_DecimalDigit);
 619 }
 620
 621 /*!
 622     Returns true if the character is a symbol (Symbol_* categories);
 623     otherwise returns false.
 624 */
 625 bool QChar::isSymbol() const
 626 {
 627     const int test = FLAG(Symbol_Math) |
 628                      FLAG(Symbol_Currency) |
 629                      FLAG(Symbol_Modifier) |
 630                      FLAG(Symbol_Other);
 631     return FLAG(qGetProp(ucs)->category) & test;
 632 }
 633
 634 /*!
 635     \fn bool QChar::isHighSurrogate() const
 636
 637     Returns true if the QChar is the high part of a UTF16 surrogate
  638     (i.e. if its code point is in the range [0xd800..0xdbff]).
 639 */
 640
 641 /*!
 642     \fn bool QChar::isLowSurrogate() const
 643
 644     Returns true if the QChar is the low part of a UTF16 surrogate
  645     (i.e. if its code point is in the range [0xdc00..0xdfff]).
 646 */
 647
 648 /*!
 649     \fn static bool QChar::isHighSurrogate(uint ucs4)
 650     \overload
 651
 652     Returns true if the UCS-4-encoded character specified by \a ucs4
 653     is the high part of a UTF16 surrogate
  654     (i.e. if its code point is in the range [0xd800..0xdbff]).
 655 */
 656
 657 /*!
 658     \fn static bool QChar::isLowSurrogate(uint ucs4)
 659     \overload
 660
 661     Returns true if the UCS-4-encoded character specified by \a ucs4
 662     is the low part of a UTF16 surrogate
  663     (i.e. if its code point is in the range [0xdc00..0xdfff]).
 664 */
 665
 666 /*!
 667     \fn static bool QChar::requiresSurrogates(uint ucs4)
 668
 669     Returns true if the UCS-4-encoded character specified by \a ucs4
 670     can be split into the high and low parts of a UTF16 surrogate
  671     (i.e. if its code point is greater than or equal to 0x10000).
 672 */
 673
 674 /*!
 675     \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
 676
 677     Converts a UTF16 surrogate pair with the given \a high and \a low values
  678     to its UCS-4-encoded code point.
 679 */
 680
 681 /*!
 682     \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
 683     \overload
 684
  685     Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
 686 */
 687
 688 /*!
 689     \fn static ushort QChar::highSurrogate(uint ucs4)
 690
 691     Returns the high surrogate part of a UCS-4-encoded code point.
 692     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 693 */
 694
 695 /*!
 696     \fn static ushort QChar::lowSurrogate(uint ucs4)
 697
 698     Returns the low surrogate part of a UCS-4-encoded code point.
 699     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 700 */
 701
 702 /*!
 703     Returns the numeric value of the digit, or -1 if the character is not a digit.
 704 */
 705 int QChar::digitValue() const
 706 {
 707     return qGetProp(ucs)->digitValue;
 708 }
 709
 710 /*!
 711     \overload
 712     Returns the numeric value of the digit, specified by the UCS-2-encoded
 713     character, \a ucs2, or -1 if the character is not a digit.
 714 */
 715 int QChar::digitValue(ushort ucs2)
 716 {
 717     return qGetProp(ucs2)->digitValue;
 718 }
 719
 720 /*!
 721     \overload
 722     Returns the numeric value of the digit specified by the UCS-4-encoded
 723     character, \a ucs4, or -1 if the character is not a digit.
 724 */
 725 int QChar::digitValue(uint ucs4)
 726 {
 727     if (ucs4 > UNICODE_LAST_CODEPOINT)
 728         return 0;
 729     return qGetProp(ucs4)->digitValue;
 730 }
 731
 732 /*!
 733     Returns the character's category.
 734 */
 735 QChar::Category QChar::category() const
 736 {
 737     return (QChar::Category) qGetProp(ucs)->category;
 738 }
 739
 740 /*!
 741     \overload
 742     Returns the category of the UCS-4-encoded character specified by \a ucs4.
 743 */
 744 QChar::Category QChar::category(uint ucs4)
 745 {
 746     if (ucs4 > UNICODE_LAST_CODEPOINT)
 747         return QChar::Other_NotAssigned;
 748     return (QChar::Category) qGetProp(ucs4)->category;
 749 }
 750
 751 /*!
 752     \overload
 753     Returns the category of the UCS-2-encoded character specified by \a ucs2.
 754 */
 755 QChar::Category QChar::category(ushort ucs2)
 756 {
 757     return (QChar::Category) qGetProp(ucs2)->category;
 758 }
 759
 760
 761 /*!
 762     Returns the character's direction.
 763 */
 764 QChar::Direction QChar::direction() const
 765 {
 766     return (QChar::Direction) qGetProp(ucs)->direction;
 767 }
 768
 769 /*!
 770     \overload
 771     Returns the direction of the UCS-4-encoded character specified by \a ucs4.
 772 */
 773 QChar::Direction QChar::direction(uint ucs4)
 774 {
 775     if (ucs4 > UNICODE_LAST_CODEPOINT)
 776         return QChar::DirL;
 777     return (QChar::Direction) qGetProp(ucs4)->direction;
 778 }
 779
 780 /*!
 781     \overload
 782     Returns the direction of the UCS-2-encoded character specified by \a ucs2.
 783 */
 784 QChar::Direction QChar::direction(ushort ucs2)
 785 {
 786     return (QChar::Direction) qGetProp(ucs2)->direction;
 787 }
 788
 789 /*!
 790     Returns information about the joining properties of the character
 791     (needed for certain languages such as Arabic).
 792 */
 793 QChar::Joining QChar::joining() const
 794 {
 795     return (QChar::Joining) qGetProp(ucs)->joining;
 796 }
 797
 798 /*!
 799     \overload
 800     Returns information about the joining properties of the UCS-4-encoded
 801     character specified by \a ucs4 (needed for certain languages such as Arabic).
 802 */
 803 QChar::Joining QChar::joining(uint ucs4)
 804 {
 805     if (ucs4 > UNICODE_LAST_CODEPOINT)
 806         return QChar::OtherJoining;
 807     return (QChar::Joining) qGetProp(ucs4)->joining;
 808 }
 809
 810 /*!
 811     \overload
 812     Returns information about the joining properties of the UCS-2-encoded
 813     character specified by \a ucs2 (needed for certain languages such as Arabic).
 814 */
 815 QChar::Joining QChar::joining(ushort ucs2)
 816 {
 817     return (QChar::Joining) qGetProp(ucs2)->joining;
 818 }
 819
 820 /*!
 821     Returns true if the character should be reversed if the text
 822     direction is reversed; otherwise returns false.
 823
 824     Same as (ch.mirroredChar() != ch).
 825
 826     \sa mirroredChar()
 827 */
 828 bool QChar::hasMirrored() const
 829 {
 830     return qGetProp(ucs)->mirrorDiff != 0;
 831 }
 832
 833 /*!
 834     \fn bool QChar::isLower() const
 835
 836     Returns true if the character is a lowercase letter, i.e.
 837     category() is Letter_Lowercase.
 838
 839     \sa isUpper(), toLower(), toUpper()
 840 */
 841
 842 /*!
 843     \fn bool QChar::isUpper() const
 844
 845     Returns true if the character is an uppercase letter, i.e.
 846     category() is Letter_Uppercase.
 847
 848     \sa isLower(), toUpper(), toLower()
 849 */
 850
 851 /*!
 852     \fn bool QChar::isTitleCase() const
 853
 854     Returns true if the character is a titlecase letter, i.e.
 855     category() is Letter_Titlecase.
 856
 857     \sa isLower(), toUpper(), toLower(), toTitleCase()
 858 */
 859
 860 /*!
 861     Returns the mirrored character if this character is a mirrored
 862     character; otherwise returns the character itself.
 863
 864     \sa hasMirrored()
 865 */
 866 QChar QChar::mirroredChar() const
 867 {
 868     return ucs + qGetProp(ucs)->mirrorDiff;
 869 }
 870
 871 /*!
 872     \overload
 873     Returns the mirrored character if the UCS-4-encoded character specified
 874     by \a ucs4 is a mirrored character; otherwise returns the character itself.
 875
 876     \sa hasMirrored()
 877 */
 878 uint QChar::mirroredChar(uint ucs4)
 879 {
 880     if (ucs4 > UNICODE_LAST_CODEPOINT)
 881         return ucs4;
 882     return ucs4 + qGetProp(ucs4)->mirrorDiff;
 883 }
 884
 885 /*!
 886     \overload
 887     Returns the mirrored character if the UCS-2-encoded character specified
 888     by \a ucs2 is a mirrored character; otherwise returns the character itself.
 889
 890     \sa hasMirrored()
 891 */
 892 ushort QChar::mirroredChar(ushort ucs2)
 893 {
 894     return ucs2 + qGetProp(ucs2)->mirrorDiff;
 895 }
 896
 897
 898 enum {
 899     Hangul_SBase = 0xac00,
 900     Hangul_LBase = 0x1100,
 901     Hangul_VBase = 0x1161,
 902     Hangul_TBase = 0x11a7,
 903     Hangul_SCount = 11172,
 904     Hangul_LCount = 19,
 905     Hangul_VCount = 21,
 906     Hangul_TCount = 28,
 907     Hangul_NCount = 21*28
 908 };
 909
 910 // buffer has to have a length of 3. It's needed for Hangul decomposition
 911 static const unsigned short * QT_FASTCALL decompositionHelper
 912     (uint ucs4, int *length, int *tag, unsigned short *buffer)
 913 {
 914     *length = 0;
 915     if (ucs4 > UNICODE_LAST_CODEPOINT)
 916         return 0;
 917     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
 918         int SIndex = ucs4 - Hangul_SBase;
 919         buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
 920         buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
 921         buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
 922         *length = buffer[2] == Hangul_TBase ? 2 : 3;
 923         *tag = QChar::Canonical;
 924         return buffer;
 925     }
 926
 927     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
 928     if (index == 0xffff)
 929         return 0;
 930     const unsigned short *decomposition = uc_decomposition_map+index;
 931     *tag = (*decomposition) & 0xff;
 932     *length = (*decomposition) >> 8;
 933     return decomposition+1;
 934 }
 935
 936 /*!
 937     Decomposes a character into it's constituent parts. Returns an empty string
 938     if no decomposition exists.
 939 */
 940 QString QChar::decomposition() const
 941 {
 942     return decomposition(ucs);
 943 }
 944
 945 /*!
 946     \overload
 947     Decomposes the UCS-4-encoded character specified by \a ucs4 into it's
 948     constituent parts. Returns an empty string if no decomposition exists.
 949 */
 950 QString QChar::decomposition(uint ucs4)
 951 {
 952     unsigned short buffer[3];
 953     int length;
 954     int tag;
 955     const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
 956     return QString::fromUtf16(d, length);
 957 }
 958
 959 /*!
 960     Returns the tag defining the composition of the character. Returns
 961     QChar::NoDecomposition if no decomposition exists.
 962 */
 963 QChar::Decomposition QChar::decompositionTag() const
 964 {
 965     return decompositionTag(ucs);
 966 }
 967
 968 /*!
 969     \overload
 970     Returns the tag defining the composition of the UCS-4-encoded character
 971     specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
 972 */
 973 QChar::Decomposition QChar::decompositionTag(uint ucs4)
 974 {
 975     if (ucs4 > UNICODE_LAST_CODEPOINT)
 976         return QChar::NoDecomposition;
 977     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
 978     if (index == 0xffff)
 979         return QChar::NoDecomposition;
 980     return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
 981 }
 982
 983 /*!
 984     Returns the combining class for the character as defined in the
 985     Unicode standard. This is mainly useful as a positioning hint for
 986     marks attached to a base character.
 987
 988     The Qt text rendering engine uses this information to correctly
 989     position non-spacing marks around a base character.
 990 */
 991 unsigned char QChar::combiningClass() const
 992 {
 993     return (unsigned char) qGetProp(ucs)->combiningClass;
 994 }
 995
 996 /*!
 997     \overload
 998     Returns the combining class for the UCS-4-encoded character specified by
 999     \a ucs4, as defined in the Unicode standard.
1000 */
1001 unsigned char QChar::combiningClass(uint ucs4)
1002 {
1003     if (ucs4 > UNICODE_LAST_CODEPOINT)
1004         return 0;
1005     return (unsigned char) qGetProp(ucs4)->combiningClass;
1006 }
1007
1008 /*!
1009     \overload
1010     Returns the combining class for the UCS-2-encoded character specified by
1011     \a ucs2, as defined in the Unicode standard.
1012 */
1013 unsigned char QChar::combiningClass(ushort ucs2)
1014 {
1015     return (unsigned char) qGetProp(ucs2)->combiningClass;
1016 }
1017
1018 /*!
1019     Returns the Unicode version that introduced this character.
1020 */
1021 QChar::UnicodeVersion QChar::unicodeVersion() const
1022 {
1023     return (QChar::UnicodeVersion) qGetProp(ucs)->unicodeVersion;
1024 }
1025
1026 /*!
1027     \overload
1028     Returns the Unicode version that introduced the character specified in
1029     its UCS-4-encoded form as \a ucs4.
1030 */
1031 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
1032 {
1033     if (ucs4 > UNICODE_LAST_CODEPOINT)
1034         return QChar::Unicode_Unassigned;
1035     return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1036 }
1037
1038 /*!
1039     \overload
1040     Returns the Unicode version that introduced the character specified in
1041     its UCS-2-encoded form as \a ucs2.
1042 */
1043 QChar::UnicodeVersion QChar::unicodeVersion(ushort ucs2)
1044 {
1045     return (QChar::UnicodeVersion) qGetProp(ucs2)->unicodeVersion;
1046 }
1047
1048 /*!
1049     Returns the most recent supported Unicode version.
1050 */
1051 QChar::UnicodeVersion QChar::currentUnicodeVersion()
1052 {
1053     return UNICODE_DATA_VERSION;
1054 }
1055
1056 /*!
1057     Returns the lowercase equivalent if the character is uppercase or titlecase;
1058     otherwise returns the character itself.
1059 */
1060 QChar QChar::toLower() const
1061 {
1062     const QUnicodeTables::Properties *p = qGetProp(ucs);
1063     if (!p->lowerCaseSpecial)
1064         return ucs + p->lowerCaseDiff;
1065     return ucs;
1066 }
1067
1068 /*!
1069     \overload
1070     Returns the lowercase equivalent of the UCS-4-encoded character specified
1071     by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1072     the character itself.
1073 */
1074 uint QChar::toLower(uint ucs4)
1075 {
1076     if (ucs4 > UNICODE_LAST_CODEPOINT)
1077         return ucs4;
1078     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1079     if (!p->lowerCaseSpecial)
1080         return ucs4 + p->lowerCaseDiff;
1081     return ucs4;
1082 }
1083
1084 /*!
1085     \overload
1086     Returns the lowercase equivalent of the UCS-2-encoded character specified
1087     by \a ucs2 if the character is uppercase or titlecase; otherwise returns
1088     the character itself.
1089 */
1090 ushort QChar::toLower(ushort ucs2)
1091 {
1092     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1093     if (!p->lowerCaseSpecial)
1094         return ucs2 + p->lowerCaseDiff;
1095     return ucs2;
1096 }
1097
1098 /*!
1099     Returns the uppercase equivalent if the character is lowercase or titlecase;
1100     otherwise returns the character itself.
1101 */
1102 QChar QChar::toUpper() const
1103 {
1104     const QUnicodeTables::Properties *p = qGetProp(ucs);
1105     if (!p->upperCaseSpecial)
1106         return ucs + p->upperCaseDiff;
1107     return ucs;
1108 }
1109
1110 /*!
1111     \overload
1112     Returns the uppercase equivalent of the UCS-4-encoded character specified
1113     by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1114     the character itself.
1115 */
1116 uint QChar::toUpper(uint ucs4)
1117 {
1118     if (ucs4 > UNICODE_LAST_CODEPOINT)
1119         return ucs4;
1120     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1121     if (!p->upperCaseSpecial)
1122         return ucs4 + p->upperCaseDiff;
1123     return ucs4;
1124 }
1125
1126 /*!
1127     \overload
1128     Returns the uppercase equivalent of the UCS-2-encoded character specified
1129     by \a ucs2 if the character is lowercase or titlecase; otherwise returns
1130     the character itself.
1131 */
1132 ushort QChar::toUpper(ushort ucs2)
1133 {
1134     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1135     if (!p->upperCaseSpecial)
1136         return ucs2 + p->upperCaseDiff;
1137     return ucs2;
1138 }
1139
1140 /*!
1141     Returns the title case equivalent if the character is lowercase or uppercase;
1142     otherwise returns the character itself.
1143 */
1144 QChar QChar::toTitleCase() const
1145 {
1146     const QUnicodeTables::Properties *p = qGetProp(ucs);
1147     if (!p->titleCaseSpecial)
1148         return ucs + p->titleCaseDiff;
1149     return ucs;
1150 }
1151
1152 /*!
1153     \overload
1154     Returns the title case equivalent of the UCS-4-encoded character specified
1155     by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1156     the character itself.
1157 */
1158 uint QChar::toTitleCase(uint ucs4)
1159 {
1160     if (ucs4 > UNICODE_LAST_CODEPOINT)
1161         return ucs4;
1162     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1163     if (!p->titleCaseSpecial)
1164         return ucs4 + p->titleCaseDiff;
1165     return ucs4;
1166 }
1167
1168 /*!
1169     \overload
1170     Returns the title case equivalent of the UCS-2-encoded character specified
1171     by \a ucs2 if the character is lowercase or uppercase; otherwise returns
1172     the character itself.
1173 */
1174 ushort QChar::toTitleCase(ushort ucs2)
1175 {
1176     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1177     if (!p->titleCaseSpecial)
1178         return ucs2 + p->titleCaseDiff;
1179     return ucs2;
1180 }
1181
1182
1183 static inline uint foldCase(const ushort *ch, const ushort *start)
1184 {
1185     uint c = *ch;
1186     if (QChar(c).isLowSurrogate() && ch > start && QChar(*(ch - 1)).isHighSurrogate())
1187         c = QChar::surrogateToUcs4(*(ch - 1), c);
1188     return *ch + qGetProp(c)->caseFoldDiff;
1189 }
1190
1191 static inline uint foldCase(uint ch, uint &last)
1192 {
1193     uint c = ch;
1194     if (QChar(c).isLowSurrogate() && QChar(last).isHighSurrogate())
1195         c = QChar::surrogateToUcs4(last, c);
1196     last = ch;
1197     return ch + qGetProp(c)->caseFoldDiff;
1198 }
1199
1200 static inline ushort foldCase(ushort ch)
1201 {
1202     return ch + qGetProp(ch)->caseFoldDiff;
1203 }
1204
1205 /*!
1206     Returns the case folded equivalent of the character. For most Unicode characters this
1207     is the same as toLowerCase().
1208 */
1209 QChar QChar::toCaseFolded() const
1210 {
1211     return ucs + qGetProp(ucs)->caseFoldDiff;
1212 }
1213
1214 /*!
1215     \overload
1216     Returns the case folded equivalent of the UCS-4-encoded character specified
1217     by \a ucs4. For most Unicode characters this is the same as toLowerCase().
1218 */
1219 uint QChar::toCaseFolded(uint ucs4)
1220 {
1221     if (ucs4 > UNICODE_LAST_CODEPOINT)
1222         return ucs4;
1223     return ucs4 + qGetProp(ucs4)->caseFoldDiff;
1224 }
1225
1226 /*!
1227     \overload
1228     Returns the case folded equivalent of the UCS-2-encoded character specified
1229     by \a ucs2. For most Unicode characters this is the same as toLowerCase().
1230 */
1231 ushort QChar::toCaseFolded(ushort ucs2)
1232 {
1233     return ucs2 + qGetProp(ucs2)->caseFoldDiff;
1234 }
1235
1236 /*!
1237     \fn char QChar::toLatin1() const
1238
1239     Returns the Latin-1 character equivalent to the QChar, or 0. This
1240     is mainly useful for non-internationalized software.
1241
1242     \sa toAscii(), unicode()
1243 */
1244
1245 /*!
1246     \fn char QChar::toAscii() const
1247
1248     Returns the Latin-1 character value of the QChar, or 0 if the character is not
1249     representable.
1250
1251     The main purpose of this function is to preserve ASCII characters used
1252     in C strings. This is mainly useful for developers of non-internationalized
1253     software.
1254
1255     \note It is not possible to distinguish a non-Latin 1 character from an ASCII 0
1256     (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1257
1258     \sa toLatin1(), unicode()
1259 */
1260
1261 /*!
    \fn QChar QChar::fromAscii(char c)

    Converts the ASCII character \a c to its equivalent QChar. This
1265     is mainly useful for non-internationalized software.
1266
1267     An alternative is to use QLatin1Char.
1268
1269     \sa fromLatin1(), unicode()
1270 */
1271
1272 #ifndef QT_NO_DATASTREAM
1273 /*!
1274     \relates QChar
1275
1276     Writes the char \a chr to the stream \a out.
1277
1278     \sa {Serializing Qt Data Types}
1279 */
1280 QDataStream &operator<<(QDataStream &out, QChar chr)
1281 {
1282     out << quint16(chr.unicode());
1283     return out;
1284 }
1285
1286 /*!
1287     \relates QChar
1288
1289     Reads a char from the stream \a in into char \a chr.
1290
1291     \sa {Serializing Qt Data Types}
1292 */
1293 QDataStream &operator>>(QDataStream &in, QChar &chr)
1294 {
1295     quint16 u;
1296     in >> u;
1297     chr.unicode() = ushort(u);
1298     return in;
1299 }
1300 #endif // QT_NO_DATASTREAM
1301
1302 /*!
1303     \fn ushort & QChar::unicode()
1304
1305     Returns a reference to the numeric Unicode value of the QChar.
1306 */
1307
1308 /*!
1309     \fn ushort QChar::unicode() const
1310
1311     \overload
1312 */
1313
1314 /*****************************************************************************
1315   Documentation of QChar related functions
1316  *****************************************************************************/
1317
1318 /*!
1319     \fn bool operator==(QChar c1, QChar c2)
1320
1321     \relates QChar
1322
1323     Returns true if \a c1 and \a c2 are the same Unicode character;
1324     otherwise returns false.
1325 */
1326
1327 /*!
1328     \fn int operator!=(QChar c1, QChar c2)
1329
1330     \relates QChar
1331
1332     Returns true if \a c1 and \a c2 are not the same Unicode
1333     character; otherwise returns false.
1334 */
1335
1336 /*!
1337     \fn int operator<=(QChar c1, QChar c2)
1338
1339     \relates QChar
1340
1341     Returns true if the numeric Unicode value of \a c1 is less than
1342     or equal to that of \a c2; otherwise returns false.
1343 */
1344
1345 /*!
1346     \fn int operator>=(QChar c1, QChar c2)
1347
1348     \relates QChar
1349
1350     Returns true if the numeric Unicode value of \a c1 is greater than
1351     or equal to that of \a c2; otherwise returns false.
1352 */
1353
1354 /*!
1355     \fn int operator<(QChar c1, QChar c2)
1356
1357     \relates QChar
1358
1359     Returns true if the numeric Unicode value of \a c1 is less than
1360     that of \a c2; otherwise returns false.
1361 */
1362
1363 /*!
1364     \fn int operator>(QChar c1, QChar c2)
1365
1366     \relates QChar
1367
1368     Returns true if the numeric Unicode value of \a c1 is greater than
1369     that of \a c2; otherwise returns false.
1370 */
1371
1372
1373 // ---------------------------------------------------------------------------
1374
1375
1376 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
1377 {
1378     unsigned short buffer[3];
1379
1380     QString &s = *str;
1381
1382     const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1383     const unsigned short *uc = utf16 + s.length();
1384     while (uc != utf16 + from) {
1385         uint ucs4 = *(--uc);
1386         if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1387             ushort high = *(uc - 1);
1388             if (QChar(high).isHighSurrogate()) {
1389                 --uc;
1390                 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1391             }
1392         }
1393         const QChar::UnicodeVersion v = QChar::unicodeVersion(ucs4);
1394         if (v > version || v == QChar::Unicode_Unassigned)
1395             continue;
1396         int length;
1397         int tag;
1398         const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1399         if (!d || (canonical && tag != QChar::Canonical))
1400             continue;
1401
1402         int pos = uc - utf16;
1403         s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1404         // since the insert invalidates the pointers and we do decomposition recursive
1405         utf16 = reinterpret_cast<unsigned short *>(s.data());
1406         uc = utf16 + pos + length;
1407     }
1408 }
1409
1410
1411 struct UCS2Pair {
1412     ushort u1;
1413     ushort u2;
1414 };
1415
1416 inline bool operator<(ushort u1, const UCS2Pair &ligature)
1417 { return u1 < ligature.u1; }
1418 inline bool operator<(const UCS2Pair &ligature, ushort u1)
1419 { return ligature.u1 < u1; }
1420
1421 static ushort ligatureHelper(ushort u1, ushort u2)
1422 {
1423     // hangul L-V pair
1424     int LIndex = u1 - Hangul_LBase;
1425     if (0 <= LIndex && LIndex < Hangul_LCount) {
1426         int VIndex = u2 - Hangul_VBase;
1427         if (0 <= VIndex && VIndex < Hangul_VCount)
1428             return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1429     }
1430
1431     // hangul LV-T pair
1432     int SIndex = u1 - Hangul_SBase;
1433     if (0 <= SIndex && SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1434         int TIndex = u2 - Hangul_TBase;
1435         if (0 <= TIndex && TIndex <= Hangul_TCount)
1436             return u1 + TIndex;
1437     }
1438
1439     const unsigned short index = GET_LIGATURE_INDEX(u2);
1440     if (index == 0xffff)
1441         return 0;
1442     const unsigned short *ligatures = uc_ligature_map+index;
1443     ushort length = *ligatures++;
1444     {
1445         const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1446         const UCS2Pair *r = qBinaryFind(data, data + length, u1);
1447         if (r != data + length)
1448             return r->u2;
1449     }
1450
1451     return 0;
1452 }
1453
1454 static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
1455 {
1456     QString &s = *str;
1457
1458     if (s.length() - from < 2)
1459         return;
1460
1461     // the loop can partly ignore high Unicode as all ligatures are in the BMP
1462     int starter = 0;
1463     int lastCombining = 0;
1464     int pos = from;
1465     while (pos < s.length()) {
1466         uint uc = s.at(pos).unicode();
1467         if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1468             ushort low = s.at(pos+1).unicode();
1469             if (QChar(low).isLowSurrogate()) {
1470                 uc = QChar::surrogateToUcs4(uc, low);
1471                 ++pos;
1472             }
1473         }
1474         const QUnicodeTables::Properties *p = qGetProp(uc);
1475         if (p->unicodeVersion > version || p->unicodeVersion == QChar::Unicode_Unassigned) {
1476             starter = -1; // to prevent starter == pos - 1
1477             lastCombining = 0;
1478             ++pos;
1479             continue;
1480         }
1481         int combining = p->combiningClass;
1482         if (starter == pos - 1 || combining > lastCombining) {
1483             // allowed to form ligature with S
1484             QChar ligature = ligatureHelper(s.at(starter).unicode(), uc);
1485             if (ligature.unicode()) {
1486                 s[starter] = ligature;
1487                 s.remove(pos, 1);
1488                 continue;
1489             }
1490         }
1491         if (!combining)
1492             starter = pos;
1493         lastCombining = combining;
1494         ++pos;
1495     }
1496 }
1497
1498
1499 static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
1500 {
1501     QString &s = *str;
1502     const int l = s.length()-1;
1503     int pos = from;
1504     while (pos < l) {
1505         int p2 = pos+1;
1506         uint u1 = s.at(pos).unicode();
1507         if (QChar(u1).isHighSurrogate()) {
1508             ushort low = s.at(p2).unicode();
1509             if (QChar(low).isLowSurrogate()) {
1510                 u1 = QChar::surrogateToUcs4(u1, low);
1511                 if (p2 >= l)
1512                     break;
1513                 ++p2;
1514             }
1515         }
1516         uint u2 = s.at(p2).unicode();
1517         if (QChar(u2).isHighSurrogate() && p2 < l) {
1518             ushort low = s.at(p2+1).unicode();
1519             if (QChar(low).isLowSurrogate()) {
1520                 u2 = QChar::surrogateToUcs4(u2, low);
1521                 ++p2;
1522             }
1523         }
1524
1525         ushort c2 = 0;
1526         {
1527             const QUnicodeTables::Properties *p = qGetProp(u2);
1528             if (p->unicodeVersion <= version && p->unicodeVersion != QChar::Unicode_Unassigned)
1529                 c2 = p->combiningClass;
1530         }
1531         if (c2 == 0) {
1532             pos = p2+1;
1533             continue;
1534         }
1535
1536         ushort c1 = 0;
1537         {
1538             const QUnicodeTables::Properties *p = qGetProp(u1);
1539             if (p->unicodeVersion <= version && p->unicodeVersion != QChar::Unicode_Unassigned)
1540                 c1 = p->combiningClass;
1541         }
1542
1543         if (c1 > c2) {
1544             QChar *uc = s.data();
1545             int p = pos;
1546             // exchange characters
1547             if (!QChar::requiresSurrogates(u2)) {
1548                 uc[p++] = u2;
1549             } else {
1550                 uc[p++] = QChar::highSurrogate(u2);
1551                 uc[p++] = QChar::lowSurrogate(u2);
1552             }
1553             if (!QChar::requiresSurrogates(u1)) {
1554                 uc[p++] = u1;
1555             } else {
1556                 uc[p++] = QChar::highSurrogate(u1);
1557                 uc[p++] = QChar::lowSurrogate(u1);
1558             }
1559             if (pos > 0)
1560                 --pos;
1561             if (pos > 0 && s.at(pos).isLowSurrogate())
1562                 --pos;
1563         } else {
1564             ++pos;
1565             if (QChar::requiresSurrogates(u1))
1566                 ++pos;
1567         }
1568     }
1569 }
1570
1571 QT_END_NAMESPACE