src/corelib/tools/qchar.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 // Don't define it while compiling this module, or USERS of Qt will
  43 // not be able to link.
  44 #ifdef QT_NO_CAST_FROM_ASCII
  45 #  undef QT_NO_CAST_FROM_ASCII
  46 #endif
  47 #ifdef QT_NO_CAST_TO_ASCII
  48 #  undef QT_NO_CAST_TO_ASCII
  49 #endif
  50 #include "qchar.h"
  51
  52 #include "qdatastream.h"
  53
  54 #include "qunicodetables_p.h"
  55 #include "qunicodetables.cpp"
  56
  57 QT_BEGIN_NAMESPACE
  58
  // Turns a small non-negative value (here: a QChar::Category) into a
  // single-bit mask, so several values can be tested with one bitwise AND.
  #define FLAG(bit) (1 << (bit))
  60
  61 /*!
  62     \class QLatin1Char
  63     \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
  64
  65     \ingroup string-processing
  66
  67     This class is only useful for constructing a QChar from an 8-bit character.
  68
  69     \sa QChar, QLatin1String, QString
  70 */
  71
  72 /*!
  73     \fn const char QLatin1Char::toLatin1() const
  74
  75     Converts a Latin-1 character to an 8-bit ASCII representation of the character.
  76 */
  77
  78 /*!
  79     \fn const ushort QLatin1Char::unicode() const
  80
  81     Converts a Latin-1 character to a 16-bit-encoded Unicode representation
  82     of the character.
  83 */
  84
  85 /*!
  86     \fn QLatin1Char::QLatin1Char(char c)
  87
  88     Constructs a Latin-1 character for \a c. This constructor should be
  89     used when the encoding of the input character is known to be Latin-1.
  90 */
  91
  92 /*!
  93     \class QChar
  94     \brief The QChar class provides a 16-bit Unicode character.
  95
  96     \ingroup string-processing
  97     \reentrant
  98
  99     In Qt, Unicode characters are 16-bit entities without any markup
 100     or structure. This class represents such an entity. It is
 101     lightweight, so it can be used everywhere. Most compilers treat
 102     it like an \c{unsigned short}.
 103
 104     QChar provides a full complement of testing/classification
 105     functions, converting to and from other formats, converting from
 106     composed to decomposed Unicode, and trying to compare and
 107     case-convert if you ask it to.
 108
 109     The classification functions include functions like those in the
 110     standard C++ header \<cctype\> (formerly \<ctype.h\>), but
 111     operating on the full range of Unicode characters. They all
 112     return true if the character is a certain type of character;
 113     otherwise they return false. These classification functions are
 114     isNull() (returns true if the character is '\\0'), isPrint()
 115     (true if the character is any sort of printable character,
 116     including whitespace), isPunct() (any sort of punctuation),
 117     isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
 118     sort of numeric character, not just 0-9), isLetterOrNumber(), and
 119     isDigit() (decimal digits). All of these are wrappers around
 120     category(), which returns the Unicode-defined category of each
 121     character.
 122
 123     QChar also provides direction(), which indicates the "natural"
 124     writing direction of this character. The joining() function
 125     indicates how the character joins with its neighbors (needed
 126     mostly for Arabic) and finally hasMirrored(), which indicates
 127     whether the character needs to be mirrored when it is printed in
 128     its "unnatural" writing direction.
 129
 130     Composed Unicode characters (like \a ring) can be converted to
 131     decomposed Unicode ("a" followed by "ring above") by using decomposition().
 132
 133     In Unicode, comparison is not necessarily possible and case
 134     conversion is very difficult at best. Unicode, covering the
 135     "entire" world, also includes most of the world's case and
 136     sorting problems. operator==() and friends will do comparison
 137     based purely on the numeric Unicode value (code point) of the
 138     characters, and toUpper() and toLower() will do case changes when
 139     the character has a well-defined uppercase/lowercase equivalent.
 140     For locale-dependent comparisons, use QString::localeAwareCompare().
 141
 142     The conversion functions include unicode() (to a scalar),
 143     toLatin1() (to scalar, but converts all non-Latin-1 characters to
 144     0), row() (gives the Unicode row), cell() (gives the Unicode
 145     cell), digitValue() (gives the integer value of any of the
 146     numerous digit characters), and a host of constructors.
 147
 148     QChar provides constructors and cast operators that make it easy
 149     to convert to and from traditional 8-bit \c{char}s. If you
 150     defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
 151     explained in the QString documentation, you will need to
 152     explicitly call fromAscii() or fromLatin1(), or use QLatin1Char,
 153     to construct a QChar from an 8-bit \c char, and you will need to
 154     call toAscii() or toLatin1() to get the 8-bit value back.
 155
 156     \sa Unicode, QString, QLatin1Char
 157 */
 158
 159 /*!
 160     \enum QChar::UnicodeVersion
 161
 162     Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
 163     introduced a certain character.
 164
 165     \value Unicode_1_1  Version 1.1
 166     \value Unicode_2_0  Version 2.0
 167     \value Unicode_2_1_2  Version 2.1.2
 168     \value Unicode_3_0  Version 3.0
 169     \value Unicode_3_1  Version 3.1
 170     \value Unicode_3_2  Version 3.2
 171     \value Unicode_4_0  Version 4.0
 172     \value Unicode_4_1  Version 4.1
 173     \value Unicode_5_0  Version 5.0
 174     \value Unicode_Unassigned  The value is not assigned to any character
 175         in version 5.0 of Unicode.
 176
 177     \sa unicodeVersion(), currentUnicodeVersion()
 178 */
 179
 180 /*!
 181     \enum QChar::Category
 182
 183     This enum maps the Unicode character categories.
 184
 185     The following categories are normative in Unicode:
 186
 187     \value Mark_NonSpacing  Unicode class name Mn
 188
 189     \value Mark_SpacingCombining  Unicode class name Mc
 190
 191     \value Mark_Enclosing  Unicode class name Me
 192
 193     \value Number_DecimalDigit  Unicode class name Nd
 194
 195     \value Number_Letter  Unicode class name Nl
 196
 197     \value Number_Other  Unicode class name No
 198
 199     \value Separator_Space  Unicode class name Zs
 200
 201     \value Separator_Line  Unicode class name Zl
 202
 203     \value Separator_Paragraph  Unicode class name Zp
 204
 205     \value Other_Control  Unicode class name Cc
 206
 207     \value Other_Format  Unicode class name Cf
 208
 209     \value Other_Surrogate  Unicode class name Cs
 210
 211     \value Other_PrivateUse  Unicode class name Co
 212
 213     \value Other_NotAssigned  Unicode class name Cn
 214
 215
 216     The following categories are informative in Unicode:
 217
 218     \value Letter_Uppercase  Unicode class name Lu
 219
 220     \value Letter_Lowercase  Unicode class name Ll
 221
 222     \value Letter_Titlecase  Unicode class name Lt
 223
 224     \value Letter_Modifier  Unicode class name Lm
 225
 226     \value Letter_Other Unicode class name Lo
 227
 228     \value Punctuation_Connector  Unicode class name Pc
 229
 230     \value Punctuation_Dash  Unicode class name Pd
 231
 232     \value Punctuation_Open  Unicode class name Ps
 233
 234     \value Punctuation_Close  Unicode class name Pe
 235
 236     \value Punctuation_InitialQuote  Unicode class name Pi
 237
 238     \value Punctuation_FinalQuote  Unicode class name Pf
 239
 240     \value Punctuation_Other  Unicode class name Po
 241
 242     \value Symbol_Math  Unicode class name Sm
 243
 244     \value Symbol_Currency  Unicode class name Sc
 245
 246     \value Symbol_Modifier  Unicode class name Sk
 247
 248     \value Symbol_Other  Unicode class name So
 249
 250     \sa category()
 251 */
 252
 253 /*!
 254     \enum QChar::Direction
 255
 256     This enum type defines the Unicode direction attributes. See the
 257     \l{http://www.unicode.org/}{Unicode Standard} for a description
 258     of the values.
 259
 260     In order to conform to C/C++ naming conventions "Dir" is prepended
 261     to the codes used in the Unicode Standard.
 262
 263     \value DirAL
 264     \value DirAN
 265     \value DirB
 266     \value DirBN
 267     \value DirCS
 268     \value DirEN
 269     \value DirES
 270     \value DirET
 271     \value DirL
 272     \value DirLRE
 273     \value DirLRO
 274     \value DirNSM
 275     \value DirON
 276     \value DirPDF
 277     \value DirR
 278     \value DirRLE
 279     \value DirRLO
 280     \value DirS
 281     \value DirWS
 282
 283     \sa direction()
 284 */
 285
 286 /*!
 287     \enum QChar::Decomposition
 288
 289     This enum type defines the Unicode decomposition attributes. See
 290     the \l{http://www.unicode.org/}{Unicode Standard} for a
 291     description of the values.
 292
 293     \value NoDecomposition
 294     \value Canonical
 295     \value Circle
 296     \value Compat
 297     \value Final
 298     \value Font
 299     \value Fraction
 300     \value Initial
 301     \value Isolated
 302     \value Medial
 303     \value Narrow
 304     \value NoBreak
 305     \value Small
 306     \value Square
 307     \value Sub
 308     \value Super
 309     \value Vertical
 310     \value Wide
 311
 312     \sa decomposition()
 313 */
 314
 315 /*!
 316     \enum QChar::Joining
 317
 318     This enum type defines the Unicode joining attributes. See the
 319     \l{http://www.unicode.org/}{Unicode Standard} for a description
 320     of the values.
 321
 322     \value Center
 323     \value Dual
 324     \value OtherJoining
 325     \value Right
 326
 327     \sa joining()
 328 */
 329
 330 /*!
 331     \enum QChar::CombiningClass
 332
 333     \internal
 334
 335     This enum type defines names for some of the Unicode combining
 336     classes. See the \l{http://www.unicode.org/}{Unicode Standard}
 337     for a description of the values.
 338
 339     \value Combining_Above
 340     \value Combining_AboveAttached
 341     \value Combining_AboveLeft
 342     \value Combining_AboveLeftAttached
 343     \value Combining_AboveRight
 344     \value Combining_AboveRightAttached
 345     \value Combining_Below
 346     \value Combining_BelowAttached
 347     \value Combining_BelowLeft
 348     \value Combining_BelowLeftAttached
 349     \value Combining_BelowRight
 350     \value Combining_BelowRightAttached
 351     \value Combining_DoubleAbove
 352     \value Combining_DoubleBelow
 353     \value Combining_IotaSubscript
 354     \value Combining_Left
 355     \value Combining_LeftAttached
 356     \value Combining_Right
 357     \value Combining_RightAttached
 358 */
 359
 360 /*!
 361     \enum QChar::SpecialCharacter
 362
 363     \value Null A QChar with this value isNull().
 364     \value Nbsp Non-breaking space.
 365     \value ReplacementCharacter The character shown when a font has no glyph
 366            for a certain codepoint. A special question mark character is often
 367            used. Codecs use this codepoint when input data cannot be
 368            represented in Unicode.
 369     \value ObjectReplacementCharacter Used to represent an object such as an
 370            image when such objects cannot be presented.
 371     \value ByteOrderMark
 372     \value ByteOrderSwapped
 373     \value ParagraphSeparator
 374     \value LineSeparator
 375 */
 376
 377 /*!
 378     \fn void QChar::setCell(uchar cell)
 379     \internal
 380 */
 381
 382 /*!
 383     \fn void QChar::setRow(uchar row)
 384     \internal
 385 */
 386
 387 /*!
 388     \fn QChar::QChar()
 389
 390     Constructs a null QChar ('\\0').
 391
 392     \sa isNull()
 393 */
 394
 395 /*!
 396     \fn QChar::QChar(QLatin1Char ch)
 397
 398     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 399 */
 400
 401 /*!
 402     \fn QChar::QChar(SpecialCharacter ch)
 403
 404     Constructs a QChar for the predefined character value \a ch.
 405 */
 406
 407 /*!
 408     \fn QChar::QChar(char ch)
 409
 410     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 411 */
 412
 413 /*!
 414     \fn QChar::QChar(uchar ch)
 415
 416     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 417 */
 418
 419 /*!
 420     \fn QChar::QChar(uchar cell, uchar row)
 421
 422     Constructs a QChar for Unicode cell \a cell in row \a row.
 423
 424     \sa cell(), row()
 425 */
 426
 427 /*!
 428     \fn QChar::QChar(ushort code)
 429
 430     Constructs a QChar for the character with Unicode code point \a code.
 431 */
 432
 433 /*!
 434     \fn QChar::QChar(short code)
 435
 436     Constructs a QChar for the character with Unicode code point \a code.
 437 */
 438
 439 /*!
 440     \fn QChar::QChar(uint code)
 441
 442     Constructs a QChar for the character with Unicode code point \a code.
 443 */
 444
 445 /*!
 446     \fn QChar::QChar(int code)
 447
 448     Constructs a QChar for the character with Unicode code point \a code.
 449 */
 450
 451 /*!
 452     \fn bool QChar::isNull() const
 453
 454     Returns true if the character is the Unicode character 0x0000
 455     ('\\0'); otherwise returns false.
 456 */
 457
 458 /*!
 459     \fn uchar QChar::cell() const
 460
 461     Returns the cell (least significant byte) of the Unicode character.
 462
 463     \sa row()
 464 */
 465
 466 /*!
 467     \fn uchar QChar::row() const
 468
 469     Returns the row (most significant byte) of the Unicode character.
 470
 471     \sa cell()
 472 */
 473
 474 /*!
 475     Returns true if the character is a printable character; otherwise
 476     returns false. This is any character not of category Cc or Cn.
 477
 478     Note that this gives no indication of whether the character is
 479     available in a particular font.
 480 */
 481 bool QChar::isPrint() const
 482 {
 483     const int test = FLAG(Other_Control) |
 484                      FLAG(Other_NotAssigned);
 485     return !(FLAG(qGetProp(ucs)->category) & test);
 486 }
 487
 488 /*!
 489     \fn bool QChar::isSpace() const
 490
 491     Returns true if the character is a separator character
 492     (Separator_* categories or certain code points from Other_Control category);
 493     otherwise returns false.
 494 */
 495
 496 /*!
 497     \internal
 498     \overload
 499 */
 500 bool QChar::isSpace(ushort ucs2)
 501 {
 502     const int test = FLAG(Separator_Space) |
 503                      FLAG(Separator_Line) |
 504                      FLAG(Separator_Paragraph);
 505     return FLAG(qGetProp(ucs2)->category) & test;
 506 }
 507
 508 /*!
 509     Returns true if the character is a mark (Mark_* categories);
 510     otherwise returns false.
 511
 512     See QChar::Category for more information regarding marks.
 513 */
 514 bool QChar::isMark() const
 515 {
 516     const int test = FLAG(Mark_NonSpacing) |
 517                      FLAG(Mark_SpacingCombining) |
 518                      FLAG(Mark_Enclosing);
 519     return FLAG(qGetProp(ucs)->category) & test;
 520 }
 521
 522 /*!
 523     Returns true if the character is a punctuation mark (Punctuation_*
 524     categories); otherwise returns false.
 525 */
 526 bool QChar::isPunct() const
 527 {
 528     const int test = FLAG(Punctuation_Connector) |
 529                      FLAG(Punctuation_Dash) |
 530                      FLAG(Punctuation_Open) |
 531                      FLAG(Punctuation_Close) |
 532                      FLAG(Punctuation_InitialQuote) |
 533                      FLAG(Punctuation_FinalQuote) |
 534                      FLAG(Punctuation_Other);
 535     return FLAG(qGetProp(ucs)->category) & test;
 536 }
 537
 538 /*!
 539     \fn bool QChar::isLetter() const
 540
 541     Returns true if the character is a letter (Letter_* categories);
 542     otherwise returns false.
 543 */
 544
 545 /*!
 546     \internal
 547     \overload
 548 */
 549 bool QChar::isLetter(ushort ucs2)
 550 {
 551     const int test = FLAG(Letter_Uppercase) |
 552                      FLAG(Letter_Lowercase) |
 553                      FLAG(Letter_Titlecase) |
 554                      FLAG(Letter_Modifier) |
 555                      FLAG(Letter_Other);
 556     return FLAG(qGetProp(ucs2)->category) & test;
 557 }
 558
 559 /*!
 560     Returns true if the character is a number (Number_* categories,
 561     not just 0-9); otherwise returns false.
 562
 563     \sa isDigit()
 564 */
 565 bool QChar::isNumber() const
 566 {
 567     const int test = FLAG(Number_DecimalDigit) |
 568                      FLAG(Number_Letter) |
 569                      FLAG(Number_Other);
 570     return FLAG(qGetProp(ucs)->category) & test;
 571 }
 572
 573 /*!
 574     \fn bool QChar::isLetterOrNumber() const
 575
 576     Returns true if the character is a letter or number (Letter_* or
 577     Number_* categories); otherwise returns false.
 578 */
 579
 580 /*!
 581     \internal
 582     \overload
 583 */
 584 bool QChar::isLetterOrNumber(ushort ucs2)
 585 {
 586     const int test = FLAG(Letter_Uppercase) |
 587                      FLAG(Letter_Lowercase) |
 588                      FLAG(Letter_Titlecase) |
 589                      FLAG(Letter_Modifier) |
 590                      FLAG(Letter_Other) |
 591                      FLAG(Number_DecimalDigit) |
 592                      FLAG(Number_Letter) |
 593                      FLAG(Number_Other);
 594     return FLAG(qGetProp(ucs2)->category) & test;
 595 }
 596
 597 /*!
 598     \fn bool QChar::isDigit() const
 599
 600     Returns true if the character is a decimal digit
 601     (Number_DecimalDigit); otherwise returns false.
 602 */
 603
 604 /*!
 605     \internal
 606     \overload
 607 */
 608 bool QChar::isDigit(ushort ucs2)
 609 {
 610     return (qGetProp(ucs2)->category == Number_DecimalDigit);
 611 }
 612
 613 /*!
 614     Returns true if the character is a symbol (Symbol_* categories);
 615     otherwise returns false.
 616 */
 617 bool QChar::isSymbol() const
 618 {
 619     const int test = FLAG(Symbol_Math) |
 620                      FLAG(Symbol_Currency) |
 621                      FLAG(Symbol_Modifier) |
 622                      FLAG(Symbol_Other);
 623     return FLAG(qGetProp(ucs)->category) & test;
 624 }
 625
 626 /*!
 627     \fn bool QChar::isHighSurrogate() const
 628
 629     Returns true if the QChar is the high part of a UTF16 surrogate
 630     (i.e. if its code point is in the range [0xd800..0xdbff]).
 631 */
 632
 633 /*!
 634     \fn bool QChar::isLowSurrogate() const
 635
 636     Returns true if the QChar is the low part of a UTF16 surrogate
 637     (i.e. if its code point is in the range [0xdc00..0xdfff]).
 638 */
 639
 640 /*!
 641     \fn static bool QChar::isHighSurrogate(uint ucs4)
 642     \overload
 643
 644     Returns true if the UCS-4-encoded character specified by \a ucs4
 645     is the high part of a UTF16 surrogate
 646     (i.e. if its code point is in the range [0xd800..0xdbff]).
 647 */
 648
 649 /*!
 650     \fn static bool QChar::isLowSurrogate(uint ucs4)
 651     \overload
 652
 653     Returns true if the UCS-4-encoded character specified by \a ucs4
 654     is the low part of a UTF16 surrogate
 655     (i.e. if its code point is in the range [0xdc00..0xdfff]).
 656 */
 657
 658 /*!
 659     \fn static bool QChar::requiresSurrogates(uint ucs4)
 660
 661     Returns true if the UCS-4-encoded character specified by \a ucs4
 662     can be split into the high and low parts of a UTF16 surrogate
 663     (i.e. if its code point is greater than or equal to 0x10000).
 664 */
 665
 666 /*!
 667     \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
 668
 669     Converts a UTF16 surrogate pair with the given \a high and \a low values
 670     to its UCS-4-encoded code point.
 671 */
 672
 673 /*!
 674     \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
 675     \overload
 676
 677     Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
 678 */
 679
 680 /*!
 681     \fn static ushort QChar::highSurrogate(uint ucs4)
 682
 683     Returns the high surrogate part of a UCS-4-encoded code point.
 684     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 685 */
 686
 687 /*!
 688     \fn static ushort QChar::lowSurrogate(uint ucs4)
 689
 690     Returns the low surrogate part of a UCS-4-encoded code point.
 691     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 692 */
 693
 694 /*!
 695     Returns the numeric value of the digit, or -1 if the character is not a digit.
 696 */
 697 int QChar::digitValue() const
 698 {
 699     return qGetProp(ucs)->digitValue;
 700 }
 701
 702 /*!
 703     \overload
 704     Returns the numeric value of the digit, specified by the UCS-2-encoded
 705     character, \a ucs2, or -1 if the character is not a digit.
 706 */
 707 int QChar::digitValue(ushort ucs2)
 708 {
 709     return qGetProp(ucs2)->digitValue;
 710 }
 711
 712 /*!
 713     \overload
 714     Returns the numeric value of the digit specified by the UCS-4-encoded
 715     character, \a ucs4, or -1 if the character is not a digit.
 716 */
 717 int QChar::digitValue(uint ucs4)
 718 {
 719     if (ucs4 > UNICODE_LAST_CODEPOINT)
 720         return -1;
 721     return qGetProp(ucs4)->digitValue;
 722 }
 723
 724 /*!
 725     Returns the character's category.
 726 */
 727 QChar::Category QChar::category() const
 728 {
 729     return (QChar::Category) qGetProp(ucs)->category;
 730 }
 731
 732 /*!
 733     \overload
 734     Returns the category of the UCS-4-encoded character specified by \a ucs4.
 735 */
 736 QChar::Category QChar::category(uint ucs4)
 737 {
 738     if (ucs4 > UNICODE_LAST_CODEPOINT)
 739         return QChar::Other_NotAssigned;
 740     return (QChar::Category) qGetProp(ucs4)->category;
 741 }
 742
 743 /*!
 744     \overload
 745     Returns the category of the UCS-2-encoded character specified by \a ucs2.
 746 */
 747 QChar::Category QChar::category(ushort ucs2)
 748 {
 749     return (QChar::Category) qGetProp(ucs2)->category;
 750 }
 751
 752
 753 /*!
 754     Returns the character's direction.
 755 */
 756 QChar::Direction QChar::direction() const
 757 {
 758     return (QChar::Direction) qGetProp(ucs)->direction;
 759 }
 760
 761 /*!
 762     \overload
 763     Returns the direction of the UCS-4-encoded character specified by \a ucs4.
 764 */
 765 QChar::Direction QChar::direction(uint ucs4)
 766 {
 767     if (ucs4 > UNICODE_LAST_CODEPOINT)
 768         return QChar::DirL;
 769     return (QChar::Direction) qGetProp(ucs4)->direction;
 770 }
 771
 772 /*!
 773     \overload
 774     Returns the direction of the UCS-2-encoded character specified by \a ucs2.
 775 */
 776 QChar::Direction QChar::direction(ushort ucs2)
 777 {
 778     return (QChar::Direction) qGetProp(ucs2)->direction;
 779 }
 780
 781 /*!
 782     Returns information about the joining properties of the character
 783     (needed for certain languages such as Arabic).
 784 */
 785 QChar::Joining QChar::joining() const
 786 {
 787     return (QChar::Joining) qGetProp(ucs)->joining;
 788 }
 789
 790 /*!
 791     \overload
 792     Returns information about the joining properties of the UCS-4-encoded
 793     character specified by \a ucs4 (needed for certain languages such as Arabic).
 794 */
 795 QChar::Joining QChar::joining(uint ucs4)
 796 {
 797     if (ucs4 > UNICODE_LAST_CODEPOINT)
 798         return QChar::OtherJoining;
 799     return (QChar::Joining) qGetProp(ucs4)->joining;
 800 }
 801
 802 /*!
 803     \overload
 804     Returns information about the joining properties of the UCS-2-encoded
 805     character specified by \a ucs2 (needed for certain languages such as Arabic).
 806 */
 807 QChar::Joining QChar::joining(ushort ucs2)
 808 {
 809     return (QChar::Joining) qGetProp(ucs2)->joining;
 810 }
 811
 812 /*!
 813     Returns true if the character should be reversed if the text
 814     direction is reversed; otherwise returns false.
 815
 816     Same as (ch.mirroredChar() != ch).
 817
 818     \sa mirroredChar()
 819 */
 820 bool QChar::hasMirrored() const
 821 {
 822     return qGetProp(ucs)->mirrorDiff != 0;
 823 }
 824
 825 /*!
 826     \fn bool QChar::isLower() const
 827
 828     Returns true if the character is a lowercase letter, i.e.
 829     category() is Letter_Lowercase.
 830
 831     \sa isUpper(), toLower(), toUpper()
 832 */
 833
 834 /*!
 835     \fn bool QChar::isUpper() const
 836
 837     Returns true if the character is an uppercase letter, i.e.
 838     category() is Letter_Uppercase.
 839
 840     \sa isLower(), toUpper(), toLower()
 841 */
 842
 843 /*!
 844     \fn bool QChar::isTitleCase() const
 845
 846     Returns true if the character is a titlecase letter, i.e.
 847     category() is Letter_Titlecase.
 848
 849     \sa isLower(), toUpper(), toLower(), toTitleCase()
 850 */
 851
 852 /*!
 853     Returns the mirrored character if this character is a mirrored
 854     character; otherwise returns the character itself.
 855
 856     \sa hasMirrored()
 857 */
 858 QChar QChar::mirroredChar() const
 859 {
 860     return ucs + qGetProp(ucs)->mirrorDiff;
 861 }
 862
 863 /*!
 864     \overload
 865     Returns the mirrored character if the UCS-4-encoded character specified
 866     by \a ucs4 is a mirrored character; otherwise returns the character itself.
 867
 868     \sa hasMirrored()
 869 */
 870 uint QChar::mirroredChar(uint ucs4)
 871 {
 872     if (ucs4 > UNICODE_LAST_CODEPOINT)
 873         return ucs4;
 874     return ucs4 + qGetProp(ucs4)->mirrorDiff;
 875 }
 876
 877 /*!
 878     \overload
 879     Returns the mirrored character if the UCS-2-encoded character specified
 880     by \a ucs2 is a mirrored character; otherwise returns the character itself.
 881
 882     \sa hasMirrored()
 883 */
 884 ushort QChar::mirroredChar(ushort ucs2)
 885 {
 886     return ucs2 + qGetProp(ucs2)->mirrorDiff;
 887 }
 888
 889
 // Constants for the arithmetic decomposition of Hangul syllables.
 enum {
     // Base code points: first syllable and the bases added to the
     // L, V and T indexes.
     Hangul_SBase = 0xac00,
     Hangul_LBase = 0x1100,
     Hangul_VBase = 0x1161,
     Hangul_TBase = 0x11a7,
     // Counts; the derived ones are spelled out as products of their factors.
     Hangul_LCount = 19,
     Hangul_VCount = 21,
     Hangul_TCount = 28,
     Hangul_NCount = Hangul_VCount * Hangul_TCount,
     Hangul_SCount = Hangul_LCount * Hangul_NCount
 };
 901
 902 // buffer has to have a length of 3. It's needed for Hangul decomposition
 903 static const unsigned short * QT_FASTCALL decompositionHelper
 904     (uint ucs4, int *length, int *tag, unsigned short *buffer)
 905 {
 906     *length = 0;
 907     if (ucs4 > UNICODE_LAST_CODEPOINT)
 908         return 0;
 909     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
 910         int SIndex = ucs4 - Hangul_SBase;
 911         buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
 912         buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
 913         buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
 914         *length = buffer[2] == Hangul_TBase ? 2 : 3;
 915         *tag = QChar::Canonical;
 916         return buffer;
 917     }
 918
 919     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
 920     if (index == 0xffff)
 921         return 0;
 922     const unsigned short *decomposition = uc_decomposition_map+index;
 923     *tag = (*decomposition) & 0xff;
 924     *length = (*decomposition) >> 8;
 925     return decomposition+1;
 926 }
 927
 928 /*!
 929     Decomposes a character into it's constituent parts. Returns an empty string
 930     if no decomposition exists.
 931 */
 932 QString QChar::decomposition() const
 933 {
 934     return decomposition(ucs);
 935 }
 936
 937 /*!
 938     \overload
 939     Decomposes the UCS-4-encoded character specified by \a ucs4 into it's
 940     constituent parts. Returns an empty string if no decomposition exists.
 941 */
 942 QString QChar::decomposition(uint ucs4)
 943 {
 944     unsigned short buffer[3];
 945     int length;
 946     int tag;
 947     const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
 948     return QString::fromUtf16(d, length);
 949 }
 950
 951 /*!
 952     Returns the tag defining the composition of the character. Returns
 953     QChar::NoDecomposition if no decomposition exists.
 954 */
 955 QChar::Decomposition QChar::decompositionTag() const
 956 {
 957     return decompositionTag(ucs);
 958 }
 959
 960 /*!
 961     \overload
 962     Returns the tag defining the composition of the UCS-4-encoded character
 963     specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
 964 */
 965 QChar::Decomposition QChar::decompositionTag(uint ucs4)
 966 {
 967     if (ucs4 > UNICODE_LAST_CODEPOINT)
 968         return QChar::NoDecomposition;
 969     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
 970     if (index == 0xffff)
 971         return QChar::NoDecomposition;
 972     return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
 973 }
 974
 975 /*!
 976     Returns the combining class for the character as defined in the
 977     Unicode standard. This is mainly useful as a positioning hint for
 978     marks attached to a base character.
 979
 980     The Qt text rendering engine uses this information to correctly
 981     position non-spacing marks around a base character.
 982 */
 983 unsigned char QChar::combiningClass() const
 984 {
 985     return (unsigned char) qGetProp(ucs)->combiningClass;
 986 }
 987
 988 /*!
 989     \overload
 990     Returns the combining class for the UCS-4-encoded character specified by
 991     \a ucs4, as defined in the Unicode standard.
 992 */
 993 unsigned char QChar::combiningClass(uint ucs4)
 994 {
 995     if (ucs4 > UNICODE_LAST_CODEPOINT)
 996         return 0;
 997     return (unsigned char) qGetProp(ucs4)->combiningClass;
 998 }
 999
1000 /*!
1001     \overload
1002     Returns the combining class for the UCS-2-encoded character specified by
1003     \a ucs2, as defined in the Unicode standard.
1004 */
1005 unsigned char QChar::combiningClass(ushort ucs2)
1006 {
1007     return (unsigned char) qGetProp(ucs2)->combiningClass;
1008 }
1009
1010 /*!
1011     Returns the Unicode version that introduced this character.
1012 */
1013 QChar::UnicodeVersion QChar::unicodeVersion() const
1014 {
1015     return (QChar::UnicodeVersion) qGetProp(ucs)->unicodeVersion;
1016 }
1017
1018 /*!
1019     \overload
1020     Returns the Unicode version that introduced the character specified in
1021     its UCS-4-encoded form as \a ucs4.
1022 */
1023 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
1024 {
1025     if (ucs4 > UNICODE_LAST_CODEPOINT)
1026         return QChar::Unicode_Unassigned;
1027     return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1028 }
1029
1030 /*!
1031     \overload
1032     Returns the Unicode version that introduced the character specified in
1033     its UCS-2-encoded form as \a ucs2.
1034 */
1035 QChar::UnicodeVersion QChar::unicodeVersion(ushort ucs2)
1036 {
1037     return (QChar::UnicodeVersion) qGetProp(ucs2)->unicodeVersion;
1038 }
1039
1040 /*!
1041     Returns the most recent supported Unicode version.
1042 */
1043 QChar::UnicodeVersion QChar::currentUnicodeVersion()
1044 {
1045     return UNICODE_DATA_VERSION;
1046 }
1047
1048 /*!
1049     Returns the lowercase equivalent if the character is uppercase or titlecase;
1050     otherwise returns the character itself.
1051 */
1052 QChar QChar::toLower() const
1053 {
1054     const QUnicodeTables::Properties *p = qGetProp(ucs);
1055     if (!p->lowerCaseSpecial)
1056         return ucs + p->lowerCaseDiff;
1057     return ucs;
1058 }
1059
1060 /*!
1061     \overload
1062     Returns the lowercase equivalent of the UCS-4-encoded character specified
1063     by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1064     the character itself.
1065 */
1066 uint QChar::toLower(uint ucs4)
1067 {
1068     if (ucs4 > UNICODE_LAST_CODEPOINT)
1069         return ucs4;
1070     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1071     if (!p->lowerCaseSpecial)
1072         return ucs4 + p->lowerCaseDiff;
1073     return ucs4;
1074 }
1075
1076 /*!
1077     \overload
1078     Returns the lowercase equivalent of the UCS-2-encoded character specified
1079     by \a ucs2 if the character is uppercase or titlecase; otherwise returns
1080     the character itself.
1081 */
1082 ushort QChar::toLower(ushort ucs2)
1083 {
1084     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1085     if (!p->lowerCaseSpecial)
1086         return ucs2 + p->lowerCaseDiff;
1087     return ucs2;
1088 }
1089
1090 /*!
1091     Returns the uppercase equivalent if the character is lowercase or titlecase;
1092     otherwise returns the character itself.
1093 */
1094 QChar QChar::toUpper() const
1095 {
1096     const QUnicodeTables::Properties *p = qGetProp(ucs);
1097     if (!p->upperCaseSpecial)
1098         return ucs + p->upperCaseDiff;
1099     return ucs;
1100 }
1101
1102 /*!
1103     \overload
1104     Returns the uppercase equivalent of the UCS-4-encoded character specified
1105     by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1106     the character itself.
1107 */
1108 uint QChar::toUpper(uint ucs4)
1109 {
1110     if (ucs4 > UNICODE_LAST_CODEPOINT)
1111         return ucs4;
1112     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1113     if (!p->upperCaseSpecial)
1114         return ucs4 + p->upperCaseDiff;
1115     return ucs4;
1116 }
1117
1118 /*!
1119     \overload
1120     Returns the uppercase equivalent of the UCS-2-encoded character specified
1121     by \a ucs2 if the character is lowercase or titlecase; otherwise returns
1122     the character itself.
1123 */
1124 ushort QChar::toUpper(ushort ucs2)
1125 {
1126     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1127     if (!p->upperCaseSpecial)
1128         return ucs2 + p->upperCaseDiff;
1129     return ucs2;
1130 }
1131
1132 /*!
1133     Returns the title case equivalent if the character is lowercase or uppercase;
1134     otherwise returns the character itself.
1135 */
1136 QChar QChar::toTitleCase() const
1137 {
1138     const QUnicodeTables::Properties *p = qGetProp(ucs);
1139     if (!p->titleCaseSpecial)
1140         return ucs + p->titleCaseDiff;
1141     return ucs;
1142 }
1143
1144 /*!
1145     \overload
1146     Returns the title case equivalent of the UCS-4-encoded character specified
1147     by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1148     the character itself.
1149 */
1150 uint QChar::toTitleCase(uint ucs4)
1151 {
1152     if (ucs4 > UNICODE_LAST_CODEPOINT)
1153         return ucs4;
1154     const QUnicodeTables::Properties *p = qGetProp(ucs4);
1155     if (!p->titleCaseSpecial)
1156         return ucs4 + p->titleCaseDiff;
1157     return ucs4;
1158 }
1159
1160 /*!
1161     \overload
1162     Returns the title case equivalent of the UCS-2-encoded character specified
1163     by \a ucs2 if the character is lowercase or uppercase; otherwise returns
1164     the character itself.
1165 */
1166 ushort QChar::toTitleCase(ushort ucs2)
1167 {
1168     const QUnicodeTables::Properties *p = qGetProp(ucs2);
1169     if (!p->titleCaseSpecial)
1170         return ucs2 + p->titleCaseDiff;
1171     return ucs2;
1172 }
1173
1174
1175 static inline uint foldCase(const ushort *ch, const ushort *start)
1176 {
1177     uint c = *ch;
1178     if (QChar(c).isLowSurrogate() && ch > start && QChar(*(ch - 1)).isHighSurrogate())
1179         c = QChar::surrogateToUcs4(*(ch - 1), c);
1180     return *ch + qGetProp(c)->caseFoldDiff;
1181 }
1182
1183 static inline uint foldCase(uint ch, uint &last)
1184 {
1185     uint c = ch;
1186     if (QChar(c).isLowSurrogate() && QChar(last).isHighSurrogate())
1187         c = QChar::surrogateToUcs4(last, c);
1188     last = ch;
1189     return ch + qGetProp(c)->caseFoldDiff;
1190 }
1191
1192 static inline ushort foldCase(ushort ch)
1193 {
1194     return ch + qGetProp(ch)->caseFoldDiff;
1195 }
1196
1197 /*!
1198     Returns the case folded equivalent of the character. For most Unicode characters this
    is the same as toLower().
1200 */
1201 QChar QChar::toCaseFolded() const
1202 {
1203     return ucs + qGetProp(ucs)->caseFoldDiff;
1204 }
1205
1206 /*!
1207     \overload
1208     Returns the case folded equivalent of the UCS-4-encoded character specified
    by \a ucs4. For most Unicode characters this is the same as toLower().
1210 */
1211 uint QChar::toCaseFolded(uint ucs4)
1212 {
1213     if (ucs4 > UNICODE_LAST_CODEPOINT)
1214         return ucs4;
1215     return ucs4 + qGetProp(ucs4)->caseFoldDiff;
1216 }
1217
1218 /*!
1219     \overload
1220     Returns the case folded equivalent of the UCS-2-encoded character specified
    by \a ucs2. For most Unicode characters this is the same as toLower().
1222 */
1223 ushort QChar::toCaseFolded(ushort ucs2)
1224 {
1225     return ucs2 + qGetProp(ucs2)->caseFoldDiff;
1226 }
1227
1228 /*!
1229     \fn char QChar::toLatin1() const
1230
1231     Returns the Latin-1 character equivalent to the QChar, or 0. This
1232     is mainly useful for non-internationalized software.
1233
1234     \sa toAscii(), unicode()
1235 */
1236
1237 /*!
1238     \fn char QChar::toAscii() const
1239
1240     Returns the Latin-1 character value of the QChar, or 0 if the character is not
1241     representable.
1242
1243     The main purpose of this function is to preserve ASCII characters used
1244     in C strings. This is mainly useful for developers of non-internationalized
1245     software.
1246
1247     \note It is not possible to distinguish a non-Latin 1 character from an ASCII 0
1248     (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1249
1250     \sa toLatin1(), unicode()
1251 */
1252
1253 /*!
1254     \fn QChar QChar::fromAscii(char)
1255
    Converts the ASCII character \a c to its equivalent QChar. This
1257     is mainly useful for non-internationalized software.
1258
1259     An alternative is to use QLatin1Char.
1260
1261     \sa fromLatin1(), unicode()
1262 */
1263
1264 #ifndef QT_NO_DATASTREAM
1265 /*!
1266     \relates QChar
1267
1268     Writes the char \a chr to the stream \a out.
1269
1270     \sa {Serializing Qt Data Types}
1271 */
1272 QDataStream &operator<<(QDataStream &out, QChar chr)
1273 {
1274     out << quint16(chr.unicode());
1275     return out;
1276 }
1277
1278 /*!
1279     \relates QChar
1280
1281     Reads a char from the stream \a in into char \a chr.
1282
1283     \sa {Serializing Qt Data Types}
1284 */
1285 QDataStream &operator>>(QDataStream &in, QChar &chr)
1286 {
1287     quint16 u;
1288     in >> u;
1289     chr.unicode() = ushort(u);
1290     return in;
1291 }
1292 #endif // QT_NO_DATASTREAM
1293
1294 /*!
1295     \fn ushort & QChar::unicode()
1296
1297     Returns a reference to the numeric Unicode value of the QChar.
1298 */
1299
1300 /*!
1301     \fn ushort QChar::unicode() const
1302
1303     \overload
1304 */
1305
1306 /*****************************************************************************
1307   Documentation of QChar related functions
1308  *****************************************************************************/
1309
1310 /*!
1311     \fn bool operator==(QChar c1, QChar c2)
1312
1313     \relates QChar
1314
1315     Returns true if \a c1 and \a c2 are the same Unicode character;
1316     otherwise returns false.
1317 */
1318
1319 /*!
1320     \fn int operator!=(QChar c1, QChar c2)
1321
1322     \relates QChar
1323
1324     Returns true if \a c1 and \a c2 are not the same Unicode
1325     character; otherwise returns false.
1326 */
1327
1328 /*!
1329     \fn int operator<=(QChar c1, QChar c2)
1330
1331     \relates QChar
1332
1333     Returns true if the numeric Unicode value of \a c1 is less than
1334     or equal to that of \a c2; otherwise returns false.
1335 */
1336
1337 /*!
1338     \fn int operator>=(QChar c1, QChar c2)
1339
1340     \relates QChar
1341
1342     Returns true if the numeric Unicode value of \a c1 is greater than
1343     or equal to that of \a c2; otherwise returns false.
1344 */
1345
1346 /*!
1347     \fn int operator<(QChar c1, QChar c2)
1348
1349     \relates QChar
1350
1351     Returns true if the numeric Unicode value of \a c1 is less than
1352     that of \a c2; otherwise returns false.
1353 */
1354
1355 /*!
1356     \fn int operator>(QChar c1, QChar c2)
1357
1358     \relates QChar
1359
1360     Returns true if the numeric Unicode value of \a c1 is greater than
1361     that of \a c2; otherwise returns false.
1362 */
1363
1364
1365 // ---------------------------------------------------------------------------
1366
1367
1368 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
1369 {
1370     unsigned short buffer[3];
1371
1372     QString &s = *str;
1373
1374     const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1375     const unsigned short *uc = utf16 + s.length();
1376     while (uc != utf16 + from) {
1377         uint ucs4 = *(--uc);
1378         if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1379             ushort high = *(uc - 1);
1380             if (QChar(high).isHighSurrogate()) {
1381                 --uc;
1382                 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1383             }
1384         }
1385         const QChar::UnicodeVersion v = QChar::unicodeVersion(ucs4);
1386         if (v > version || v == QChar::Unicode_Unassigned)
1387             continue;
1388         int length;
1389         int tag;
1390         const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1391         if (!d || (canonical && tag != QChar::Canonical))
1392             continue;
1393
1394         int pos = uc - utf16;
1395         s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1396         // since the insert invalidates the pointers and we do decomposition recursive
1397         utf16 = reinterpret_cast<unsigned short *>(s.data());
1398         uc = utf16 + pos + length;
1399     }
1400 }
1401
1402
// Two UTF-16 code units. ligatureHelper() overlays an array of these on the
// ligature table, searches it by u1 and returns the u2 of the match, so the
// layout (two consecutive ushorts) must stay as it is.
struct UCS2Pair {
    ushort u1;
    ushort u2;
};

// Ordering between a bare code unit and a pair; only the pair's u1 member
// takes part in the comparison.
inline bool operator<(ushort u1, const UCS2Pair &ligature)
{
    return ligature.u1 > u1;
}
inline bool operator<(const UCS2Pair &ligature, ushort u1)
{
    return u1 > ligature.u1;
}
1412
1413 static ushort ligatureHelper(ushort u1, ushort u2)
1414 {
1415     // hangul L-V pair
1416     int LIndex = u1 - Hangul_LBase;
1417     if (0 <= LIndex && LIndex < Hangul_LCount) {
1418         int VIndex = u2 - Hangul_VBase;
1419         if (0 <= VIndex && VIndex < Hangul_VCount)
1420             return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1421     }
1422
1423     // hangul LV-T pair
1424     int SIndex = u1 - Hangul_SBase;
1425     if (0 <= SIndex && SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1426         int TIndex = u2 - Hangul_TBase;
1427         if (0 <= TIndex && TIndex <= Hangul_TCount)
1428             return u1 + TIndex;
1429     }
1430
1431     const unsigned short index = GET_LIGATURE_INDEX(u2);
1432     if (index == 0xffff)
1433         return 0;
1434     const unsigned short *ligatures = uc_ligature_map+index;
1435     ushort length = *ligatures++;
1436     {
1437         const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1438         const UCS2Pair *r = qBinaryFind(data, data + length, u1);
1439         if (r != data + length)
1440             return r->u2;
1441     }
1442
1443     return 0;
1444 }
1445
1446 static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
1447 {
1448     QString &s = *str;
1449
1450     if (s.length() - from < 2)
1451         return;
1452
1453     // the loop can partly ignore high Unicode as all ligatures are in the BMP
1454     int starter = 0;
1455     int lastCombining = 0;
1456     int pos = from;
1457     while (pos < s.length()) {
1458         uint uc = s.at(pos).unicode();
1459         if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1460             ushort low = s.at(pos+1).unicode();
1461             if (QChar(low).isLowSurrogate()) {
1462                 uc = QChar::surrogateToUcs4(uc, low);
1463                 ++pos;
1464             }
1465         }
1466         const QUnicodeTables::Properties *p = qGetProp(uc);
1467         if (p->unicodeVersion > version || p->unicodeVersion == QChar::Unicode_Unassigned) {
1468             starter = -1; // to prevent starter == pos - 1
1469             lastCombining = 0;
1470             ++pos;
1471             continue;
1472         }
1473         int combining = p->combiningClass;
1474         if (starter == pos - 1 || combining > lastCombining) {
1475             // allowed to form ligature with S
1476             QChar ligature = ligatureHelper(s.at(starter).unicode(), uc);
1477             if (ligature.unicode()) {
1478                 s[starter] = ligature;
1479                 s.remove(pos, 1);
1480                 continue;
1481             }
1482         }
1483         if (!combining)
1484             starter = pos;
1485         lastCombining = combining;
1486         ++pos;
1487     }
1488 }
1489
1490
// Reorders, in place, the code points of *str starting at index \a from so
// that a code point with a higher combining class never directly precedes
// one with a lower, non-zero combining class. Adjacent code points are
// compared pairwise and swapped when out of order. Code points whose Unicode
// version is newer than \a version, or that are unassigned, are treated as
// combining class 0.
static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
{
    QString &s = *str;
    const int l = s.length()-1; // index of the last code unit
    int pos = from;
    while (pos < l) {
        int p2 = pos+1;
        // u1: the code point starting at pos, joined from a surrogate pair if there is one
        uint u1 = s.at(pos).unicode();
        if (QChar(u1).isHighSurrogate()) {
            ushort low = s.at(p2).unicode();
            if (QChar(low).isLowSurrogate()) {
                u1 = QChar::surrogateToUcs4(u1, low);
                // the pair ends the string: there is nothing left to compare it with
                if (p2 >= l)
                    break;
                ++p2;
            }
        }
        // u2: the code point following u1; p2 ends up on its last code unit
        uint u2 = s.at(p2).unicode();
        if (QChar(u2).isHighSurrogate() && p2 < l) {
            ushort low = s.at(p2+1).unicode();
            if (QChar(low).isLowSurrogate()) {
                u2 = QChar::surrogateToUcs4(u2, low);
                ++p2;
            }
        }

        // combining class of u2; stays 0 if u2 is unknown to the requested version
        ushort c2 = 0;
        {
            const QUnicodeTables::Properties *p = qGetProp(u2);
            if (p->unicodeVersion <= version && p->unicodeVersion != QChar::Unicode_Unassigned)
                c2 = p->combiningClass;
        }
        // a class-0 character is never moved: continue after it
        if (c2 == 0) {
            pos = p2+1;
            continue;
        }

        // combining class of u1, determined the same way
        ushort c1 = 0;
        {
            const QUnicodeTables::Properties *p = qGetProp(u1);
            if (p->unicodeVersion <= version && p->unicodeVersion != QChar::Unicode_Unassigned)
                c1 = p->combiningClass;
        }

        if (c1 > c2) {
            QChar *uc = s.data();
            int p = pos;
            // exchange characters
            // (u2 is written first, then u1, each as one or two code units)
            if (!QChar::requiresSurrogates(u2)) {
                uc[p++] = u2;
            } else {
                uc[p++] = QChar::highSurrogate(u2);
                uc[p++] = QChar::lowSurrogate(u2);
            }
            if (!QChar::requiresSurrogates(u1)) {
                uc[p++] = u1;
            } else {
                uc[p++] = QChar::highSurrogate(u1);
                uc[p++] = QChar::lowSurrogate(u1);
            }
            // step back one code point (two code units when landing on a low
            // surrogate) so the character just moved forward is compared
            // with its new predecessor
            if (pos > 0)
                --pos;
            if (pos > 0 && s.at(pos).isLowSurrogate())
                --pos;
        } else {
            // already in order: advance past u1
            ++pos;
            if (QChar::requiresSurrogates(u1))
                ++pos;
        }
    }
}
1562
1563 QT_END_NAMESPACE