src/corelib/tools/qchar.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 // Don't define it while compiling this module, or USERS of Qt will
  43 // not be able to link.
  44 #ifdef QT_NO_CAST_FROM_ASCII
  45 #  undef QT_NO_CAST_FROM_ASCII
  46 #endif
  47 #ifdef QT_NO_CAST_TO_ASCII
  48 #  undef QT_NO_CAST_TO_ASCII
  49 #endif
  50 #include "qchar.h"
  51
  52 #include "qdatastream.h"
  53
  54 #include "qunicodetables_p.h"
  55 #include "qunicodetables.cpp"
  56
  57 QT_BEGIN_NAMESPACE
  58
  59 #define FLAG(x) (1 << (x))
  60
  61 /*!
  62     \class QLatin1Char
  63     \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
  64
  65     \ingroup string-processing
  66
   67     This class is only useful for constructing a QChar from an 8-bit character.
  68
  69     \sa QChar, QLatin1String, QString
  70 */
  71
  72 /*!
  73     \fn const char QLatin1Char::toLatin1() const
  74
  75     Converts a Latin-1 character to an 8-bit ASCII representation of the character.
  76 */
  77
  78 /*!
  79     \fn const ushort QLatin1Char::unicode() const
  80
   81     Converts a Latin-1 character to a 16-bit-encoded Unicode representation
  82     of the character.
  83 */
  84
  85 /*!
  86     \fn QLatin1Char::QLatin1Char(char c)
  87
  88     Constructs a Latin-1 character for \a c. This constructor should be
  89     used when the encoding of the input character is known to be Latin-1.
  90 */
  91
  92 /*!
  93     \class QChar
  94     \brief The QChar class provides a 16-bit Unicode character.
  95
  96     \ingroup string-processing
  97     \reentrant
  98
  99     In Qt, Unicode characters are 16-bit entities without any markup
 100     or structure. This class represents such an entity. It is
 101     lightweight, so it can be used everywhere. Most compilers treat
 102     it like a \c{unsigned short}.
 103
 104     QChar provides a full complement of testing/classification
 105     functions, converting to and from other formats, converting from
 106     composed to decomposed Unicode, and trying to compare and
 107     case-convert if you ask it to.
 108
 109     The classification functions include functions like those in the
 110     standard C++ header \<cctype\> (formerly \<ctype.h\>), but
 111     operating on the full range of Unicode characters, not just for the ASCII
 112     range. They all return true if the character is a certain type of character;
 113     otherwise they return false. These classification functions are
 114     isNull() (returns true if the character is '\\0'), isPrint()
 115     (true if the character is any sort of printable character,
  116     including whitespace), isPunct() (any sort of punctuation),
 117     isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
 118     sort of numeric character, not just 0-9), isLetterOrNumber(), and
 119     isDigit() (decimal digits). All of these are wrappers around
  120     category(), which returns the Unicode-defined category of each
 121     character. Some of these also calculate the derived properties
 122     (i.e. isSpace() returns true if the character is of category
 123     Separator_* or an exceptional code point from Other_Control category).
 124
 125     QChar also provides direction(), which indicates the "natural"
 126     writing direction of this character. The joining() function
  127     indicates how the character joins with its neighbors (needed
 128     mostly for Arabic) and finally hasMirrored(), which indicates
 129     whether the character needs to be mirrored when it is printed in
  130     its "unnatural" writing direction.
 131
 132     Composed Unicode characters (like \a ring) can be converted to
 133     decomposed Unicode ("a" followed by "ring above") by using decomposition().
 134
 135     In Unicode, comparison is not necessarily possible and case
 136     conversion is very difficult at best. Unicode, covering the
 137     "entire" world, also includes most of the world's case and
 138     sorting problems. operator==() and friends will do comparison
 139     based purely on the numeric Unicode value (code point) of the
 140     characters, and toUpper() and toLower() will do case changes when
 141     the character has a well-defined uppercase/lowercase equivalent.
 142     For locale-dependent comparisons, use QString::localeAwareCompare().
 143
 144     The conversion functions include unicode() (to a scalar),
 145     toLatin1() (to scalar, but converts all non-Latin-1 characters to
 146     0), row() (gives the Unicode row), cell() (gives the Unicode
 147     cell), digitValue() (gives the integer value of any of the
 148     numerous digit characters), and a host of constructors.
 149
 150     QChar provides constructors and cast operators that make it easy
 151     to convert to and from traditional 8-bit \c{char}s. If you
 152     defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
 153     explained in the QString documentation, you will need to
 154     explicitly call fromLatin1(), or use QLatin1Char,
 155     to construct a QChar from an 8-bit \c char, and you will need to
 156     call toLatin1() to get the 8-bit value back.
 157
 158     For more information see
 159     \l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
 160
 161     \sa Unicode, QString, QLatin1Char
 162 */
 163
 164 /*!
 165     \enum QChar::UnicodeVersion
 166
 167     Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
 168     introduced a certain character.
 169
 170     \value Unicode_1_1  Version 1.1
 171     \value Unicode_2_0  Version 2.0
 172     \value Unicode_2_1_2  Version 2.1.2
 173     \value Unicode_3_0  Version 3.0
 174     \value Unicode_3_1  Version 3.1
 175     \value Unicode_3_2  Version 3.2
 176     \value Unicode_4_0  Version 4.0
 177     \value Unicode_4_1  Version 4.1
 178     \value Unicode_5_0  Version 5.0
 179     \value Unicode_5_1  Version 5.1
 180     \value Unicode_5_2  Version 5.2
 181     \value Unicode_6_0  Version 6.0
 182     \value Unicode_6_1  Version 6.1
 183     \value Unicode_Unassigned  The value is not assigned to any character
 184         in version 6.1 of Unicode.
 185
 186     \sa unicodeVersion(), currentUnicodeVersion()
 187 */
 188
 189 /*!
 190     \enum QChar::Category
 191
 192     This enum maps the Unicode character categories.
 193
 194     The following characters are normative in Unicode:
 195
 196     \value Mark_NonSpacing  Unicode class name Mn
 197
 198     \value Mark_SpacingCombining  Unicode class name Mc
 199
 200     \value Mark_Enclosing  Unicode class name Me
 201
 202     \value Number_DecimalDigit  Unicode class name Nd
 203
 204     \value Number_Letter  Unicode class name Nl
 205
 206     \value Number_Other  Unicode class name No
 207
 208     \value Separator_Space  Unicode class name Zs
 209
 210     \value Separator_Line  Unicode class name Zl
 211
 212     \value Separator_Paragraph  Unicode class name Zp
 213
 214     \value Other_Control  Unicode class name Cc
 215
 216     \value Other_Format  Unicode class name Cf
 217
 218     \value Other_Surrogate  Unicode class name Cs
 219
 220     \value Other_PrivateUse  Unicode class name Co
 221
 222     \value Other_NotAssigned  Unicode class name Cn
 223
 224
 225     The following categories are informative in Unicode:
 226
 227     \value Letter_Uppercase  Unicode class name Lu
 228
 229     \value Letter_Lowercase  Unicode class name Ll
 230
 231     \value Letter_Titlecase  Unicode class name Lt
 232
 233     \value Letter_Modifier  Unicode class name Lm
 234
 235     \value Letter_Other Unicode class name Lo
 236
 237     \value Punctuation_Connector  Unicode class name Pc
 238
 239     \value Punctuation_Dash  Unicode class name Pd
 240
 241     \value Punctuation_Open  Unicode class name Ps
 242
 243     \value Punctuation_Close  Unicode class name Pe
 244
 245     \value Punctuation_InitialQuote  Unicode class name Pi
 246
 247     \value Punctuation_FinalQuote  Unicode class name Pf
 248
 249     \value Punctuation_Other  Unicode class name Po
 250
 251     \value Symbol_Math  Unicode class name Sm
 252
 253     \value Symbol_Currency  Unicode class name Sc
 254
 255     \value Symbol_Modifier  Unicode class name Sk
 256
 257     \value Symbol_Other  Unicode class name So
 258
 259     \sa category()
 260 */
 261
 262 /*!
 263     \enum QChar::Direction
 264
 265     This enum type defines the Unicode direction attributes. See the
 266     \l{http://www.unicode.org/}{Unicode Standard} for a description
 267     of the values.
 268
 269     In order to conform to C/C++ naming conventions "Dir" is prepended
 270     to the codes used in the Unicode Standard.
 271
 272     \value DirAL
 273     \value DirAN
 274     \value DirB
 275     \value DirBN
 276     \value DirCS
 277     \value DirEN
 278     \value DirES
 279     \value DirET
 280     \value DirL
 281     \value DirLRE
 282     \value DirLRO
 283     \value DirNSM
 284     \value DirON
 285     \value DirPDF
 286     \value DirR
 287     \value DirRLE
 288     \value DirRLO
 289     \value DirS
 290     \value DirWS
 291
 292     \sa direction()
 293 */
 294
 295 /*!
 296     \enum QChar::Decomposition
 297
 298     This enum type defines the Unicode decomposition attributes. See
 299     the \l{http://www.unicode.org/}{Unicode Standard} for a
 300     description of the values.
 301
 302     \value NoDecomposition
 303     \value Canonical
 304     \value Circle
 305     \value Compat
 306     \value Final
 307     \value Font
 308     \value Fraction
 309     \value Initial
 310     \value Isolated
 311     \value Medial
 312     \value Narrow
 313     \value NoBreak
 314     \value Small
 315     \value Square
 316     \value Sub
 317     \value Super
 318     \value Vertical
 319     \value Wide
 320
 321     \sa decomposition()
 322 */
 323
 324 /*!
 325     \enum QChar::Joining
 326
 327     This enum type defines the Unicode joining attributes. See the
 328     \l{http://www.unicode.org/}{Unicode Standard} for a description
 329     of the values.
 330
 331     \value Center
 332     \value Dual
 333     \value OtherJoining
 334     \value Right
 335
 336     \sa joining()
 337 */
 338
 339 /*!
 340     \enum QChar::CombiningClass
 341
 342     \internal
 343
 344     This enum type defines names for some of the Unicode combining
 345     classes. See the \l{http://www.unicode.org/}{Unicode Standard}
 346     for a description of the values.
 347
 348     \value Combining_Above
 349     \value Combining_AboveAttached
 350     \value Combining_AboveLeft
 351     \value Combining_AboveLeftAttached
 352     \value Combining_AboveRight
 353     \value Combining_AboveRightAttached
 354     \value Combining_Below
 355     \value Combining_BelowAttached
 356     \value Combining_BelowLeft
 357     \value Combining_BelowLeftAttached
 358     \value Combining_BelowRight
 359     \value Combining_BelowRightAttached
 360     \value Combining_DoubleAbove
 361     \value Combining_DoubleBelow
 362     \value Combining_IotaSubscript
 363     \value Combining_Left
 364     \value Combining_LeftAttached
 365     \value Combining_Right
 366     \value Combining_RightAttached
 367 */
 368
 369 /*!
 370     \enum QChar::SpecialCharacter
 371
 372     \value Null A QChar with this value isNull().
 373     \value Tabulation Character tabulation.
 374     \value LineFeed
 375     \value CarriageReturn
 376     \value Space
 377     \value Nbsp Non-breaking space.
 378     \value ReplacementCharacter The character shown when a font has no glyph
 379            for a certain codepoint. A special question mark character is often
 380            used. Codecs use this codepoint when input data cannot be
 381            represented in Unicode.
 382     \value ObjectReplacementCharacter Used to represent an object such as an
 383            image when such objects cannot be presented.
 384     \value ByteOrderMark
 385     \value ByteOrderSwapped
 386     \value ParagraphSeparator
 387     \value LineSeparator
 388     \value LastValidCodePoint
 389 */
 390
 391 /*!
 392     \fn void QChar::setCell(uchar cell)
 393     \internal
 394 */
 395
 396 /*!
 397     \fn void QChar::setRow(uchar row)
 398     \internal
 399 */
 400
 401 /*!
 402     \fn QChar::QChar()
 403
 404     Constructs a null QChar ('\\0').
 405
 406     \sa isNull()
 407 */
 408
 409 /*!
 410     \fn QChar::QChar(QLatin1Char ch)
 411
 412     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 413 */
 414
 415 /*!
 416     \fn QChar::QChar(SpecialCharacter ch)
 417
 418     Constructs a QChar for the predefined character value \a ch.
 419 */
 420
 421 /*!
 422     \fn QChar::QChar(char ch)
 423
 424     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 425 */
 426
 427 /*!
 428     \fn QChar::QChar(uchar ch)
 429
 430     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
 431 */
 432
 433 /*!
 434     \fn QChar::QChar(uchar cell, uchar row)
 435
 436     Constructs a QChar for Unicode cell \a cell in row \a row.
 437
 438     \sa cell(), row()
 439 */
 440
 441 /*!
 442     \fn QChar::QChar(ushort code)
 443
 444     Constructs a QChar for the character with Unicode code point \a code.
 445 */
 446
 447 /*!
 448     \fn QChar::QChar(short code)
 449
 450     Constructs a QChar for the character with Unicode code point \a code.
 451 */
 452
 453 /*!
 454     \fn QChar::QChar(uint code)
 455
 456     Constructs a QChar for the character with Unicode code point \a code.
 457 */
 458
 459 /*!
 460     \fn QChar::QChar(int code)
 461
 462     Constructs a QChar for the character with Unicode code point \a code.
 463 */
 464
 465 /*!
 466     \fn bool QChar::isNull() const
 467
 468     Returns true if the character is the Unicode character 0x0000
 469     ('\\0'); otherwise returns false.
 470 */
 471
 472 /*!
 473     \fn uchar QChar::cell() const
 474
 475     Returns the cell (least significant byte) of the Unicode character.
 476
 477     \sa row()
 478 */
 479
 480 /*!
 481     \fn uchar QChar::row() const
 482
 483     Returns the row (most significant byte) of the Unicode character.
 484
 485     \sa cell()
 486 */
 487
 488 /*!
 489     \fn bool QChar::isPrint() const
 490
 491     Returns true if the character is a printable character; otherwise
 492     returns false. This is any character not of category Other_*.
 493
 494     Note that this gives no indication of whether the character is
 495     available in a particular font.
 496 */
 497
 498 /*!
 499     \overload
 500     \since 5.0
 501
 502     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 503     a printable character; otherwise returns false.
 504     This is any character not of category Other_*.
 505
 506     Note that this gives no indication of whether the character is
 507     available in a particular font.
 508 */
 509 bool QChar::isPrint(uint ucs4)
 510 {
 511     if (ucs4 > LastValidCodePoint)
 512         return false;
 513     const int test = FLAG(Other_Control) |
 514                      FLAG(Other_Format) |
 515                      FLAG(Other_Surrogate) |
 516                      FLAG(Other_PrivateUse) |
 517                      FLAG(Other_NotAssigned);
 518     return !(FLAG(qGetProp(ucs4)->category) & test);
 519 }
 520
 521 /*!
 522     \fn bool QChar::isSpace() const
 523
 524     Returns true if the character is a separator character
 525     (Separator_* categories or certain code points from Other_Control category);
 526     otherwise returns false.
 527 */
 528
 529 /*!
 530     \fn bool QChar::isSpace(uint ucs4)
 531     \overload
 532     \since 5.0
 533
 534     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 535     a separator character (Separator_* categories or certain code points
 536     from Other_Control category); otherwise returns false.
 537 */
 538
 539 /*!
 540     \internal
 541 */
 542 bool QT_FASTCALL QChar::isSpace_helper(uint ucs4)
 543 {
 544     if (ucs4 > LastValidCodePoint)
 545         return false;
 546     const int test = FLAG(Separator_Space) |
 547                      FLAG(Separator_Line) |
 548                      FLAG(Separator_Paragraph);
 549     return FLAG(qGetProp(ucs4)->category) & test;
 550 }
 551
 552 /*!
 553     \fn bool QChar::isMark() const
 554
 555     Returns true if the character is a mark (Mark_* categories);
 556     otherwise returns false.
 557
 558     See QChar::Category for more information regarding marks.
 559 */
 560
 561 /*!
 562     \overload
 563     \since 5.0
 564
 565     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 566     a mark (Mark_* categories); otherwise returns false.
 567 */
 568 bool QChar::isMark(uint ucs4)
 569 {
 570     if (ucs4 > LastValidCodePoint)
 571         return false;
 572     const int test = FLAG(Mark_NonSpacing) |
 573                      FLAG(Mark_SpacingCombining) |
 574                      FLAG(Mark_Enclosing);
 575     return FLAG(qGetProp(ucs4)->category) & test;
 576 }
 577
 578 /*!
 579     \fn bool QChar::isPunct() const
 580
 581     Returns true if the character is a punctuation mark (Punctuation_*
 582     categories); otherwise returns false.
 583 */
 584
 585 /*!
 586     \overload
 587     \since 5.0
 588
 589     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 590     a punctuation mark (Punctuation_* categories); otherwise returns false.
 591 */
 592 bool QChar::isPunct(uint ucs4)
 593 {
 594     if (ucs4 > LastValidCodePoint)
 595         return false;
 596     const int test = FLAG(Punctuation_Connector) |
 597                      FLAG(Punctuation_Dash) |
 598                      FLAG(Punctuation_Open) |
 599                      FLAG(Punctuation_Close) |
 600                      FLAG(Punctuation_InitialQuote) |
 601                      FLAG(Punctuation_FinalQuote) |
 602                      FLAG(Punctuation_Other);
 603     return FLAG(qGetProp(ucs4)->category) & test;
 604 }
 605
 606 /*!
 607     \fn bool QChar::isSymbol() const
 608
 609     Returns true if the character is a symbol (Symbol_* categories);
 610     otherwise returns false.
 611 */
 612
 613 /*!
 614     \overload
 615     \since 5.0
 616
 617     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 618     a symbol (Symbol_* categories); otherwise returns false.
 619 */
 620 bool QChar::isSymbol(uint ucs4)
 621 {
 622     if (ucs4 > LastValidCodePoint)
 623         return false;
 624     const int test = FLAG(Symbol_Math) |
 625                      FLAG(Symbol_Currency) |
 626                      FLAG(Symbol_Modifier) |
 627                      FLAG(Symbol_Other);
 628     return FLAG(qGetProp(ucs4)->category) & test;
 629 }
 630
 631 /*!
 632     \fn bool QChar::isLetter() const
 633
 634     Returns true if the character is a letter (Letter_* categories);
 635     otherwise returns false.
 636 */
 637
 638 /*!
 639     \fn bool QChar::isLetter(uint ucs4)
 640     \overload
 641     \since 5.0
 642
 643     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 644     a letter (Letter_* categories); otherwise returns false.
 645 */
 646
 647 /*!
 648     \internal
 649 */
 650 bool QT_FASTCALL QChar::isLetter_helper(uint ucs4)
 651 {
 652     if (ucs4 > LastValidCodePoint)
 653         return false;
 654     const int test = FLAG(Letter_Uppercase) |
 655                      FLAG(Letter_Lowercase) |
 656                      FLAG(Letter_Titlecase) |
 657                      FLAG(Letter_Modifier) |
 658                      FLAG(Letter_Other);
 659     return FLAG(qGetProp(ucs4)->category) & test;
 660 }
 661
 662 /*!
 663     \fn bool QChar::isNumber() const
 664
 665     Returns true if the character is a number (Number_* categories,
 666     not just 0-9); otherwise returns false.
 667
 668     \sa isDigit()
 669 */
 670
 671 /*!
 672     \fn bool QChar::isNumber(uint ucs4)
 673     \overload
 674     \since 5.0
 675
 676     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 677     a number (Number_* categories, not just 0-9); otherwise returns false.
 678
 679     \sa isDigit()
 680 */
 681
 682 /*!
 683     \internal
 684 */
 685 bool QT_FASTCALL QChar::isNumber_helper(uint ucs4)
 686 {
 687     if (ucs4 > LastValidCodePoint)
 688         return false;
 689     const int test = FLAG(Number_DecimalDigit) |
 690                      FLAG(Number_Letter) |
 691                      FLAG(Number_Other);
 692     return FLAG(qGetProp(ucs4)->category) & test;
 693 }
 694
 695 /*!
 696     \fn bool QChar::isLetterOrNumber() const
 697
 698     Returns true if the character is a letter or number (Letter_* or
 699     Number_* categories); otherwise returns false.
 700 */
 701
 702 /*!
 703     \fn bool QChar::isLetterOrNumber(uint ucs4)
 704     \overload
 705     \since 5.0
 706
 707     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 708     a letter or number (Letter_* or Number_* categories); otherwise returns false.
 709 */
 710
 711 /*!
 712     \internal
 713 */
 714 bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4)
 715 {
 716     if (ucs4 > LastValidCodePoint)
 717         return false;
 718     const int test = FLAG(Letter_Uppercase) |
 719                      FLAG(Letter_Lowercase) |
 720                      FLAG(Letter_Titlecase) |
 721                      FLAG(Letter_Modifier) |
 722                      FLAG(Letter_Other) |
 723                      FLAG(Number_DecimalDigit) |
 724                      FLAG(Number_Letter) |
 725                      FLAG(Number_Other);
 726     return FLAG(qGetProp(ucs4)->category) & test;
 727 }
 728
 729 /*!
 730     \fn bool QChar::isDigit() const
 731
 732     Returns true if the character is a decimal digit
 733     (Number_DecimalDigit); otherwise returns false.
 734
 735     \sa isNumber()
 736 */
 737
 738 /*!
 739     \fn bool QChar::isDigit(uint ucs4)
 740     \overload
 741     \since 5.0
 742
 743     Returns true if the UCS-4-encoded character specified by \a ucs4 is
 744     a decimal digit (Number_DecimalDigit); otherwise returns false.
 745
 746     \sa isNumber()
 747 */
 748
 749 /*!
 750     \fn bool QChar::isNonCharacter() const
 751     \since 5.0
 752
 753     Returns true if the QChar is a non-character; false otherwise.
 754
 755     Unicode has a certain number of code points that are classified
 756     as "non-characters:" that is, they can be used for internal purposes
 757     in applications but cannot be used for text interchange.
  758     Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
 759     [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
 760 */
 761
 762 /*!
 763     \fn bool QChar::isHighSurrogate() const
 764
 765     Returns true if the QChar is the high part of a UTF16 surrogate
 766     (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise.
 767 */
 768
 769 /*!
 770     \fn bool QChar::isLowSurrogate() const
 771
 772     Returns true if the QChar is the low part of a UTF16 surrogate
 773     (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise.
 774 */
 775
 776 /*!
 777     \fn bool QChar::isSurrogate() const
 778     \since 5.0
 779
 780     Returns true if the QChar contains a code point that is in either
 781     the high or the low part of the UTF-16 surrogate range
 782     (i.e. if its code point is in range [0xd800..0xdfff]); false otherwise.
 783 */
 784
 785 /*!
  786     \fn static bool QChar::isNonCharacter(uint ucs4)
 787     \overload
 788     \since 5.0
 789
 790     Returns true if the UCS-4-encoded character specified by \a ucs4
 791     is a non-character; false otherwise.
 792
 793     Unicode has a certain number of code points that are classified
 794     as "non-characters:" that is, they can be used for internal purposes
 795     in applications but cannot be used for text interchange.
  796     Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
 797     [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
 798 */
 799
 800 /*!
 801     \fn static bool QChar::isHighSurrogate(uint ucs4)
 802     \overload
 803
 804     Returns true if the UCS-4-encoded character specified by \a ucs4
 805     is the high part of a UTF16 surrogate
 806     (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise.
 807 */
 808
 809 /*!
 810     \fn static bool QChar::isLowSurrogate(uint ucs4)
 811     \overload
 812
 813     Returns true if the UCS-4-encoded character specified by \a ucs4
 814     is the low part of a UTF16 surrogate
 815     (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise.
 816 */
 817
 818 /*!
 819     \fn static bool QChar::isSurrogate(uint ucs4)
 820     \overload
 821     \since 5.0
 822
 823     Returns true if the UCS-4-encoded character specified by \a ucs4
 824     contains a code point that is in either the high or the low part of the
 825     UTF-16 surrogate range (i.e. if its code point is in range [0xd800..0xdfff]);
 826     false otherwise.
 827 */
 828
 829 /*!
 830     \fn static bool QChar::requiresSurrogates(uint ucs4)
 831
 832     Returns true if the UCS-4-encoded character specified by \a ucs4
 833     can be split into the high and low parts of a UTF16 surrogate
  834     (i.e. if its code point is greater than or equal to 0x10000);
 835     false otherwise.
 836 */
 837
 838 /*!
 839     \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
 840
 841     Converts a UTF16 surrogate pair with the given \a high and \a low values
  842     to its UCS-4-encoded code point.
 843 */
 844
 845 /*!
 846     \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
 847     \overload
 848
  849     Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
 850 */
 851
 852 /*!
 853     \fn static ushort QChar::highSurrogate(uint ucs4)
 854
 855     Returns the high surrogate part of a UCS-4-encoded code point.
 856     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 857 */
 858
 859 /*!
 860     \fn static ushort QChar::lowSurrogate(uint ucs4)
 861
 862     Returns the low surrogate part of a UCS-4-encoded code point.
 863     The returned result is undefined if \a ucs4 is smaller than 0x10000.
 864 */
 865
 866 /*!
 867     \fn int QChar::digitValue() const
 868
 869     Returns the numeric value of the digit, or -1 if the character is not a digit.
 870 */
 871
 872 /*!
 873     \overload
 874     Returns the numeric value of the digit specified by the UCS-4-encoded
 875     character, \a ucs4, or -1 if the character is not a digit.
 876 */
 877 int QChar::digitValue(uint ucs4)
 878 {
 879     if (ucs4 > LastValidCodePoint)
 880         return -1;
 881     return qGetProp(ucs4)->digitValue;
 882 }
 883
 884 /*!
 885     \fn QChar::Category QChar::category() const
 886
 887     Returns the character's category.
 888 */
 889
 890 /*!
 891     \overload
 892     Returns the category of the UCS-4-encoded character specified by \a ucs4.
 893 */
 894 QChar::Category QChar::category(uint ucs4)
 895 {
 896     if (ucs4 > LastValidCodePoint)
 897         return QChar::Other_NotAssigned;
 898     return (QChar::Category) qGetProp(ucs4)->category;
 899 }
 900
 901 /*!
 902     \fn QChar::Direction QChar::direction() const
 903
 904     Returns the character's direction.
 905 */
 906
 907 /*!
 908     \overload
 909     Returns the direction of the UCS-4-encoded character specified by \a ucs4.
 910 */
 911 QChar::Direction QChar::direction(uint ucs4)
 912 {
 913     if (ucs4 > LastValidCodePoint)
 914         return QChar::DirL;
 915     return (QChar::Direction) qGetProp(ucs4)->direction;
 916 }
 917
 918 /*!
 919     \fn QChar::Joining QChar::joining() const
 920
 921     Returns information about the joining properties of the character
 922     (needed for certain languages such as Arabic).
 923 */
 924
 925 /*!
 926     \overload
 927     Returns information about the joining properties of the UCS-4-encoded
 928     character specified by \a ucs4 (needed for certain languages such as Arabic).
 929 */
 930 QChar::Joining QChar::joining(uint ucs4)
 931 {
 932     if (ucs4 > LastValidCodePoint)
 933         return QChar::OtherJoining;
 934     return (QChar::Joining) qGetProp(ucs4)->joining;
 935 }
 936
 937 /*!
 938     \fn bool QChar::hasMirrored() const
 939
 940     Returns true if the character should be reversed if the text
 941     direction is reversed; otherwise returns false.
 942
 943     A bit faster equivalent of (ch.mirroredChar() != ch).
 944
 945     \sa mirroredChar()
 946 */
 947
 948 /*!
 949     \overload
 950     \since 5.0
 951
 952     Returns true if the UCS-4-encoded character specified by \a ucs4
 953     should be reversed if the text direction is reversed; otherwise returns false.
 954
 955     A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
 956
 957     \sa mirroredChar()
 958 */
 959 bool QChar::hasMirrored(uint ucs4)
 960 {
 961     if (ucs4 > LastValidCodePoint)
 962         return false;
 963     return qGetProp(ucs4)->mirrorDiff != 0;
 964 }
 965
 966 /*!
 967     \fn bool QChar::isLower() const
 968
 969     Returns true if the character is a lowercase letter, i.e.
 970     category() is Letter_Lowercase.
 971
 972     \sa isUpper(), toLower(), toUpper()
 973 */
 974
 975 /*!
 976     \fn bool QChar::isUpper() const
 977
 978     Returns true if the character is an uppercase letter, i.e.
 979     category() is Letter_Uppercase.
 980
 981     \sa isLower(), toUpper(), toLower()
 982 */
 983
 984 /*!
 985     \fn bool QChar::isTitleCase() const
 986
 987     Returns true if the character is a titlecase letter, i.e.
 988     category() is Letter_Titlecase.
 989
 990     \sa isLower(), toUpper(), toLower(), toTitleCase()
 991 */
 992
 993 /*!
 994     \fn QChar QChar::mirroredChar() const
 995
 996     Returns the mirrored character if this character is a mirrored
 997     character; otherwise returns the character itself.
 998
 999     \sa hasMirrored()
1000 */
1001
1002 /*!
1003     \overload
1004     Returns the mirrored character if the UCS-4-encoded character specified
1005     by \a ucs4 is a mirrored character; otherwise returns the character itself.
1006
1007     \sa hasMirrored()
1008 */
1009 uint QChar::mirroredChar(uint ucs4)
1010 {
1011     if (ucs4 > LastValidCodePoint)
1012         return ucs4;
1013     return ucs4 + qGetProp(ucs4)->mirrorDiff;
1014 }
1015
1016
// Constants for the arithmetic Hangul syllable (de)composition, see UAX #15.
// S = precomposed syllable, L / V / T = the three parts a syllable is split
// into by decompositionHelper().
enum {
    // first code point of each range; a T index of 0 (Hangul_TBase itself)
    // stands for "no T part"
    Hangul_LBase = 0x1100,
    Hangul_VBase = 0x1161,
    Hangul_TBase = 0x11A7,
    Hangul_SBase = 0xAC00,

    // number of values per part
    Hangul_LCount = 19,
    Hangul_VCount = 21,
    Hangul_TCount = 28,

    // derived sizes: syllables per L, and syllables in total
    Hangul_NCount = Hangul_VCount * Hangul_TCount,
    Hangul_SCount = Hangul_LCount * Hangul_NCount
};
1029
1030 // buffer has to have a length of 3. It's needed for Hangul decomposition
1031 static const unsigned short * QT_FASTCALL decompositionHelper
1032     (uint ucs4, int *length, int *tag, unsigned short *buffer)
1033 {
1034     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1035         // compute Hangul syllable decomposition as per UAX #15
1036         const uint SIndex = ucs4 - Hangul_SBase;
1037         buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
1038         buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1039         buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
1040         *length = buffer[2] == Hangul_TBase ? 2 : 3;
1041         *tag = QChar::Canonical;
1042         return buffer;
1043     }
1044
1045     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1046     if (index == 0xffff) {
1047         *length = 0;
1048         *tag = QChar::NoDecomposition;
1049         return 0;
1050     }
1051
1052     const unsigned short *decomposition = uc_decomposition_map+index;
1053     *tag = (*decomposition) & 0xff;
1054     *length = (*decomposition) >> 8;
1055     return decomposition+1;
1056 }
1057
/*!
    Decomposes a character into its constituent parts. Returns an empty string
    if no decomposition exists.
*/
QString QChar::decomposition() const
{
    // forward this character's value (ucs) to the static UCS-4 overload
    return QChar::decomposition(ucs);
}
1066
1067 /*!
1068     \overload
1069     Decomposes the UCS-4-encoded character specified by \a ucs4 into it's
1070     constituent parts. Returns an empty string if no decomposition exists.
1071 */
1072 QString QChar::decomposition(uint ucs4)
1073 {
1074     unsigned short buffer[3];
1075     int length;
1076     int tag;
1077     const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1078     return QString(reinterpret_cast<const QChar *>(d), length);
1079 }
1080
1081 /*!
1082     \fn QChar::Decomposition QChar::decompositionTag() const
1083
    Returns the tag defining the decomposition of the character. Returns
    QChar::NoDecomposition if no decomposition exists.
1086 */
1087
1088 /*!
1089     \overload
    Returns the tag defining the decomposition of the UCS-4-encoded character
    specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1092 */
1093 QChar::Decomposition QChar::decompositionTag(uint ucs4)
1094 {
1095     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1096         return QChar::Canonical;
1097     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1098     if (index == 0xffff)
1099         return QChar::NoDecomposition;
1100     return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1101 }
1102
1103 /*!
1104     \fn unsigned char QChar::combiningClass() const
1105
1106     Returns the combining class for the character as defined in the
1107     Unicode standard. This is mainly useful as a positioning hint for
1108     marks attached to a base character.
1109
1110     The Qt text rendering engine uses this information to correctly
1111     position non-spacing marks around a base character.
1112 */
1113
1114 /*!
1115     \overload
1116     Returns the combining class for the UCS-4-encoded character specified by
1117     \a ucs4, as defined in the Unicode standard.
1118 */
1119 unsigned char QChar::combiningClass(uint ucs4)
1120 {
1121     if (ucs4 > LastValidCodePoint)
1122         return 0;
1123     return (unsigned char) qGetProp(ucs4)->combiningClass;
1124 }
1125
1126 /*!
1127     \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1128
1129     Returns the Unicode version that introduced this character.
1130 */
1131
1132 /*!
1133     \overload
1134     Returns the Unicode version that introduced the character specified in
1135     its UCS-4-encoded form as \a ucs4.
1136 */
1137 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
1138 {
1139     if (ucs4 > LastValidCodePoint)
1140         return QChar::Unicode_Unassigned;
1141     return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1142 }
1143
1144 /*!
1145     Returns the most recent supported Unicode version.
1146 */
QChar::UnicodeVersion QChar::currentUnicodeVersion()
{
    // UNICODE_DATA_VERSION is defined outside this part of the file
    // (presumably alongside the Unicode property tables; verify there).
    return UNICODE_DATA_VERSION;
}
1151
1152
1153 template <typename T>
1154 static inline T toLowerCase_helper(T uc)
1155 {
1156     const QUnicodeTables::Properties *p = qGetProp(uc);
1157     if (p->lowerCaseSpecial) {
1158         const ushort *specialCase = specialCaseMap + p->lowerCaseDiff;
1159         return (*specialCase == 1) ? specialCase[1] : uc;
1160     }
1161     return uc + p->lowerCaseDiff;
1162 }
1163
1164 template <typename T>
1165 static inline T toUpperCase_helper(T uc)
1166 {
1167     const QUnicodeTables::Properties *p = qGetProp(uc);
1168     if (p->upperCaseSpecial) {
1169         const ushort *specialCase = specialCaseMap + p->upperCaseDiff;
1170         return (*specialCase == 1) ? specialCase[1] : uc;
1171     }
1172     return uc + p->upperCaseDiff;
1173 }
1174
1175 template <typename T>
1176 static inline T toTitleCase_helper(T uc)
1177 {
1178     const QUnicodeTables::Properties *p = qGetProp(uc);
1179     if (p->titleCaseSpecial) {
1180         const ushort *specialCase = specialCaseMap + p->titleCaseDiff;
1181         return (*specialCase == 1) ? specialCase[1] : uc;
1182     }
1183     return uc + p->titleCaseDiff;
1184 }
1185
1186 template <typename T>
1187 static inline T toCaseFolded_helper(T uc)
1188 {
1189     const QUnicodeTables::Properties *p = qGetProp(uc);
1190     if (p->caseFoldSpecial) {
1191         const ushort *specialCase = specialCaseMap + p->caseFoldDiff;
1192         return (*specialCase == 1) ? specialCase[1] : uc;
1193     }
1194     return uc + p->caseFoldDiff;
1195 }
1196
1197 /*!
1198     \fn QChar QChar::toLower() const
1199
1200     Returns the lowercase equivalent if the character is uppercase or titlecase;
1201     otherwise returns the character itself.
1202 */
1203
1204 /*!
1205     \overload
1206     Returns the lowercase equivalent of the UCS-4-encoded character specified
1207     by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1208     the character itself.
1209 */
1210 uint QChar::toLower(uint ucs4)
1211 {
1212     if (ucs4 > LastValidCodePoint)
1213         return ucs4;
1214     return toLowerCase_helper<uint>(ucs4);
1215 }
1216
1217 /*!
1218     \fn QChar QChar::toUpper() const
1219
1220     Returns the uppercase equivalent if the character is lowercase or titlecase;
1221     otherwise returns the character itself.
1222 */
1223
1224 /*!
1225     \overload
1226     Returns the uppercase equivalent of the UCS-4-encoded character specified
1227     by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1228     the character itself.
1229 */
1230 uint QChar::toUpper(uint ucs4)
1231 {
1232     if (ucs4 > LastValidCodePoint)
1233         return ucs4;
1234     return toUpperCase_helper<uint>(ucs4);
1235 }
1236
1237 /*!
1238     \fn QChar QChar::toTitleCase() const
1239
1240     Returns the title case equivalent if the character is lowercase or uppercase;
1241     otherwise returns the character itself.
1242 */
1243
1244 /*!
1245     \overload
1246     Returns the title case equivalent of the UCS-4-encoded character specified
1247     by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1248     the character itself.
1249 */
1250 uint QChar::toTitleCase(uint ucs4)
1251 {
1252     if (ucs4 > LastValidCodePoint)
1253         return ucs4;
1254     return toTitleCase_helper<uint>(ucs4);
1255 }
1256
1257 static inline uint foldCase(const ushort *ch, const ushort *start)
1258 {
1259     uint c = *ch;
1260     if (QChar(c).isLowSurrogate() && ch > start && QChar(*(ch - 1)).isHighSurrogate())
1261         c = QChar::surrogateToUcs4(*(ch - 1), c);
1262     return toCaseFolded_helper<uint>(c);
1263 }
1264
1265 static inline uint foldCase(uint ch, uint &last)
1266 {
1267     uint c = ch;
1268     if (QChar(c).isLowSurrogate() && QChar(last).isHighSurrogate())
1269         c = QChar::surrogateToUcs4(last, c);
1270     last = ch;
1271     return toCaseFolded_helper<uint>(c);
1272 }
1273
// Case-folds a single UTF-16 code unit on its own; unlike the other foldCase()
// overloads, no surrogate pair is assembled here.
static inline ushort foldCase(ushort ch)
{
    return toCaseFolded_helper<ushort>(ch);
}
1278
1279 /*!
1280     \fn QChar QChar::toCaseFolded() const
1281
1282     Returns the case folded equivalent of the character.
    For most Unicode characters this is the same as toLower().
1284 */
1285
1286 /*!
1287     \overload
1288     Returns the case folded equivalent of the UCS-4-encoded character specified
1289     by \a ucs4. For most Unicode characters this is the same as toLowerCase().
1290 */
1291 uint QChar::toCaseFolded(uint ucs4)
1292 {
1293     if (ucs4 > LastValidCodePoint)
1294         return ucs4;
1295     return toCaseFolded_helper<uint>(ucs4);
1296 }
1297
1298 /*!
1299     \fn char QChar::toLatin1() const
1300
1301     Returns the Latin-1 character equivalent to the QChar, or 0. This
1302     is mainly useful for non-internationalized software.
1303
1304     \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1305     (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1306
1307     \sa unicode()
1308 */
1309
1310 /*!
    \fn QChar QChar::fromLatin1(char c)
1312
1313     Converts the Latin-1 character \a c to its equivalent QChar. This
1314     is mainly useful for non-internationalized software.
1315
1316     An alternative is to use QLatin1Char.
1317
1318     \sa toLatin1(), unicode()
1319 */
1320
1321 /*!
1322     \fn char QChar::toAscii() const
1323     \deprecated
1324
1325     Returns the Latin-1 character value of the QChar, or 0 if the character is not
1326     representable.
1327
1328     The main purpose of this function is to preserve ASCII characters used
1329     in C strings. This is mainly useful for developers of non-internationalized
1330     software.
1331
    \note It is not possible to distinguish a non-Latin-1 character from an ASCII 0
    (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1334
1335     \note This function does not check whether the character value is inside
1336     the valid range of US-ASCII.
1337
1338     \sa toLatin1(), unicode()
1339 */
1340
1341 /*!
    \fn QChar QChar::fromAscii(char c)
1343     \deprecated
1344
    Converts the ASCII character \a c to its equivalent QChar. This
1346     is mainly useful for non-internationalized software.
1347
1348     An alternative is to use QLatin1Char.
1349
1350     \sa fromLatin1(), unicode()
1351 */
1352
1353 #ifndef QT_NO_DATASTREAM
1354 /*!
1355     \relates QChar
1356
1357     Writes the char \a chr to the stream \a out.
1358
1359     \sa {Serializing Qt Data Types}
1360 */
1361 QDataStream &operator<<(QDataStream &out, QChar chr)
1362 {
1363     out << quint16(chr.unicode());
1364     return out;
1365 }
1366
1367 /*!
1368     \relates QChar
1369
1370     Reads a char from the stream \a in into char \a chr.
1371
1372     \sa {Serializing Qt Data Types}
1373 */
1374 QDataStream &operator>>(QDataStream &in, QChar &chr)
1375 {
1376     quint16 u;
1377     in >> u;
1378     chr.unicode() = ushort(u);
1379     return in;
1380 }
1381 #endif // QT_NO_DATASTREAM
1382
1383 /*!
1384     \fn ushort & QChar::unicode()
1385
1386     Returns a reference to the numeric Unicode value of the QChar.
1387 */
1388
1389 /*!
1390     \fn ushort QChar::unicode() const
1391
1392     Returns the numeric Unicode value of the QChar.
1393 */
1394
1395 /*****************************************************************************
1396   Documentation of QChar related functions
1397  *****************************************************************************/
1398
1399 /*!
1400     \fn bool operator==(QChar c1, QChar c2)
1401
1402     \relates QChar
1403
1404     Returns true if \a c1 and \a c2 are the same Unicode character;
1405     otherwise returns false.
1406 */
1407
1408 /*!
1409     \fn int operator!=(QChar c1, QChar c2)
1410
1411     \relates QChar
1412
1413     Returns true if \a c1 and \a c2 are not the same Unicode
1414     character; otherwise returns false.
1415 */
1416
1417 /*!
1418     \fn int operator<=(QChar c1, QChar c2)
1419
1420     \relates QChar
1421
1422     Returns true if the numeric Unicode value of \a c1 is less than
1423     or equal to that of \a c2; otherwise returns false.
1424 */
1425
1426 /*!
1427     \fn int operator>=(QChar c1, QChar c2)
1428
1429     \relates QChar
1430
1431     Returns true if the numeric Unicode value of \a c1 is greater than
1432     or equal to that of \a c2; otherwise returns false.
1433 */
1434
1435 /*!
1436     \fn int operator<(QChar c1, QChar c2)
1437
1438     \relates QChar
1439
1440     Returns true if the numeric Unicode value of \a c1 is less than
1441     that of \a c2; otherwise returns false.
1442 */
1443
1444 /*!
1445     \fn int operator>(QChar c1, QChar c2)
1446
1447     \relates QChar
1448
1449     Returns true if the numeric Unicode value of \a c1 is greater than
1450     that of \a c2; otherwise returns false.
1451 */
1452
1453
1454 // ---------------------------------------------------------------------------
1455
1456
1457 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
1458 {
1459     int length;
1460     int tag;
1461     unsigned short buffer[3];
1462
1463     QString &s = *str;
1464
1465     const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1466     const unsigned short *uc = utf16 + s.length();
1467     while (uc != utf16 + from) {
1468         uint ucs4 = *(--uc);
1469         if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1470             ushort high = *(uc - 1);
1471             if (QChar(high).isHighSurrogate()) {
1472                 --uc;
1473                 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1474             }
1475         }
1476
1477         if (QChar::unicodeVersion(ucs4) > version)
1478             continue;
1479
1480         const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1481         if (!d || (canonical && tag != QChar::Canonical))
1482             continue;
1483
1484         int pos = uc - utf16;
1485         s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1486         // since the replace invalidates the pointers and we do decomposition recursive
1487         utf16 = reinterpret_cast<unsigned short *>(s.data());
1488         uc = utf16 + pos + length;
1489     }
1490 }
1491
1492
// One entry of a ligature list for characters that fit in a single UTF-16
// code unit. ligatureHelper() views the data in uc_ligature_map as an array
// of these, searches it by u1 and returns u2 on a match, so the member
// layout must not be changed.
struct UCS2Pair {
    ushort u1; // search key: the first character of the pair to compose
    ushort u2; // value returned by ligatureHelper() when u1 matches
};

// Orderings between a plain code unit and an entry (by the entry's u1), as
// used by the binary search in ligatureHelper().
inline bool operator<(ushort u1, const UCS2Pair &ligature)
{ return u1 < ligature.u1; }
inline bool operator<(const UCS2Pair &ligature, ushort u1)
{ return ligature.u1 < u1; }
1502
// One entry of a ligature list for characters that need a surrogate pair:
// both the key (p1) and the result (p2) are stored as two UTF-16 code units
// (u1 = high surrogate, u2 = low surrogate, as they are passed to
// QChar::surrogateToUcs4()). ligatureHelper() views the data in
// uc_ligature_map as an array of these, so the member layout must not be
// changed.
struct UCS2SurrogatePair {
    UCS2Pair p1; // search key: the first character of the pair to compose
    UCS2Pair p2; // value returned by ligatureHelper() when p1 matches
};

// Orderings between a UCS-4 code point and an entry (by the code point encoded
// in the entry's p1), as used by the binary search in ligatureHelper().
inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
{ return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); }
inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
{ return QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; }
1512
1513 static uint inline ligatureHelper(uint u1, uint u2)
1514 {
1515     if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) {
1516         // compute Hangul syllable composition as per UAX #15
1517         // hangul L-V pair
1518         const uint LIndex = u1 - Hangul_LBase;
1519         if (LIndex < Hangul_LCount) {
1520             const uint VIndex = u2 - Hangul_VBase;
1521             if (VIndex < Hangul_VCount)
1522                 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1523         }
1524         // hangul LV-T pair
1525         const uint SIndex = u1 - Hangul_SBase;
1526         if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1527             const uint TIndex = u2 - Hangul_TBase;
1528             if (TIndex <= Hangul_TCount)
1529                 return u1 + TIndex;
1530         }
1531     }
1532
1533     const unsigned short index = GET_LIGATURE_INDEX(u2);
1534     if (index == 0xffff)
1535         return 0;
1536     const unsigned short *ligatures = uc_ligature_map+index;
1537     ushort length = *ligatures++;
1538     if (QChar::requiresSurrogates(u1)) {
1539         const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1540         const UCS2SurrogatePair *r = qBinaryFind(data, data + length, u1);
1541         if (r != data + length)
1542             return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2);
1543     } else {
1544         const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1545         const UCS2Pair *r = qBinaryFind(data, data + length, ushort(u1));
1546         if (r != data + length)
1547             return r->u2;
1548     }
1549
1550     return 0;
1551 }
1552
1553 static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
1554 {
1555     QString &s = *str;
1556
1557     if (from < 0 || s.length() - from < 2)
1558         return;
1559
1560     int starter = 0; // starter position
1561     uint stcode = 0; // starter code point
1562     int next = -1;
1563     int lastCombining = 0;
1564
1565     int pos = from;
1566     while (pos < s.length()) {
1567         int i = pos;
1568         uint uc = s.at(pos).unicode();
1569         if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1570             ushort low = s.at(pos+1).unicode();
1571             if (QChar(low).isLowSurrogate()) {
1572                 uc = QChar::surrogateToUcs4(uc, low);
1573                 ++pos;
1574             }
1575         }
1576
1577         const QUnicodeTables::Properties *p = qGetProp(uc);
1578         if (p->unicodeVersion > version) {
1579             starter = -1;
1580             next = -1; // to prevent i == next
1581             lastCombining = 255; // to prevent combining > lastCombining
1582             ++pos;
1583             continue;
1584         }
1585
1586         int combining = p->combiningClass;
1587         if (i == next || combining > lastCombining) {
1588             Q_ASSERT(starter >= from);
1589             // allowed to form ligature with S
1590             uint ligature = ligatureHelper(stcode, uc);
1591             if (ligature) {
1592                 stcode = ligature;
1593                 QChar *d = s.data();
1594                 // ligatureHelper() never changes planes
1595                 if (QChar::requiresSurrogates(ligature)) {
1596                     d[starter] = QChar::highSurrogate(ligature);
1597                     d[starter + 1] = QChar::lowSurrogate(ligature);
1598                     s.remove(i, 2);
1599                 } else {
1600                     d[starter] = ligature;
1601                     s.remove(i, 1);
1602                 }
1603                 continue;
1604             }
1605         }
1606         if (combining == 0) {
1607             starter = i;
1608             stcode = uc;
1609             next = pos + 1;
1610         }
1611         lastCombining = combining;
1612
1613         ++pos;
1614     }
1615 }
1616
1617
// Reorders combining characters of *str in place, starting at index from.
// The string is scanned as pairs of adjacent characters (u1, u2) with
// combining classes (c1, c2): whenever c2 is non-zero and c1 > c2 the two
// characters are exchanged and the scan steps back by one character;
// otherwise it moves on. Characters whose Unicode version is newer than
// version are treated as having combining class 0.
static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
{
    QString &s = *str;
    const int l = s.length()-1; // index of the last code unit

    uint u1, u2; // the two adjacent code points being compared
    ushort c1, c2; // their combining classes

    int pos = from; // index of the first code unit of u1
    while (pos < l) {
        int p2 = pos+1; // index of the first code unit of u2
        u1 = s.at(pos).unicode();
        if (QChar(u1).isHighSurrogate()) {
            ushort low = s.at(p2).unicode();
            if (QChar(low).isLowSurrogate()) {
                u1 = QChar::surrogateToUcs4(u1, low);
                // u1 is the last character of the string: nothing follows
                if (p2 >= l)
                    break;
                ++p2;
            }
        }
        c1 = 0; // looked up below, and only if u2 has a non-zero class

    advance:
        // read u2; p2 ends up on its last code unit
        u2 = s.at(p2).unicode();
        if (QChar(u2).isHighSurrogate() && p2 < l) {
            ushort low = s.at(p2+1).unicode();
            if (QChar(low).isLowSurrogate()) {
                u2 = QChar::surrogateToUcs4(u2, low);
                ++p2;
            }
        }

        c2 = 0;
        {
            const QUnicodeTables::Properties *p = qGetProp(u2);
            if (p->unicodeVersion <= version)
                c2 = p->combiningClass;
        }
        if (c2 == 0) {
            // u2 is never exchanged with u1; restart with the character
            // after u2 as the new u1
            pos = p2+1;
            continue;
        }

        if (c1 == 0) {
            const QUnicodeTables::Properties *p = qGetProp(u1);
            if (p->unicodeVersion <= version)
                c1 = p->combiningClass;
        }

        if (c1 > c2) {
            QChar *uc = s.data();
            int p = pos;
            // exchange characters
            if (!QChar::requiresSurrogates(u2)) {
                uc[p++] = u2;
            } else {
                uc[p++] = QChar::highSurrogate(u2);
                uc[p++] = QChar::lowSurrogate(u2);
            }
            if (!QChar::requiresSurrogates(u1)) {
                uc[p++] = u1;
            } else {
                uc[p++] = QChar::highSurrogate(u1);
                uc[p++] = QChar::lowSurrogate(u1);
            }
            // step back one character (two code units if that lands on a low
            // surrogate) so that the character just moved forward is compared
            // with its new predecessor on the next iteration.
            // NOTE(review): the step back is bounded by 0, not by from.
            if (pos > 0)
                --pos;
            if (pos > 0 && s.at(pos).isLowSurrogate())
                --pos;
        } else {
            // already in order: move pos past the old u1, onto u2
            ++pos;
            if (QChar::requiresSurrogates(u1))
                ++pos;

            // u2 becomes the new u1; its class is reused instead of being
            // looked up again
            u1 = u2;
            c1 = c2; // != 0
            p2 = pos + 1;
            if (QChar::requiresSurrogates(u1))
                ++p2;
            // no character left to compare with
            if (p2 > l)
                break;

            goto advance;
        }
    }
}
1705
1706 QT_END_NAMESPACE