util/unicode/main.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the utils of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include <qlist.h>
  43 #include <qhash.h>
  44 #include <qfile.h>
  45 #include <qbytearray.h>
  46 #include <qstring.h>
  47 #include <qchar.h>
  48 #include <qvector.h>
  49 #include <qdebug.h>
  50 #if 0
  51 #include <private/qunicodetables_p.h>
  52 #endif
  53
  54 #define DATA_VERSION_S "6.1"
  55 #define DATA_VERSION_STR "QChar::Unicode_6_1"
  56
  57
  58 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
  59
  60 static void initAgeMap()
  61 {
  62     struct AgeMap {
  63         const QChar::UnicodeVersion version;
  64         const char *age;
  65     } ageMap[] = {
  66         { QChar::Unicode_1_1,   "1.1" },
  67         { QChar::Unicode_2_0,   "2.0" },
  68         { QChar::Unicode_2_1_2, "2.1" },
  69         { QChar::Unicode_3_0,   "3.0" },
  70         { QChar::Unicode_3_1,   "3.1" },
  71         { QChar::Unicode_3_2,   "3.2" },
  72         { QChar::Unicode_4_0,   "4.0" },
  73         { QChar::Unicode_4_1,   "4.1" },
  74         { QChar::Unicode_5_0,   "5.0" },
  75         { QChar::Unicode_5_1,   "5.1" },
  76         { QChar::Unicode_5_2,   "5.2" },
  77         { QChar::Unicode_6_0,   "6.0" },
  78         { QChar::Unicode_6_1,   "6.1" },
  79         { QChar::Unicode_Unassigned, 0 }
  80     };
  81     AgeMap *d = ageMap;
  82     while (d->age) {
  83         age_map.insert(d->age, d->version);
  84         ++d;
  85     }
  86 }
  87
  88
  89 static QHash<QByteArray, QChar::Category> categoryMap;
  90
  91 static void initCategoryMap()
  92 {
  93     struct Cat {
  94         QChar::Category cat;
  95         const char *name;
  96     } categories[] = {
  97         { QChar::Mark_NonSpacing,          "Mn" },
  98         { QChar::Mark_SpacingCombining,    "Mc" },
  99         { QChar::Mark_Enclosing,           "Me" },
 100
 101         { QChar::Number_DecimalDigit,      "Nd" },
 102         { QChar::Number_Letter,            "Nl" },
 103         { QChar::Number_Other,             "No" },
 104
 105         { QChar::Separator_Space,          "Zs" },
 106         { QChar::Separator_Line,           "Zl" },
 107         { QChar::Separator_Paragraph,      "Zp" },
 108
 109         { QChar::Other_Control,            "Cc" },
 110         { QChar::Other_Format,             "Cf" },
 111         { QChar::Other_Surrogate,          "Cs" },
 112         { QChar::Other_PrivateUse,         "Co" },
 113         { QChar::Other_NotAssigned,        "Cn" },
 114
 115         { QChar::Letter_Uppercase,         "Lu" },
 116         { QChar::Letter_Lowercase,         "Ll" },
 117         { QChar::Letter_Titlecase,         "Lt" },
 118         { QChar::Letter_Modifier,          "Lm" },
 119         { QChar::Letter_Other,             "Lo" },
 120
 121         { QChar::Punctuation_Connector,    "Pc" },
 122         { QChar::Punctuation_Dash,         "Pd" },
 123         { QChar::Punctuation_Open,         "Ps" },
 124         { QChar::Punctuation_Close,        "Pe" },
 125         { QChar::Punctuation_InitialQuote, "Pi" },
 126         { QChar::Punctuation_FinalQuote,   "Pf" },
 127         { QChar::Punctuation_Other,        "Po" },
 128
 129         { QChar::Symbol_Math,              "Sm" },
 130         { QChar::Symbol_Currency,          "Sc" },
 131         { QChar::Symbol_Modifier,          "Sk" },
 132         { QChar::Symbol_Other,             "So" },
 133         { QChar::Other_NotAssigned, 0 }
 134     };
 135     Cat *c = categories;
 136     while (c->name) {
 137         categoryMap.insert(c->name, c->cat);
 138         ++c;
 139     }
 140 }
 141
 142
 143 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
 144
 145 static void initDecompositionMap()
 146 {
 147     struct Dec {
 148         QChar::Decomposition dec;
 149         const char *name;
 150     } decompositions[] = {
 151         { QChar::Canonical, "<canonical>" },
 152         { QChar::Font, "<font>" },
 153         { QChar::NoBreak, "<noBreak>" },
 154         { QChar::Initial, "<initial>" },
 155         { QChar::Medial, "<medial>" },
 156         { QChar::Final, "<final>" },
 157         { QChar::Isolated, "<isolated>" },
 158         { QChar::Circle, "<circle>" },
 159         { QChar::Super, "<super>" },
 160         { QChar::Sub, "<sub>" },
 161         { QChar::Vertical, "<vertical>" },
 162         { QChar::Wide, "<wide>" },
 163         { QChar::Narrow, "<narrow>" },
 164         { QChar::Small, "<small>" },
 165         { QChar::Square, "<square>" },
 166         { QChar::Compat, "<compat>" },
 167         { QChar::Fraction, "<fraction>" },
 168         { QChar::NoDecomposition, 0 }
 169     };
 170     Dec *d = decompositions;
 171     while (d->name) {
 172         decompositionMap.insert(d->name, d->dec);
 173         ++d;
 174     }
 175 }
 176
 177
 178 static QHash<QByteArray, QChar::Direction> directionMap;
 179
 180 static void initDirectionMap()
 181 {
 182     struct Dir {
 183         QChar::Direction dir;
 184         const char *name;
 185     } directions[] = {
 186         { QChar::DirL, "L" },
 187         { QChar::DirR, "R" },
 188         { QChar::DirEN, "EN" },
 189         { QChar::DirES, "ES" },
 190         { QChar::DirET, "ET" },
 191         { QChar::DirAN, "AN" },
 192         { QChar::DirCS, "CS" },
 193         { QChar::DirB, "B" },
 194         { QChar::DirS, "S" },
 195         { QChar::DirWS, "WS" },
 196         { QChar::DirON, "ON" },
 197         { QChar::DirLRE, "LRE" },
 198         { QChar::DirLRO, "LRO" },
 199         { QChar::DirAL, "AL" },
 200         { QChar::DirRLE, "RLE" },
 201         { QChar::DirRLO, "RLO" },
 202         { QChar::DirPDF, "PDF" },
 203         { QChar::DirNSM, "NSM" },
 204         { QChar::DirBN, "BN" },
 205         { QChar::DirL, 0 }
 206     };
 207     Dir *d = directions;
 208     while (d->name) {
 209         directionMap.insert(d->name, d->dir);
 210         ++d;
 211     }
 212 }
 213
 214
 215 enum Joining {
 216     Joining_None,
 217     Joining_Left,
 218     Joining_Causing,
 219     Joining_Dual,
 220     Joining_Right,
 221     Joining_Transparent
 222
 223     , Joining_Unassigned
 224 };
 225
 226 static QHash<QByteArray, Joining> joining_map;
 227
 228 static void initJoiningMap()
 229 {
 230     struct JoiningList {
 231         Joining joining;
 232         const char *name;
 233     } joinings[] = {
 234         { Joining_None,        "U" },
 235         { Joining_Left,        "L" },
 236         { Joining_Causing,     "C" },
 237         { Joining_Dual,        "D" },
 238         { Joining_Right,       "R" },
 239         { Joining_Transparent, "T" },
 240         { Joining_Unassigned, 0 }
 241     };
 242     JoiningList *d = joinings;
 243     while (d->name) {
 244         joining_map.insert(d->name, d->joining);
 245         ++d;
 246     }
 247 }
 248
 249
 250 static const char *grapheme_break_string =
 251     "enum GraphemeBreak {\n"
 252     "    GraphemeBreakOther,\n"
 253     "    GraphemeBreakCR,\n"
 254     "    GraphemeBreakLF,\n"
 255     "    GraphemeBreakControl,\n"
 256     "    GraphemeBreakExtend,\n"
 257     "    GraphemeBreakPrepend,\n"
 258     "    GraphemeBreakSpacingMark,\n"
 259     "    GraphemeBreakL,\n"
 260     "    GraphemeBreakV,\n"
 261     "    GraphemeBreakT,\n"
 262     "    GraphemeBreakLV,\n"
 263     "    GraphemeBreakLVT\n"
 264     "};\n\n";
 265
 266 enum GraphemeBreak {
 267     GraphemeBreakOther,
 268     GraphemeBreakCR,
 269     GraphemeBreakLF,
 270     GraphemeBreakControl,
 271     GraphemeBreakExtend,
 272     GraphemeBreakPrepend,
 273     GraphemeBreakSpacingMark,
 274     GraphemeBreakL,
 275     GraphemeBreakV,
 276     GraphemeBreakT,
 277     GraphemeBreakLV,
 278     GraphemeBreakLVT
 279
 280     , GraphemeBreak_Unassigned
 281 };
 282
 283 static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
 284
 285 static void initGraphemeBreak()
 286 {
 287     struct GraphemeBreakList {
 288         GraphemeBreak brk;
 289         const char *name;
 290     } breaks[] = {
 291         { GraphemeBreakOther, "Other" },
 292         { GraphemeBreakCR, "CR" },
 293         { GraphemeBreakLF, "LF" },
 294         { GraphemeBreakControl, "Control" },
 295         { GraphemeBreakExtend, "Extend" },
 296         { GraphemeBreakPrepend, "Prepend" },
 297         { GraphemeBreakSpacingMark, "SpacingMark" },
 298         { GraphemeBreakL, "L" },
 299         { GraphemeBreakV, "V" },
 300         { GraphemeBreakT, "T" },
 301         { GraphemeBreakLV, "LV" },
 302         { GraphemeBreakLVT, "LVT" },
 303         { GraphemeBreak_Unassigned, 0 }
 304     };
 305     GraphemeBreakList *d = breaks;
 306     while (d->name) {
 307         grapheme_break_map.insert(d->name, d->brk);
 308         ++d;
 309     }
 310 }
 311
 312
 313 static const char *word_break_string =
 314     "enum WordBreak {\n"
 315     "    WordBreakOther,\n"
 316     "    WordBreakCR,\n"
 317     "    WordBreakLF,\n"
 318     "    WordBreakNewline,\n"
 319     "    WordBreakFormat,\n"
 320     "    WordBreakKatakana,\n"
 321     "    WordBreakALetter,\n"
 322     "    WordBreakMidNumLet,\n"
 323     "    WordBreakMidLetter,\n"
 324     "    WordBreakMidNum,\n"
 325     "    WordBreakNumeric,\n"
 326     "    WordBreakExtendNumLet\n"
 327     "};\n\n";
 328
 329 enum WordBreak {
 330     WordBreakOther,
 331     WordBreakCR,
 332     WordBreakLF,
 333     WordBreakNewline,
 334     WordBreakFormat,
 335     WordBreakKatakana,
 336     WordBreakALetter,
 337     WordBreakMidNumLet,
 338     WordBreakMidLetter,
 339     WordBreakMidNum,
 340     WordBreakNumeric,
 341     WordBreakExtendNumLet
 342
 343     , WordBreak_Unassigned
 344 };
 345
 346 static QHash<QByteArray, WordBreak> word_break_map;
 347
 348 static void initWordBreak()
 349 {
 350     struct WordBreakList {
 351         WordBreak brk;
 352         const char *name;
 353     } breaks[] = {
 354         { WordBreakOther, "Other" },
 355         { WordBreakCR, "CR" },
 356         { WordBreakLF, "LF" },
 357         { WordBreakNewline, "Newline" },
 358         { WordBreakFormat, "Extend" },
 359         { WordBreakFormat, "Format" },
 360         { WordBreakKatakana, "Katakana" },
 361         { WordBreakALetter, "ALetter" },
 362         { WordBreakMidNumLet, "MidNumLet" },
 363         { WordBreakMidLetter, "MidLetter" },
 364         { WordBreakMidNum, "MidNum" },
 365         { WordBreakNumeric, "Numeric" },
 366         { WordBreakExtendNumLet, "ExtendNumLet" },
 367         { WordBreak_Unassigned, 0 }
 368     };
 369     WordBreakList *d = breaks;
 370     while (d->name) {
 371         word_break_map.insert(d->name, d->brk);
 372         ++d;
 373     }
 374 }
 375
 376
 377 static const char *sentence_break_string =
 378     "enum SentenceBreak {\n"
 379     "    SentenceBreakOther,\n"
 380     "    SentenceBreakCR,\n"
 381     "    SentenceBreakLF,\n"
 382     "    SentenceBreakSep,\n"
 383     "    SentenceBreakFormat,\n"
 384     "    SentenceBreakSp,\n"
 385     "    SentenceBreakLower,\n"
 386     "    SentenceBreakUpper,\n"
 387     "    SentenceBreakOLetter,\n"
 388     "    SentenceBreakNumeric,\n"
 389     "    SentenceBreakATerm,\n"
 390     "    SentenceBreakSContinue,\n"
 391     "    SentenceBreakSTerm,\n"
 392     "    SentenceBreakClose\n"
 393     "};\n\n";
 394
 395 enum SentenceBreak {
 396     SentenceBreakOther,
 397     SentenceBreakCR,
 398     SentenceBreakLF,
 399     SentenceBreakSep,
 400     SentenceBreakFormat,
 401     SentenceBreakSp,
 402     SentenceBreakLower,
 403     SentenceBreakUpper,
 404     SentenceBreakOLetter,
 405     SentenceBreakNumeric,
 406     SentenceBreakATerm,
 407     SentenceBreakSContinue,
 408     SentenceBreakSTerm,
 409     SentenceBreakClose
 410
 411     , SentenceBreak_Unassigned
 412 };
 413
 414 static QHash<QByteArray, SentenceBreak> sentence_break_map;
 415
 416 static void initSentenceBreak()
 417 {
 418     struct SentenceBreakList {
 419         SentenceBreak brk;
 420         const char *name;
 421     } breaks[] = {
 422         { SentenceBreakOther, "Other" },
 423         { SentenceBreakCR, "CR" },
 424         { SentenceBreakLF, "LF" },
 425         { SentenceBreakSep, "Sep" },
 426         { SentenceBreakFormat, "Extend" },
 427         { SentenceBreakFormat, "Format" },
 428         { SentenceBreakSp, "Sp" },
 429         { SentenceBreakLower, "Lower" },
 430         { SentenceBreakUpper, "Upper" },
 431         { SentenceBreakOLetter, "OLetter" },
 432         { SentenceBreakNumeric, "Numeric" },
 433         { SentenceBreakATerm, "ATerm" },
 434         { SentenceBreakSContinue, "SContinue" },
 435         { SentenceBreakSTerm, "STerm" },
 436         { SentenceBreakClose, "Close" },
 437         { SentenceBreak_Unassigned, 0 }
 438     };
 439     SentenceBreakList *d = breaks;
 440     while (d->name) {
 441         sentence_break_map.insert(d->name, d->brk);
 442         ++d;
 443     }
 444 }
 445
 446
 447 static const char *line_break_class_string =
 448     "// see http://www.unicode.org/reports/tr14/tr14-28.html\n"
 449     "// we don't use the XX and AI classes and map them to AL instead.\n"
 450     "enum LineBreakClass {\n"
 451     "    LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,\n"
 452     "    LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,\n"
 453     "    LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n"
 454     "    LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n"
 455     "    LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n"
 456     "    LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_CB, LineBreak_SA,\n"
 457     "    LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
 458     "};\n\n";
 459
 460 enum LineBreakClass {
 461     LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,
 462     LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,
 463     LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,
 464     LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
 465     LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,
 466     LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_CB, LineBreak_SA,
 467     LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK
 468
 469     , LineBreak_Unassigned
 470 };
 471
 472 static QHash<QByteArray, LineBreakClass> line_break_map;
 473
 474 static void initLineBreak()
 475 {
 476     // ### Classes XX and AI are left out and mapped to AL for now.
 477     // ### Class NL is mapped to BK.
 478     // ### Treating characters of class CJ as class NS will give CSS strict line breaking;
 479     //     treating them as class ID will give CSS normal breaking.
 480     struct LineBreakList {
 481         LineBreakClass brk;
 482         const char *name;
 483     } breaks[] = {
 484         { LineBreak_BK, "BK" },
 485         { LineBreak_CR, "CR" },
 486         { LineBreak_LF, "LF" },
 487         { LineBreak_CM, "CM" },
 488         { LineBreak_BK, "NL" },
 489         { LineBreak_SG, "SG" },
 490         { LineBreak_WJ, "WJ" },
 491         { LineBreak_ZW, "ZW" },
 492         { LineBreak_GL, "GL" },
 493         { LineBreak_SP, "SP" },
 494         { LineBreak_B2, "B2" },
 495         { LineBreak_BA, "BA" },
 496         { LineBreak_BB, "BB" },
 497         { LineBreak_HY, "HY" },
 498         { LineBreak_CB, "CB" },
 499         { LineBreak_NS, "CJ" },
 500         { LineBreak_CL, "CL" },
 501         { LineBreak_CP, "CP" },
 502         { LineBreak_EX, "EX" },
 503         { LineBreak_IN, "IN" },
 504         { LineBreak_NS, "NS" },
 505         { LineBreak_OP, "OP" },
 506         { LineBreak_QU, "QU" },
 507         { LineBreak_IS, "IS" },
 508         { LineBreak_NU, "NU" },
 509         { LineBreak_PO, "PO" },
 510         { LineBreak_PR, "PR" },
 511         { LineBreak_SY, "SY" },
 512         { LineBreak_AL, "AI" },
 513         { LineBreak_AL, "AL" },
 514         { LineBreak_HL, "HL" },
 515         { LineBreak_H2, "H2" },
 516         { LineBreak_H3, "H3" },
 517         { LineBreak_ID, "ID" },
 518         { LineBreak_JL, "JL" },
 519         { LineBreak_JV, "JV" },
 520         { LineBreak_JT, "JT" },
 521         { LineBreak_SA, "SA" },
 522         { LineBreak_AL, "XX" },
 523         { LineBreak_Unassigned, 0 }
 524     };
 525     LineBreakList *d = breaks;
 526     while (d->name) {
 527         line_break_map.insert(d->name, d->brk);
 528         ++d;
 529     }
 530 }
 531
 532
  533 // Keep this one in sync with the code in createPropertyInfo
      // Text, kept as a C string, of a `struct Properties` declaration made of
      // bit-fields, followed by two `properties()` prototypes (one taking a
      // uint ucs4, one taking a ushort ucs2).
      // The bit-field widths written here add up to 160 bits:
      //   8+8+8+2+6 = 32, 5 x 16 = 80, 1+1+1+1+4 = 8, 5 x 8 = 40.
      // NOTE(review): createPropertyInfo is outside this excerpt; presumably it
      // writes the table rows in this field order -- confirm before changing
      // any width or reordering fields.
  534 static const char *property_string =
  535     "struct Properties {\n"
  536     "    ushort category            : 8; /* 5 used */\n"
  537     "    ushort direction           : 8; /* 5 used */\n"
  538     "    ushort combiningClass      : 8;\n"
  539     "    ushort joining             : 2;\n"
  540     "    signed short digitValue    : 6; /* 5 used */\n"
  541     "    signed short mirrorDiff    : 16;\n"
  542     "    signed short lowerCaseDiff : 16;\n"
  543     "    signed short upperCaseDiff : 16;\n"
  544     "    signed short titleCaseDiff : 16;\n"
  545     "    signed short caseFoldDiff  : 16;\n"
  546     "    ushort lowerCaseSpecial    : 1;\n"
  547     "    ushort upperCaseSpecial    : 1;\n"
  548     "    ushort titleCaseSpecial    : 1;\n"
  549     "    ushort caseFoldSpecial     : 1;\n"
  550     "    ushort unicodeVersion      : 4;\n"
  551     "    ushort graphemeBreak       : 8; /* 4 used */\n"
  552     "    ushort wordBreak           : 8; /* 4 used */\n"
  553     "    ushort sentenceBreak       : 8; /* 4 used */\n"
  554     "    ushort line_break_class    : 8; /* 6 used */\n"
  555     "    ushort script              : 8; /* 5 used */\n"
  556     "};\n\n"
  557     "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
  558     "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n"
  559     "\n";
 560
  561 static const char *methods =
      // Text, kept as a C string, of five exported lookup prototypes taking a
      // uint ucs4 (graphemeBreakClass, wordBreakClass, sentenceBreakClass,
      // lineBreakClass, script), each followed by an inline QChar overload
      // that forwards ch.unicode() to the ucs4 version.
  562     "Q_CORE_EXPORT GraphemeBreak QT_FASTCALL graphemeBreakClass(uint ucs4);\n"
  563     "inline GraphemeBreak graphemeBreakClass(QChar ch)\n"
  564     "{ return graphemeBreakClass(ch.unicode()); }\n"
  565     "\n"
  566     "Q_CORE_EXPORT WordBreak QT_FASTCALL wordBreakClass(uint ucs4);\n"
  567     "inline WordBreak wordBreakClass(QChar ch)\n"
  568     "{ return wordBreakClass(ch.unicode()); }\n"
  569     "\n"
  570     "Q_CORE_EXPORT SentenceBreak QT_FASTCALL sentenceBreakClass(uint ucs4);\n"
  571     "inline SentenceBreak sentenceBreakClass(QChar ch)\n"
  572     "{ return sentenceBreakClass(ch.unicode()); }\n"
  573     "\n"
  574     "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
  575     "inline LineBreakClass lineBreakClass(QChar ch)\n"
  576     "{ return lineBreakClass(ch.unicode()); }\n"
  577     "\n"
  578     "Q_CORE_EXPORT Script QT_FASTCALL script(uint ucs4);\n"
  579     "inline Script script(QChar ch)\n"
  580     "{ return script(ch.unicode()); }\n"
  581     "\n";
  582
      // NOTE(review): presumably the size in bytes of the Properties struct whose
      // text is held in property_string (its bit-field widths sum to 160 bits,
      // i.e. 20 bytes) -- confirm at the point of use, which is outside this
      // excerpt.
  583 static const int SizeOfPropertiesStruct = 20;
 584
 585 struct PropertyFlags {
 586     bool operator ==(const PropertyFlags &o) {
 587         return (combiningClass == o.combiningClass
 588                 && category == o.category
 589                 && direction == o.direction
 590                 && joining == o.joining
 591                 && age == o.age
 592                 && digitValue == o.digitValue
 593                 && mirrorDiff == o.mirrorDiff
 594                 && lowerCaseDiff == o.lowerCaseDiff
 595                 && upperCaseDiff == o.upperCaseDiff
 596                 && titleCaseDiff == o.titleCaseDiff
 597                 && caseFoldDiff == o.caseFoldDiff
 598                 && lowerCaseSpecial == o.lowerCaseSpecial
 599                 && upperCaseSpecial == o.upperCaseSpecial
 600                 && titleCaseSpecial == o.titleCaseSpecial
 601                 && caseFoldSpecial == o.caseFoldSpecial
 602                 && graphemeBreak == o.graphemeBreak
 603                 && wordBreak == o.wordBreak
 604                 && sentenceBreak == o.sentenceBreak
 605                 && line_break_class == o.line_break_class
 606                 && script == o.script
 607             );
 608     }
 609     // from UnicodeData.txt
 610     uchar combiningClass : 8;
 611     QChar::Category category : 5;
 612     QChar::Direction direction : 5;
 613     // from ArabicShaping.txt
 614     QChar::Joining joining : 2;
 615     // from DerivedAge.txt
 616     QChar::UnicodeVersion age : 4;
 617     int digitValue;
 618
 619     int mirrorDiff : 16;
 620
 621     int lowerCaseDiff;
 622     int upperCaseDiff;
 623     int titleCaseDiff;
 624     int caseFoldDiff;
 625     bool lowerCaseSpecial;
 626     bool upperCaseSpecial;
 627     bool titleCaseSpecial;
 628     bool caseFoldSpecial;
 629     GraphemeBreak graphemeBreak;
 630     WordBreak wordBreak;
 631     SentenceBreak sentenceBreak;
 632     LineBreakClass line_break_class;
 633     int script;
 634 };
 635
 636
 637 static QList<int> specialCaseMap;
 638
 639 static int appendToSpecialCaseMap(const QList<int> &map)
 640 {
 641     QList<int> utf16map;
 642     for (int i = 0; i < map.size(); ++i) {
 643         int val = map.at(i);
 644         if (QChar::requiresSurrogates(val)) {
 645             utf16map << QChar::highSurrogate(val);
 646             utf16map << QChar::lowSurrogate(val);
 647         } else {
 648             utf16map << val;
 649         }
 650     }
 651     int length = utf16map.size();
 652     utf16map.prepend(length);
 653
 654     if (specialCaseMap.isEmpty())
 655         specialCaseMap << 0; // placeholder
 656
 657     int i = 1;
 658     while (i < specialCaseMap.size()) {
 659         int n = specialCaseMap.at(i);
 660         if (n == length) {
 661             int j;
 662             for (j = 1; j <= n; ++j) {
 663                 if (specialCaseMap.at(i+j) != utf16map.at(j))
 664                     break;
 665             }
 666             if (j > n)
 667                 return i;
 668         }
 669         i += n + 1;
 670     }
 671
 672     int pos = specialCaseMap.size();
 673     specialCaseMap << utf16map;
 674     return pos;
 675 }
 676
 677 struct UnicodeData {
 678     UnicodeData(int codepoint = 0) {
 679         p.category = QChar::Other_NotAssigned; // Cn
 680         p.combiningClass = 0;
 681
 682         p.direction = QChar::DirL;
 683         // DerivedBidiClass.txt
 684         // The unassigned code points that default to AL are in the ranges:
 685         //     [U+0600..U+07BF, U+08A0..U+08FF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFF, U+1EE00..U+1EEFF]
 686         if ((codepoint >= 0x0600 && codepoint <= 0x07BF)
 687             || (codepoint >= 0x08A0 && codepoint <= 0x08FF)
 688             || (codepoint >= 0xFB50 && codepoint <= 0xFDCF)
 689             || (codepoint >= 0xFDF0 && codepoint <= 0xFDFF)
 690             || (codepoint >= 0xFE70 && codepoint <= 0xFEFF)
 691             || (codepoint >= 0x1EE00 && codepoint <= 0x1EEFF)) {
 692             p.direction = QChar::DirAL;
 693         }
 694         // The unassigned code points that default to R are in the ranges:
 695         //     [U+0590..U+05FF, U+07C0..U+089F, U+FB1D..U+FB4F, U+10800..U+10FFF, U+1E800..U+1EDFF, U+1EF00..U+1EFFF]
 696         else if ((codepoint >= 0x0590 && codepoint <= 0x05FF)
 697             || (codepoint >= 0x07C0 && codepoint <= 0x089F)
 698             || (codepoint >= 0xFB1D && codepoint <= 0xFB4F)
 699             || (codepoint >= 0x10800 && codepoint <= 0x10FFF)
 700             || (codepoint >= 0x1E800 && codepoint <= 0x1EDFF)
 701             || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
 702             p.direction = QChar::DirR;
 703         }
 704
 705         p.line_break_class = LineBreak_AL; // XX -> AL
 706         // LineBreak.txt
 707         // The unassigned code points that default to "ID" include ranges in the following blocks:
 708         //     [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2F800..U+2FA1F, U+20000..U+2FFFD, U+30000..U+3FFFD]
 709         if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
 710             || (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
 711             || (codepoint >= 0xF900 && codepoint <= 0xFAFF)
 712             || (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
 713             || (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
 714             || (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
 715             || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
 716             || (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
 717             || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
 718             p.line_break_class = LineBreak_ID;
 719         }
 720
 721         mirroredChar = 0;
 722         decompositionType = QChar::NoDecomposition;
 723         p.joining = QChar::OtherJoining;
 724         p.age = QChar::Unicode_Unassigned;
 725         p.mirrorDiff = 0;
 726         p.digitValue = -1;
 727         p.lowerCaseDiff = 0;
 728         p.upperCaseDiff = 0;
 729         p.titleCaseDiff = 0;
 730         p.caseFoldDiff = 0;
 731         p.lowerCaseSpecial = 0;
 732         p.upperCaseSpecial = 0;
 733         p.titleCaseSpecial = 0;
 734         p.caseFoldSpecial = 0;
 735         p.graphemeBreak = GraphemeBreakOther;
 736         p.wordBreak = WordBreakOther;
 737         p.sentenceBreak = SentenceBreakOther;
 738         p.script = 0; // Common
 739         propertyIndex = -1;
 740         excludedComposition = false;
 741     }
 742
 743     static UnicodeData &valueRef(int codepoint);
 744
 745     PropertyFlags p;
 746
 747     // from UnicodeData.txt
 748     QChar::Decomposition decompositionType;
 749     QList<int> decomposition;
 750
 751     QList<int> specialFolding;
 752
 753     // from BidiMirroring.txt
 754     int mirroredChar;
 755
 756     // DerivedNormalizationProps.txt
 757     bool excludedComposition;
 758
 759     // computed position of unicode property set
 760     int propertyIndex;
 761 };
 762
 763 static QList<UnicodeData> unicodeData;
 764
 765 UnicodeData &UnicodeData::valueRef(int codepoint)
 766 {
 767     static bool initialized = false;
 768     if (!initialized) {
 769         unicodeData.reserve(QChar::LastValidCodePoint + 1);
 770         for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc)
 771             unicodeData.append(UnicodeData(uc));
 772         initialized = true;
 773     }
 774
 775     Q_ASSERT(codepoint <= 0x10ffff);
 776     return unicodeData[codepoint];
 777 }
 778
 779
  780 static QHash<int, int> decompositionLength;
      // NOTE(review): the four bookkeeping globals in this group are only
      // declared in this part of the file; the code that updates them is not
      // visible here. Their names suggest decomposition and ligature
      // statistics -- confirm at the points of use.
  781 static int highestComposedCharacter = 0;
  782 static int numLigatures = 0;
  783 static int highestLigature = 0;
 784
 785 struct Ligature {
 786     int u1;
 787     int u2;
 788     int ligature;
 789 };
 790 // we need them sorted after the first component for fast lookup
 791 bool operator < (const Ligature &l1, const Ligature &l2)
 792 { return l1.u1 < l2.u1; }
 793
  794 static QHash<int, QList<Ligature> > ligatureHashes;
      // NOTE(review): ligatureHashes is only declared in this part of the file;
      // what its int key stands for is not visible here -- confirm at the
      // points of use.
  795
      // Number of UnicodeData.txt entries seen per combining class value;
      // incremented in readUnicodeData().
  796 static QHash<int, int> combiningClassUsage;
  797
      // readUnicodeData() raises maxUpperCaseDiff to the largest absolute
      // upper-case difference that is stored directly (not as a special case).
      // NOTE(review): maxLowerCaseDiff and maxTitleCaseDiff are presumably
      // maintained the same way; those updates are outside this excerpt.
  798 static int maxLowerCaseDiff = 0;
  799 static int maxUpperCaseDiff = 0;
  800 static int maxTitleCaseDiff = 0;
 801
 802 static void readUnicodeData()
 803 {
 804     qDebug("Reading UnicodeData.txt");
 805
 806     enum UniDataFields {
 807         UD_Value,
 808         UD_Name,
 809         UD_Category,
 810         UD_CombiningClass,
 811         UD_BidiCategory,
 812         UD_Decomposition,
 813         UD_DecimalDigitValue,
 814         UD_DigitValue,
 815         UD_NumericValue,
 816         UD_Mirrored,
 817         UD_OldName,
 818         UD_Comment,
 819         UD_UpperCase,
 820         UD_LowerCase,
 821         UD_TitleCase
 822     };
 823
 824     QFile f("data/UnicodeData.txt");
 825     if (!f.exists())
 826         qFatal("Couldn't find UnicodeData.txt");
 827
 828     f.open(QFile::ReadOnly);
 829
 830     while (!f.atEnd()) {
 831         QByteArray line;
 832         line.resize(1024);
 833         int len = f.readLine(line.data(), 1024);
 834         line.truncate(len-1);
 835
 836         int comment = line.indexOf('#');
 837         if (comment >= 0)
 838             line = line.left(comment);
 839         if (line.isEmpty())
 840             continue;
 841
 842         QList<QByteArray> properties = line.split(';');
 843         bool ok;
 844         int codepoint = properties[UD_Value].toInt(&ok, 16);
 845         Q_ASSERT(ok);
 846         Q_ASSERT(codepoint <= QChar::LastValidCodePoint);
 847         int lastCodepoint = codepoint;
 848
 849         QByteArray name = properties[UD_Name];
 850         if (name.startsWith('<') && name.contains("First")) {
 851             QByteArray nextLine;
 852             nextLine.resize(1024);
 853             f.readLine(nextLine.data(), 1024);
 854             QList<QByteArray> properties = nextLine.split(';');
 855             Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
 856             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
 857             Q_ASSERT(ok);
 858             Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint);
 859         }
 860
 861         UnicodeData &data = UnicodeData::valueRef(codepoint);
 862         data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
 863         data.p.combiningClass = properties[UD_CombiningClass].toInt();
 864         if (!combiningClassUsage.contains(data.p.combiningClass))
 865             combiningClassUsage[data.p.combiningClass] = 1;
 866         else
 867             ++combiningClassUsage[data.p.combiningClass];
 868
 869         data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
 870
 871         if (!properties[UD_UpperCase].isEmpty()) {
 872             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
 873             Q_ASSERT(ok);
 874             int diff = upperCase - codepoint;
 875             if (qAbs(diff) >= (1<<14)) {
 876                 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
 877                 // if the condition below doesn't hold anymore we need to modify our special upper casing code in qchar.cpp
 878                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(upperCase));
 879                 data.p.upperCaseSpecial = true;
 880                 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
 881             } else {
 882                 data.p.upperCaseDiff = diff;
 883                 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
 884             }
 885             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(upperCase)) {
 886                 // if the conditions below doesn't hold anymore we need to modify our upper casing code
 887                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
 888                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
 889             }
 890         }
 891         if (!properties[UD_LowerCase].isEmpty()) {
 892             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
 893             Q_ASSERT(ok);
 894             int diff = lowerCase - codepoint;
 895             if (qAbs(diff) >= (1<<14)) {
 896                 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
 897                 // if the condition below doesn't hold anymore we need to modify our special lower casing code in qchar.cpp
 898                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(lowerCase));
 899                 data.p.lowerCaseSpecial = true;
 900                 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
 901             } else {
 902                 data.p.lowerCaseDiff = diff;
 903                 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
 904             }
 905             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(lowerCase)) {
 906                 // if the conditions below doesn't hold anymore we need to modify our lower casing code
 907                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
 908                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
 909             }
 910         }
 911         // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
 912         if (properties[UD_TitleCase].isEmpty())
 913             properties[UD_TitleCase] = properties[UD_UpperCase];
 914         if (!properties[UD_TitleCase].isEmpty()) {
 915             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
 916             Q_ASSERT(ok);
 917             int diff = titleCase - codepoint;
 918             if (qAbs(diff) >= (1<<14)) {
 919                 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
 920                 // if the condition below doesn't hold anymore we need to modify our special title casing code in qchar.cpp
 921                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(titleCase));
 922                 data.p.titleCaseSpecial = true;
 923                 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
 924             } else {
 925                 data.p.titleCaseDiff = diff;
 926                 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
 927             }
 928             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(titleCase)) {
 929                 // if the conditions below doesn't hold anymore we need to modify our title casing code
 930                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
 931                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
 932             }
 933         }
 934
 935         if (!properties[UD_DigitValue].isEmpty())
 936             data.p.digitValue = properties[UD_DigitValue].toInt();
 937
 938         // decompositition
 939         QByteArray decomposition = properties[UD_Decomposition];
 940         if (!decomposition.isEmpty()) {
 941             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
 942             QList<QByteArray> d = decomposition.split(' ');
 943             if (d[0].contains('<')) {
 944                 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
 945                 if (data.decompositionType == QChar::NoDecomposition)
 946                     qFatal("unassigned decomposition type: %s", d[0].constData());
 947                 d.takeFirst();
 948             } else {
 949                 data.decompositionType = QChar::Canonical;
 950             }
 951             for (int i = 0; i < d.size(); ++i) {
 952                 data.decomposition.append(d[i].toInt(&ok, 16));
 953                 Q_ASSERT(ok);
 954             }
 955             ++decompositionLength[data.decomposition.size()];
 956         }
 957
 958         for (int i = codepoint; i <= lastCodepoint; ++i)
 959             unicodeData[i] = data;
 960     }
 961 }
 962
 963 static int maxMirroredDiff = 0;
 964
 965 static void readBidiMirroring()
 966 {
 967     qDebug("Reading BidiMirroring.txt");
 968
 969     QFile f("data/BidiMirroring.txt");
 970     if (!f.exists())
 971         qFatal("Couldn't find BidiMirroring.txt");
 972
 973     f.open(QFile::ReadOnly);
 974
 975     while (!f.atEnd()) {
 976         QByteArray line;
 977         line.resize(1024);
 978         int len = f.readLine(line.data(), 1024);
 979         line.resize(len-1);
 980
 981         int comment = line.indexOf('#');
 982         if (comment >= 0)
 983             line = line.left(comment);
 984
 985         if (line.isEmpty())
 986             continue;
 987         line = line.replace(" ", "");
 988
 989         QList<QByteArray> pair = line.split(';');
 990         Q_ASSERT(pair.size() == 2);
 991
 992         bool ok;
 993         int codepoint = pair[0].toInt(&ok, 16);
 994         Q_ASSERT(ok);
 995         int mirror = pair[1].toInt(&ok, 16);
 996         Q_ASSERT(ok);
 997
 998         UnicodeData &d = UnicodeData::valueRef(codepoint);
 999         d.mirroredChar = mirror;
1000         d.p.mirrorDiff = d.mirroredChar - codepoint;
1001         maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
1002     }
1003 }
1004
1005 static void readArabicShaping()
1006 {
1007     qDebug("Reading ArabicShaping.txt");
1008
1009     QFile f("data/ArabicShaping.txt");
1010     if (!f.exists())
1011         qFatal("Couldn't find ArabicShaping.txt");
1012
1013     f.open(QFile::ReadOnly);
1014
1015     while (!f.atEnd()) {
1016         QByteArray line;
1017         line.resize(1024);
1018         int len = f.readLine(line.data(), 1024);
1019         line.resize(len-1);
1020
1021         int comment = line.indexOf('#');
1022         if (comment >= 0)
1023             line = line.left(comment);
1024         line = line.trimmed();
1025
1026         if (line.isEmpty())
1027             continue;
1028
1029         QList<QByteArray> l = line.split(';');
1030         Q_ASSERT(l.size() == 4);
1031
1032         bool ok;
1033         int codepoint = l[0].toInt(&ok, 16);
1034         Q_ASSERT(ok);
1035
1036         Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
1037         if (joining == Joining_Unassigned)
1038             qFatal("unassigned or unhandled joining value: %s", l[2].constData());
1039
1040         if (joining == Joining_Left) {
1041             // There are currently no characters of joining type Left_Joining defined in Unicode.
1042             qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
1043         }
1044
1045         UnicodeData &d = UnicodeData::valueRef(codepoint);
1046         if (joining == Joining_Right)
1047             d.p.joining = QChar::Right;
1048         else if (joining == Joining_Dual)
1049             d.p.joining = QChar::Dual;
1050         else if (joining == Joining_Causing)
1051             d.p.joining = QChar::Center;
1052         else
1053             d.p.joining = QChar::OtherJoining;
1054     }
1055 }
1056
1057 static void readDerivedAge()
1058 {
1059     qDebug("Reading DerivedAge.txt");
1060
1061     QFile f("data/DerivedAge.txt");
1062     if (!f.exists())
1063         qFatal("Couldn't find DerivedAge.txt");
1064
1065     f.open(QFile::ReadOnly);
1066
1067     while (!f.atEnd()) {
1068         QByteArray line;
1069         line.resize(1024);
1070         int len = f.readLine(line.data(), 1024);
1071         line.resize(len-1);
1072
1073         int comment = line.indexOf('#');
1074         if (comment >= 0)
1075             line = line.left(comment);
1076         line.replace(" ", "");
1077
1078         if (line.isEmpty())
1079             continue;
1080
1081         QList<QByteArray> l = line.split(';');
1082         Q_ASSERT(l.size() == 2);
1083
1084         QByteArray codes = l[0];
1085         codes.replace("..", ".");
1086         QList<QByteArray> cl = codes.split('.');
1087
1088         bool ok;
1089         int from = cl[0].toInt(&ok, 16);
1090         Q_ASSERT(ok);
1091         int to = from;
1092         if (cl.size() == 2) {
1093             to = cl[1].toInt(&ok, 16);
1094             Q_ASSERT(ok);
1095         }
1096
1097         QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1098         //qDebug() << hex << from << ".." << to << ba << age;
1099         if (age == QChar::Unicode_Unassigned)
1100             qFatal("unassigned or unhandled age value: %s", l[1].constData());
1101
1102         for (int codepoint = from; codepoint <= to; ++codepoint) {
1103             UnicodeData &d = UnicodeData::valueRef(codepoint);
1104             d.p.age = age;
1105         }
1106     }
1107 }
1108
1109 static void readDerivedNormalizationProps()
1110 {
1111     qDebug("Reading DerivedNormalizationProps.txt");
1112
1113     QFile f("data/DerivedNormalizationProps.txt");
1114     if (!f.exists())
1115         qFatal("Couldn't find DerivedNormalizationProps.txt");
1116
1117     f.open(QFile::ReadOnly);
1118
1119     while (!f.atEnd()) {
1120         QByteArray line;
1121         line.resize(1024);
1122         int len = f.readLine(line.data(), 1024);
1123         line.resize(len-1);
1124
1125         int comment = line.indexOf('#');
1126         if (comment >= 0)
1127             line = line.left(comment);
1128
1129         if (line.trimmed().isEmpty())
1130             continue;
1131
1132         QList<QByteArray> l = line.split(';');
1133         Q_ASSERT(l.size() >= 2);
1134
1135         QByteArray propName = l[1].trimmed();
1136         if (propName != "Full_Composition_Exclusion")
1137             // ###
1138             continue;
1139
1140         QByteArray codes = l[0].trimmed();
1141         codes.replace("..", ".");
1142         QList<QByteArray> cl = codes.split('.');
1143
1144         bool ok;
1145         int from = cl[0].toInt(&ok, 16);
1146         Q_ASSERT(ok);
1147         int to = from;
1148         if (cl.size() == 2) {
1149             to = cl[1].toInt(&ok, 16);
1150             Q_ASSERT(ok);
1151         }
1152
1153         for (int codepoint = from; codepoint <= to; ++codepoint) {
1154             UnicodeData &d = UnicodeData::valueRef(codepoint);
1155             d.excludedComposition = true;
1156         }
1157     }
1158
1159     for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1160         UnicodeData &d = UnicodeData::valueRef(codepoint);
1161         if (!d.excludedComposition
1162             && d.decompositionType == QChar::Canonical
1163             && d.decomposition.size() > 1) {
1164             Q_ASSERT(d.decomposition.size() == 2);
1165
1166             int part1 = d.decomposition.at(0);
1167             int part2 = d.decomposition.at(1);
1168
1169             // all non-starters are listed in DerivedNormalizationProps.txt
1170             // and already excluded from composition
1171             Q_ASSERT(UnicodeData::valueRef(part1).p.combiningClass == 0);
1172
1173             ++numLigatures;
1174             highestLigature = qMax(highestLigature, part1);
1175             Ligature l = { part1, part2, codepoint };
1176             ligatureHashes[part2].append(l);
1177         }
1178     }
1179 }
1180
1181
1182 struct NormalizationCorrection {
1183     uint codepoint;
1184     uint mapped;
1185     int version;
1186 };
1187
1188 static QByteArray createNormalizationCorrections()
1189 {
1190     qDebug("Reading NormalizationCorrections.txt");
1191
1192     QFile f("data/NormalizationCorrections.txt");
1193     if (!f.exists())
1194         qFatal("Couldn't find NormalizationCorrections.txt");
1195
1196     f.open(QFile::ReadOnly);
1197
1198     QByteArray out;
1199
1200     out += "struct NormalizationCorrection {\n"
1201            "    uint ucs4;\n"
1202            "    uint old_mapping;\n"
1203            "    int version;\n"
1204            "};\n\n"
1205
1206            "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1207
1208     int maxVersion = 0;
1209     int numCorrections = 0;
1210     while (!f.atEnd()) {
1211         QByteArray line;
1212         line.resize(1024);
1213         int len = f.readLine(line.data(), 1024);
1214         line.resize(len-1);
1215
1216         int comment = line.indexOf('#');
1217         if (comment >= 0)
1218             line = line.left(comment);
1219         line.replace(" ", "");
1220
1221         if (line.isEmpty())
1222             continue;
1223
1224         Q_ASSERT(!line.contains(".."));
1225
1226         QList<QByteArray> fields = line.split(';');
1227         Q_ASSERT(fields.size() == 4);
1228
1229         NormalizationCorrection c = { 0, 0, 0 };
1230         bool ok;
1231         c.codepoint = fields.at(0).toInt(&ok, 16);
1232         Q_ASSERT(ok);
1233         c.mapped = fields.at(1).toInt(&ok, 16);
1234         Q_ASSERT(ok);
1235         if (fields.at(3) == "3.2.0")
1236             c.version = QChar::Unicode_3_2;
1237         else if (fields.at(3) == "4.0.0")
1238             c.version = QChar::Unicode_4_0;
1239         else
1240             qFatal("unknown unicode version in NormalizationCorrection.txt");
1241
1242         out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1243              + ", " + QString::number(c.version) + " },\n";
1244         ++numCorrections;
1245         maxVersion = qMax(c.version, maxVersion);
1246     }
1247
1248     out += "};\n\n"
1249
1250            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1251            "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
1252
1253     return out;
1254 }
1255
1256 static void readLineBreak()
1257 {
1258     qDebug("Reading LineBreak.txt");
1259
1260     QFile f("data/LineBreak.txt");
1261     if (!f.exists())
1262         qFatal("Couldn't find LineBreak.txt");
1263
1264     f.open(QFile::ReadOnly);
1265
1266     while (!f.atEnd()) {
1267         QByteArray line;
1268         line.resize(1024);
1269         int len = f.readLine(line.data(), 1024);
1270         line.resize(len-1);
1271
1272         int comment = line.indexOf('#');
1273         if (comment >= 0)
1274             line = line.left(comment);
1275         line.replace(" ", "");
1276
1277         if (line.isEmpty())
1278             continue;
1279
1280         QList<QByteArray> l = line.split(';');
1281         Q_ASSERT(l.size() == 2);
1282
1283         QByteArray codes = l[0];
1284         codes.replace("..", ".");
1285         QList<QByteArray> cl = codes.split('.');
1286
1287         bool ok;
1288         int from = cl[0].toInt(&ok, 16);
1289         Q_ASSERT(ok);
1290         int to = from;
1291         if (cl.size() == 2) {
1292             to = cl[1].toInt(&ok, 16);
1293             Q_ASSERT(ok);
1294         }
1295
1296         LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1297         if (lb == LineBreak_Unassigned)
1298             qFatal("unassigned line break class: %s", l[1].constData());
1299
1300         for (int codepoint = from; codepoint <= to; ++codepoint) {
1301             UnicodeData &d = UnicodeData::valueRef(codepoint);
1302             d.p.line_break_class = lb;
1303         }
1304     }
1305 }
1306
1307 static void readSpecialCasing()
1308 {
1309     qDebug("Reading SpecialCasing.txt");
1310
1311     QFile f("data/SpecialCasing.txt");
1312     if (!f.exists())
1313         qFatal("Couldn't find SpecialCasing.txt");
1314
1315     f.open(QFile::ReadOnly);
1316
1317     while (!f.atEnd()) {
1318         QByteArray line;
1319         line.resize(1024);
1320         int len = f.readLine(line.data(), 1024);
1321         line.resize(len-1);
1322
1323         int comment = line.indexOf('#');
1324         if (comment >= 0)
1325             line = line.left(comment);
1326
1327         if (line.isEmpty())
1328             continue;
1329
1330         QList<QByteArray> l = line.split(';');
1331
1332         QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1333         if (!condition.isEmpty())
1334             // #####
1335             continue;
1336
1337         bool ok;
1338         int codepoint = l[0].trimmed().toInt(&ok, 16);
1339         Q_ASSERT(ok);
1340
1341         // if the condition below doesn't hold anymore we need to modify our
1342         // lower/upper/title casing code and case folding code
1343         Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1344
1345 //         qDebug() << "codepoint" << hex << codepoint;
1346 //         qDebug() << line;
1347
1348         QList<QByteArray> lower = l[1].trimmed().split(' ');
1349         QList<int> lowerMap;
1350         for (int i = 0; i < lower.size(); ++i) {
1351             bool ok;
1352             lowerMap.append(lower.at(i).toInt(&ok, 16));
1353             Q_ASSERT(ok);
1354         }
1355
1356         QList<QByteArray> title = l[2].trimmed().split(' ');
1357         QList<int> titleMap;
1358         for (int i = 0; i < title.size(); ++i) {
1359             bool ok;
1360             titleMap.append(title.at(i).toInt(&ok, 16));
1361             Q_ASSERT(ok);
1362         }
1363
1364         QList<QByteArray> upper = l[3].trimmed().split(' ');
1365         QList<int> upperMap;
1366         for (int i = 0; i < upper.size(); ++i) {
1367             bool ok;
1368             upperMap.append(upper.at(i).toInt(&ok, 16));
1369             Q_ASSERT(ok);
1370         }
1371
1372
1373         UnicodeData &ud = UnicodeData::valueRef(codepoint);
1374         Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1375         Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1376         Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1377
1378         if (lowerMap.size() > 1) {
1379             ud.p.lowerCaseSpecial = true;
1380             ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1381         }
1382         if (titleMap.size() > 1) {
1383             ud.p.titleCaseSpecial = true;
1384             ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1385         }
1386         if (upperMap.size() > 1) {
1387             ud.p.upperCaseSpecial = true;
1388             ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
1389         }
1390     }
1391 }
1392
1393 static int maxCaseFoldDiff = 0;
1394
1395 static void readCaseFolding()
1396 {
1397     qDebug("Reading CaseFolding.txt");
1398
1399     QFile f("data/CaseFolding.txt");
1400     if (!f.exists())
1401         qFatal("Couldn't find CaseFolding.txt");
1402
1403     f.open(QFile::ReadOnly);
1404
1405     while (!f.atEnd()) {
1406         QByteArray line;
1407         line.resize(1024);
1408         int len = f.readLine(line.data(), 1024);
1409         line.resize(len-1);
1410
1411         int comment = line.indexOf('#');
1412         if (comment >= 0)
1413             line = line.left(comment);
1414
1415         if (line.isEmpty())
1416             continue;
1417
1418         QList<QByteArray> l = line.split(';');
1419
1420         bool ok;
1421         int codepoint = l[0].trimmed().toInt(&ok, 16);
1422         Q_ASSERT(ok);
1423
1424
1425         l[1] = l[1].trimmed();
1426         if (l[1] == "F" || l[1] == "T")
1427             continue;
1428
1429 //         qDebug() << "codepoint" << hex << codepoint;
1430 //         qDebug() << line;
1431         QList<QByteArray> fold = l[2].trimmed().split(' ');
1432         QList<int> foldMap;
1433         for (int i = 0; i < fold.size(); ++i) {
1434             bool ok;
1435             foldMap.append(fold.at(i).toInt(&ok, 16));
1436             Q_ASSERT(ok);
1437         }
1438
1439         UnicodeData &ud = UnicodeData::valueRef(codepoint);
1440         if (foldMap.size() == 1) {
1441             int caseFolded = foldMap.at(0);
1442             int diff = caseFolded - codepoint;
1443             if (qAbs(diff) >= (1<<14)) {
1444                 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1445                 // if the condition below doesn't hold anymore we need to modify our special case folding code in qchar.cpp
1446                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(caseFolded));
1447                 ud.p.caseFoldSpecial = true;
1448                 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1449             } else {
1450                 ud.p.caseFoldDiff = diff;
1451                 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
1452             }
1453             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(caseFolded)) {
1454                 // if the conditions below doesn't hold anymore we need to modify our case folding code
1455                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1456                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1457             }
1458 //            if (caseFolded != codepoint + ud.p.lowerCaseDiff)
1459 //                qDebug() << hex << codepoint;
1460         } else {
1461             qFatal("we currently don't support full case foldings");
1462 //             qDebug() << "special" << hex << foldMap;
1463             ud.p.caseFoldSpecial = true;
1464             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1465         }
1466     }
1467 }
1468
1469 static void readGraphemeBreak()
1470 {
1471     qDebug("Reading GraphemeBreakProperty.txt");
1472
1473     QFile f("data/GraphemeBreakProperty.txt");
1474     if (!f.exists())
1475         qFatal("Couldn't find GraphemeBreakProperty.txt");
1476
1477     f.open(QFile::ReadOnly);
1478
1479     while (!f.atEnd()) {
1480         QByteArray line;
1481         line.resize(1024);
1482         int len = f.readLine(line.data(), 1024);
1483         line.resize(len-1);
1484
1485         int comment = line.indexOf('#');
1486         if (comment >= 0)
1487             line = line.left(comment);
1488         line.replace(" ", "");
1489
1490         if (line.isEmpty())
1491             continue;
1492
1493         QList<QByteArray> l = line.split(';');
1494         Q_ASSERT(l.size() == 2);
1495
1496         QByteArray codes = l[0];
1497         codes.replace("..", ".");
1498         QList<QByteArray> cl = codes.split('.');
1499
1500         bool ok;
1501         int from = cl[0].toInt(&ok, 16);
1502         Q_ASSERT(ok);
1503         int to = from;
1504         if (cl.size() == 2) {
1505             to = cl[1].toInt(&ok, 16);
1506             Q_ASSERT(ok);
1507         }
1508
1509         GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1510         if (brk == GraphemeBreak_Unassigned)
1511             qFatal("unassigned grapheme break class: %s", l[1].constData());
1512
1513         for (int codepoint = from; codepoint <= to; ++codepoint) {
1514             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1515             ud.p.graphemeBreak = brk;
1516         }
1517     }
1518 }
1519
1520 static void readWordBreak()
1521 {
1522     qDebug("Reading WordBreakProperty.txt");
1523
1524     QFile f("data/WordBreakProperty.txt");
1525     if (!f.exists())
1526         qFatal("Couldn't find WordBreakProperty.txt");
1527
1528     f.open(QFile::ReadOnly);
1529
1530     while (!f.atEnd()) {
1531         QByteArray line;
1532         line.resize(1024);
1533         int len = f.readLine(line.data(), 1024);
1534         line.resize(len-1);
1535
1536         int comment = line.indexOf('#');
1537         if (comment >= 0)
1538             line = line.left(comment);
1539         line.replace(" ", "");
1540
1541         if (line.isEmpty())
1542             continue;
1543
1544         QList<QByteArray> l = line.split(';');
1545         Q_ASSERT(l.size() == 2);
1546
1547         QByteArray codes = l[0];
1548         codes.replace("..", ".");
1549         QList<QByteArray> cl = codes.split('.');
1550
1551         bool ok;
1552         int from = cl[0].toInt(&ok, 16);
1553         Q_ASSERT(ok);
1554         int to = from;
1555         if (cl.size() == 2) {
1556             to = cl[1].toInt(&ok, 16);
1557             Q_ASSERT(ok);
1558         }
1559
1560         WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned);
1561         if (brk == WordBreak_Unassigned)
1562             qFatal("unassigned word break class: %s", l[1].constData());
1563
1564         for (int codepoint = from; codepoint <= to; ++codepoint) {
1565             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1566             ud.p.wordBreak = brk;
1567         }
1568     }
1569 }
1570
1571 static void readSentenceBreak()
1572 {
1573     qDebug("Reading SentenceBreakProperty.txt");
1574
1575     QFile f("data/SentenceBreakProperty.txt");
1576     if (!f.exists())
1577         qFatal("Couldn't find SentenceBreakProperty.txt");
1578
1579     f.open(QFile::ReadOnly);
1580
1581     while (!f.atEnd()) {
1582         QByteArray line;
1583         line.resize(1024);
1584         int len = f.readLine(line.data(), 1024);
1585         line.resize(len-1);
1586
1587         int comment = line.indexOf('#');
1588         if (comment >= 0)
1589             line = line.left(comment);
1590         line.replace(" ", "");
1591
1592         if (line.isEmpty())
1593             continue;
1594
1595         QList<QByteArray> l = line.split(';');
1596         Q_ASSERT(l.size() == 2);
1597
1598         QByteArray codes = l[0];
1599         codes.replace("..", ".");
1600         QList<QByteArray> cl = codes.split('.');
1601
1602         bool ok;
1603         int from = cl[0].toInt(&ok, 16);
1604         Q_ASSERT(ok);
1605         int to = from;
1606         if (cl.size() == 2) {
1607             to = cl[1].toInt(&ok, 16);
1608             Q_ASSERT(ok);
1609         }
1610
1611         SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
1612         if (brk == SentenceBreak_Unassigned)
1613             qFatal("unassigned sentence break class: %s", l[1].constData());
1614
1615         for (int codepoint = from; codepoint <= to; ++codepoint) {
1616             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1617             ud.p.sentenceBreak = brk;
1618         }
1619     }
1620 }
1621
1622 #if 0
// This piece of code does full case folding and comparison. We currently
// don't use it, since it causes lots of issues with things such as
// case-insensitive search and replace.
1626 static inline void foldCase(uint ch, ushort *out)
1627 {
1628     const QUnicodeTables::Properties *p = qGetProp(ch);
1629     if (!p->caseFoldSpecial) {
1630         *(out++) = ch + p->caseFoldDiff;
1631     } else {
1632         const ushort *folded = specialCaseMap + p->caseFoldDiff;
1633         ushort length = *folded++;
1634         while (length--)
1635             *out++ = *folded++;
1636     }
1637     *out = 0;
1638 }
1639
1640 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
1641 {
1642     if (a == b)
1643         return 0;
1644     if (a == 0)
1645         return 1;
1646     if (b == 0)
1647         return -1;
1648
1649     while (a != ae && b != be) {
1650         const QUnicodeTables::Properties *pa = qGetProp(*a);
1651         const QUnicodeTables::Properties *pb = qGetProp(*b);
1652         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1653             goto special;
1654             int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1655         if ((diff))
1656             return diff;
1657         ++a;
1658         ++b;
1659         }
1660     }
1661     if (a == ae) {
1662         if (b == be)
1663             return 0;
1664         return -1;
1665     }
1666     return 1;
1667 special:
1668     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1669     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1670     abuf[0] = bbuf[0] = 0;
1671     ushort *ap = abuf;
1672     ushort *bp = bbuf;
1673     while (1) {
1674         if (!*ap) {
1675             if (a == ae) {
1676                 if (!*bp && b == be)
1677                     return 0;
1678                 return -1;
1679             }
1680             foldCase(*(a++), abuf);
1681             ap = abuf;
1682         }
1683         if (!*bp) {
1684             if (b == be)
1685                 return 1;
1686             foldCase(*(b++), bbuf);
1687             bp = bbuf;
1688         }
1689         if (*ap != *bp)
1690             return (int)*ap - (int)*bp;
1691         ++ap;
1692         ++bp;
1693     }
1694 }
1695
1696
1697 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
1698 {
1699     if (a == 0)
1700         return 1;
1701     if (b == 0)
1702         return -1;
1703
1704     while (a != ae && *b) {
1705         const QUnicodeTables::Properties *pa = qGetProp(*a);
1706         const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
1707         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1708             goto special;
1709         int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1710         if ((diff))
1711             return diff;
1712         ++a;
1713         ++b;
1714     }
1715     if (a == ae) {
1716         if (!*b)
1717             return 0;
1718         return -1;
1719     }
1720     return 1;
1721
1722 special:
1723     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1724     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1725     abuf[0] = bbuf[0] = 0;
1726     ushort *ap = abuf;
1727     ushort *bp = bbuf;
1728     while (1) {
1729         if (!*ap) {
1730             if (a == ae) {
1731                 if (!*bp && !*b)
1732                     return 0;
1733                 return -1;
1734             }
1735             foldCase(*(a++), abuf);
1736             ap = abuf;
1737         }
1738         if (!*bp) {
1739             if (!*b)
1740                 return 1;
1741             foldCase(*(b++), bbuf);
1742             bp = bbuf;
1743         }
1744         if (*ap != *bp)
1745             return (int)*ap - (int)*bp;
1746         ++ap;
1747         ++bp;
1748     }
1749 }
1750 #endif
1751
1752 #if 0
1753 static QList<QByteArray> blockNames;
1754 struct BlockInfo
1755 {
1756     int blockIndex;
1757     int firstCodePoint;
1758     int lastCodePoint;
1759 };
1760 static QList<BlockInfo> blockInfoList;
1761
1762 static void readBlocks()
1763 {
1764     qDebug("Reading Blocks.txt");
1765
1766     QFile f("data/Blocks.txt");
1767     if (!f.exists())
1768         qFatal("Couldn't find Blocks.txt");
1769
1770     f.open(QFile::ReadOnly);
1771
1772     while (!f.atEnd()) {
1773         QByteArray line = f.readLine();
1774         line.resize(line.size() - 1);
1775
1776         int comment = line.indexOf("#");
1777         if (comment >= 0)
1778             line = line.left(comment);
1779
1780         line.replace(" ", "");
1781
1782         if (line.isEmpty())
1783             continue;
1784
1785         int semicolon = line.indexOf(';');
1786         Q_ASSERT(semicolon >= 0);
1787         QByteArray codePoints = line.left(semicolon);
1788         QByteArray blockName = line.mid(semicolon + 1);
1789
1790         int blockIndex = blockNames.indexOf(blockName);
1791         if (blockIndex == -1) {
1792             blockIndex = blockNames.size();
1793             blockNames.append(blockName);
1794         }
1795
1796         codePoints.replace("..", ".");
1797         QList<QByteArray> cl = codePoints.split('.');
1798
1799         bool ok;
1800         int first = cl[0].toInt(&ok, 16);
1801         Q_ASSERT(ok);
1802         int last = first;
1803         if (cl.size() == 2) {
1804             last = cl[1].toInt(&ok, 16);
1805             Q_ASSERT(ok);
1806         }
1807
1808         BlockInfo blockInfo = { blockIndex, first, last };
1809         blockInfoList.append(blockInfo);
1810     }
1811 }
1812 #endif
1813
1814 static QList<QByteArray> scriptNames;
1815 static QList<int> scriptMap;
1816
1817 static const char *specialScripts[] = {
1818     "Common",
1819     "Greek",
1820     "Cyrillic",
1821     "Armenian",
1822     "Hebrew",
1823     "Arabic",
1824     "Syriac",
1825     "Thaana",
1826     "Devanagari",
1827     "Bengali",
1828     "Gurmukhi",
1829     "Gujarati",
1830     "Oriya",
1831     "Tamil",
1832     "Telugu",
1833     "Kannada",
1834     "Malayalam",
1835     "Sinhala",
1836     "Thai",
1837     "Lao",
1838     "Tibetan",
1839     "Myanmar",
1840     "Georgian",
1841     "Hangul",
1842     "Ogham",
1843     "Runic",
1844     "Khmer",
1845     "Nko",
1846     "Inherited"
1847 };
1848 enum { specialScriptsCount = sizeof(specialScripts) / sizeof(const char *) };
1849
1850 static void readScripts()
1851 {
1852     qDebug("Reading Scripts.txt");
1853
1854     QFile f("data/Scripts.txt");
1855     if (!f.exists())
1856         qFatal("Couldn't find Scripts.txt");
1857
1858     f.open(QFile::ReadOnly);
1859
1860     int scriptsCount = specialScriptsCount;
1861     // ### preserve the old ordering (temporary)
1862     for (int i = 0; i < specialScriptsCount; ++i) {
1863         scriptNames.append(specialScripts[i]);
1864         scriptMap.append(i);
1865     }
1866
1867     while (!f.atEnd()) {
1868         QByteArray line = f.readLine();
1869         line.resize(line.size() - 1);
1870
1871         int comment = line.indexOf("#");
1872         if (comment >= 0)
1873             line = line.left(comment);
1874
1875         line.replace(" ", "");
1876         line.replace("_", "");
1877
1878         if (line.isEmpty())
1879             continue;
1880
1881         int semicolon = line.indexOf(';');
1882         Q_ASSERT(semicolon >= 0);
1883         QByteArray codePoints = line.left(semicolon);
1884         QByteArray scriptName = line.mid(semicolon + 1);
1885
1886         codePoints.replace("..", ".");
1887         QList<QByteArray> cl = codePoints.split('.');
1888
1889         bool ok;
1890         int first = cl[0].toInt(&ok, 16);
1891         Q_ASSERT(ok);
1892         int last = first;
1893         if (cl.size() == 2) {
1894             last = cl[1].toInt(&ok, 16);
1895             Q_ASSERT(ok);
1896         }
1897
1898         int scriptIndex = scriptNames.indexOf(scriptName);
1899         if (scriptIndex == -1) {
1900             scriptIndex = scriptNames.size();
1901             scriptNames.append(scriptName);
1902
1903             // is the script alias for 'Common'?
1904             int s = specialScriptsCount;
1905             while (--s > 0) {
1906                 if (scriptName == specialScripts[s])
1907                     break;
1908             }
1909             scriptMap.append(s > 0 ? scriptsCount++ : 0);
1910         }
1911
1912         for (int codepoint = first; codepoint <= last; ++codepoint) {
1913             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1914             ud.p.script = scriptMap.at(scriptIndex);
1915         }
1916     }
1917 }
1918
1919 static QByteArray createScriptEnumDeclaration()
1920 {
1921     QByteArray declaration;
1922
1923     declaration += "// See http://www.unicode.org/reports/tr24/tr24-5.html\n";
1924     declaration += "enum Script {\n    Common";
1925
1926     // output the ones with special processing first
1927     for (int i = 1; i < scriptNames.size(); ++i) {
1928         if (scriptMap.at(i) == 0)
1929             continue;
1930         declaration += ",\n    ";
1931         declaration += scriptNames.at(i);
1932     }
1933     declaration += ",\n    ScriptCount = Inherited";
1934
1935     // output the ones that are an alias for 'Common'
1936     for (int i = 1; i < scriptNames.size(); ++i) {
1937         if (scriptMap.at(i) != 0)
1938             continue;
1939         declaration += ",\n    ";
1940         declaration += scriptNames.at(i);
1941         declaration += " = Common";
1942     }
1943
1944     declaration += "\n};\n\n";
1945
1946     return declaration;
1947 }
1948
1949 #if 0
1950 static void dump(int from, int to)
1951 {
1952     for (int i = from; i <= to; ++i) {
1953         UnicodeData &d = UnicodeData::valueRef(i);
1954         qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
1955                i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
1956         if (d.decompositionType != QChar::NoDecomposition) {
1957             qDebug("    decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
1958                    d.decomposition[0]);
1959         }
1960     }
1961     qDebug(" ");
1962 }
1963 #endif
1964
1965 static QList<PropertyFlags> uniqueProperties;
1966
1967 static void computeUniqueProperties()
1968 {
1969     qDebug("computeUniqueProperties:");
1970     for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1971         UnicodeData &d = UnicodeData::valueRef(codepoint);
1972         int index = uniqueProperties.indexOf(d.p);
1973         if (index == -1) {
1974             index = uniqueProperties.size();
1975             uniqueProperties.append(d.p);
1976         }
1977         d.propertyIndex = index;
1978     }
1979     qDebug("    %d unique unicode properties found", uniqueProperties.size());
1980 }
1981
1982 struct UniqueBlock {
1983     inline UniqueBlock() : index(-1) {}
1984
1985     inline bool operator==(const UniqueBlock &other) const
1986     { return values == other.values; }
1987
1988     int index;
1989     QVector<int> values;
1990 };
1991
1992 static QByteArray createPropertyInfo()
1993 {
1994     qDebug("createPropertyInfo:");
1995
1996     // we reserve one bit more than in the assert below for the sign
1997     Q_ASSERT(maxMirroredDiff < (1<<12));
1998     Q_ASSERT(maxLowerCaseDiff < (1<<14));
1999     Q_ASSERT(maxUpperCaseDiff < (1<<14));
2000     Q_ASSERT(maxTitleCaseDiff < (1<<14));
2001     Q_ASSERT(maxCaseFoldDiff < (1<<14));
2002
2003     const int BMP_BLOCKSIZE = 32;
2004     const int BMP_SHIFT = 5;
2005     const int BMP_END = 0x11000;
2006     const int SMP_END = 0x110000;
2007     const int SMP_BLOCKSIZE = 256;
2008     const int SMP_SHIFT = 8;
2009
2010     QList<UniqueBlock> uniqueBlocks;
2011     QVector<int> blockMap;
2012     int used = 0;
2013
2014     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2015         UniqueBlock b;
2016         b.values.reserve(BMP_BLOCKSIZE);
2017         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2018             int uc = block*BMP_BLOCKSIZE + i;
2019             UnicodeData &d = UnicodeData::valueRef(uc);
2020             b.values.append(d.propertyIndex);
2021         }
2022         int index = uniqueBlocks.indexOf(b);
2023         if (index == -1) {
2024             index = uniqueBlocks.size();
2025             b.index = used;
2026             used += BMP_BLOCKSIZE;
2027             uniqueBlocks.append(b);
2028         }
2029         blockMap.append(uniqueBlocks.at(index).index);
2030     }
2031     int bmp_blocks = uniqueBlocks.size();
2032
2033     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2034         UniqueBlock b;
2035         b.values.reserve(SMP_BLOCKSIZE);
2036         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2037             int uc = block*SMP_BLOCKSIZE + i;
2038             UnicodeData &d = UnicodeData::valueRef(uc);
2039             b.values.append(d.propertyIndex);
2040         }
2041         int index = uniqueBlocks.indexOf(b);
2042         if (index == -1) {
2043             index = uniqueBlocks.size();
2044             b.index = used;
2045             used += SMP_BLOCKSIZE;
2046             uniqueBlocks.append(b);
2047         }
2048         blockMap.append(uniqueBlocks.at(index).index);
2049     }
2050     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2051
2052     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2053     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2054     int bmp_mem = bmp_block_data + bmp_trie;
2055     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2056     qDebug("        block data uses: %d bytes", bmp_block_data);
2057     qDebug("        trie data uses : %d bytes", bmp_trie);
2058
2059     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2060     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2061     int smp_mem = smp_block_data + smp_trie;
2062     qDebug("    %d unique blocks in SMP.", smp_blocks);
2063     qDebug("        block data uses: %d bytes", smp_block_data);
2064     qDebug("        trie data uses : %d bytes", smp_trie);
2065
2066     int prop_data = uniqueProperties.size() * SizeOfPropertiesStruct;
2067     qDebug("\n        properties data uses : %d bytes", prop_data);
2068     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + prop_data);
2069
2070     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2071
2072     QByteArray out;
2073
2074     out += "static const unsigned short uc_property_trie[] = {\n";
2075     // first write the map
2076     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2077     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2078         if (!(i % 8)) {
2079             if (out.endsWith(' '))
2080                 out.chop(1);
2081             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2082                 out += "\n";
2083             out += "\n    ";
2084         }
2085         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2086         out += ", ";
2087     }
2088     if (out.endsWith(' '))
2089         out.chop(1);
2090     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2091     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2092         if (!(i % 8)) {
2093             if (out.endsWith(' '))
2094                 out.chop(1);
2095             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2096                 out += "\n";
2097             out += "\n    ";
2098         }
2099         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2100         out += ", ";
2101     }
2102     if (out.endsWith(' '))
2103         out.chop(1);
2104     out += "\n";
2105     // write the data
2106     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2107         if (out.endsWith(' '))
2108             out.chop(1);
2109         out += "\n";
2110         const UniqueBlock &b = uniqueBlocks.at(i);
2111         for (int j = 0; j < b.values.size(); ++j) {
2112             if (!(j % 8)) {
2113                 if (out.endsWith(' '))
2114                     out.chop(1);
2115                 out += "\n    ";
2116             }
2117             out += QByteArray::number(b.values.at(j));
2118             out += ", ";
2119         }
2120     }
2121     if (out.endsWith(' '))
2122         out.chop(1);
2123     out += "\n};\n\n";
2124
2125     out += "#define GET_PROP_INDEX(ucs4) \\\n"
2126            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2127            "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2128            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2129            "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2130            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2131            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
2132            "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
2133            "       (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
2134            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n";
2135
2136     out += "static const Properties uc_properties[] = {";
2137     // keep in sync with the property declaration
2138     for (int i = 0; i < uniqueProperties.size(); ++i) {
2139         const PropertyFlags &p = uniqueProperties.at(i);
2140         out += "\n    { ";
2141 //     "        ushort category            : 8; /* 5 used */\n"
2142         out += QByteArray::number( p.category );
2143         out += ", ";
2144 //     "        ushort direction           : 8; /* 5 used */\n"
2145         out += QByteArray::number( p.direction );
2146         out += ", ";
2147 //     "        ushort combiningClass      : 8;\n"
2148         out += QByteArray::number( p.combiningClass );
2149         out += ", ";
2150 //     "        ushort joining             : 2;\n"
2151         out += QByteArray::number( p.joining );
2152         out += ", ";
2153 //     "        signed short digitValue    : 6; /* 5 used */\n"
2154         out += QByteArray::number( p.digitValue );
2155         out += ", ";
2156 //     "        signed short mirrorDiff    : 16;\n"
2157 //     "        signed short lowerCaseDiff : 16;\n"
2158 //     "        signed short upperCaseDiff : 16;\n"
2159 //     "        signed short titleCaseDiff : 16;\n"
2160 //     "        signed short caseFoldDiff  : 16;\n"
2161         out += QByteArray::number( p.mirrorDiff );
2162         out += ", ";
2163         out += QByteArray::number( p.lowerCaseDiff );
2164         out += ", ";
2165         out += QByteArray::number( p.upperCaseDiff );
2166         out += ", ";
2167         out += QByteArray::number( p.titleCaseDiff );
2168         out += ", ";
2169         out += QByteArray::number( p.caseFoldDiff );
2170         out += ", ";
2171 //     "        ushort lowerCaseSpecial    : 1;\n"
2172 //     "        ushort upperCaseSpecial    : 1;\n"
2173 //     "        ushort titleCaseSpecial    : 1;\n"
2174 //     "        ushort caseFoldSpecial     : 1;\n"
2175         out += QByteArray::number( p.lowerCaseSpecial );
2176         out += ", ";
2177         out += QByteArray::number( p.upperCaseSpecial );
2178         out += ", ";
2179         out += QByteArray::number( p.titleCaseSpecial );
2180         out += ", ";
2181         out += QByteArray::number( p.caseFoldSpecial );
2182         out += ", ";
2183 //     "        ushort unicodeVersion      : 4;\n"
2184         out += QByteArray::number( p.age );
2185         out += ", ";
2186 //     "        ushort graphemeBreak       : 8; /* 4 used */\n"
2187 //     "        ushort wordBreak           : 8; /* 4 used */\n"
2188 //     "        ushort sentenceBreak       : 8; /* 4 used */\n"
2189 //     "        ushort line_break_class    : 8; /* 6 used */\n"
2190         out += QByteArray::number( p.graphemeBreak );
2191         out += ", ";
2192         out += QByteArray::number( p.wordBreak );
2193         out += ", ";
2194         out += QByteArray::number( p.sentenceBreak );
2195         out += ", ";
2196         out += QByteArray::number( p.line_break_class );
2197         out += ", ";
2198 //     "        ushort script              : 8; /* 5 used */\n"
2199         out += QByteArray::number( p.script );
2200         out += " },";
2201     }
2202     out.chop(1);
2203     out += "\n};\n\n";
2204
2205
2206     out += "static inline const Properties *qGetProp(uint ucs4)\n"
2207            "{\n"
2208            "    const int index = GET_PROP_INDEX(ucs4);\n"
2209            "    return uc_properties + index;\n"
2210            "}\n"
2211            "\n"
2212            "static inline const Properties *qGetProp(ushort ucs2)\n"
2213            "{\n"
2214            "    const int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2215            "    return uc_properties + index;\n"
2216            "}\n"
2217            "\n"
2218            "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4)\n"
2219            "{\n"
2220            "    return qGetProp(ucs4);\n"
2221            "}\n"
2222            "\n"
2223            "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2)\n"
2224            "{\n"
2225            "    return qGetProp(ucs2);\n"
2226            "}\n\n";
2227
2228     out += "Q_CORE_EXPORT GraphemeBreak QT_FASTCALL graphemeBreakClass(uint ucs4)\n"
2229            "{\n"
2230            "    return (GraphemeBreak)qGetProp(ucs4)->graphemeBreak;\n"
2231            "}\n"
2232            "\n"
2233            "Q_CORE_EXPORT WordBreak QT_FASTCALL wordBreakClass(uint ucs4)\n"
2234            "{\n"
2235            "    return (WordBreak)qGetProp(ucs4)->wordBreak;\n"
2236            "}\n"
2237            "\n"
2238            "Q_CORE_EXPORT SentenceBreak QT_FASTCALL sentenceBreakClass(uint ucs4)\n"
2239            "{\n"
2240            "    return (SentenceBreak)qGetProp(ucs4)->sentenceBreak;\n"
2241            "}\n"
2242            "\n"
2243            "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4)\n"
2244            "{\n"
2245            "    return (LineBreakClass)qGetProp(ucs4)->line_break_class;\n"
2246            "}\n"
2247            "\n"
2248            "Q_CORE_EXPORT Script QT_FASTCALL script(uint ucs4)\n"
2249            "{\n"
2250            "    return (Script)qGetProp(ucs4)->script;\n"
2251            "}\n\n";
2252
2253     return out;
2254 }
2255
2256 static QByteArray createSpecialCaseMap()
2257 {
2258     qDebug("createSpecialCaseMap:");
2259
2260     QByteArray out;
2261
2262     out += "static const ushort specialCaseMap[] = {\n"
2263            "    0x0, // placeholder";
2264     int i = 1;
2265     while (i < specialCaseMap.size()) {
2266         out += "\n   ";
2267         int n = specialCaseMap.at(i);
2268         for (int j = 0; j <= n; ++j) {
2269             out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
2270             out += ",";
2271         }
2272         i += n + 1;
2273     }
2274     out.chop(1);
2275     out += "\n};\n\n";
2276
2277     qDebug("    memory usage: %d bytes", specialCaseMap.size()*sizeof(unsigned short));
2278
2279     return out;
2280 }
2281
2282
2283 static QByteArray createCompositionInfo()
2284 {
2285     qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);
2286
2287     const int BMP_BLOCKSIZE = 16;
2288     const int BMP_SHIFT = 4;
2289     const int BMP_END = 0x3400; // start of Han
2290     const int SMP_END = 0x30000;
2291     const int SMP_BLOCKSIZE = 256;
2292     const int SMP_SHIFT = 8;
2293
2294     if (SMP_END <= highestComposedCharacter)
2295         qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);
2296
2297     QVector<unsigned short> decompositions;
2298     int tableIndex = 0;
2299
2300     QList<UniqueBlock> uniqueBlocks;
2301     QVector<int> blockMap;
2302     int used = 0;
2303
2304     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2305         UniqueBlock b;
2306         b.values.reserve(BMP_BLOCKSIZE);
2307         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2308             int uc = block*BMP_BLOCKSIZE + i;
2309             UnicodeData &d = UnicodeData::valueRef(uc);
2310             if (!d.decomposition.isEmpty()) {
2311                 int utf16Length = 0;
2312                 decompositions.append(0);
2313                 for (int j = 0; j < d.decomposition.size(); ++j) {
2314                     int code = d.decomposition.at(j);
2315                     if (QChar::requiresSurrogates(code)) {
2316                         // save as surrogate pair
2317                         decompositions.append(QChar::highSurrogate(code));
2318                         decompositions.append(QChar::lowSurrogate(code));
2319                         utf16Length += 2;
2320                     } else {
2321                         decompositions.append(code);
2322                         utf16Length++;
2323                     }
2324                 }
2325                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2326                 b.values.append(tableIndex);
2327                 tableIndex += utf16Length + 1;
2328             } else {
2329                 b.values.append(0xffff);
2330             }
2331         }
2332         int index = uniqueBlocks.indexOf(b);
2333         if (index == -1) {
2334             index = uniqueBlocks.size();
2335             b.index = used;
2336             used += BMP_BLOCKSIZE;
2337             uniqueBlocks.append(b);
2338         }
2339         blockMap.append(uniqueBlocks.at(index).index);
2340     }
2341     int bmp_blocks = uniqueBlocks.size();
2342
2343     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2344         UniqueBlock b;
2345         b.values.reserve(SMP_BLOCKSIZE);
2346         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2347             int uc = block*SMP_BLOCKSIZE + i;
2348             UnicodeData &d = UnicodeData::valueRef(uc);
2349             if (!d.decomposition.isEmpty()) {
2350                 int utf16Length = 0;
2351                 decompositions.append(0);
2352                 for (int j = 0; j < d.decomposition.size(); ++j) {
2353                     int code = d.decomposition.at(j);
2354                     if (QChar::requiresSurrogates(code)) {
2355                         // save as surrogate pair
2356                         decompositions.append(QChar::highSurrogate(code));
2357                         decompositions.append(QChar::lowSurrogate(code));
2358                         utf16Length += 2;
2359                     } else {
2360                         decompositions.append(code);
2361                         utf16Length++;
2362                     }
2363                 }
2364                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2365                 b.values.append(tableIndex);
2366                 tableIndex += utf16Length + 1;
2367             } else {
2368                 b.values.append(0xffff);
2369             }
2370         }
2371         int index = uniqueBlocks.indexOf(b);
2372         if (index == -1) {
2373             index = uniqueBlocks.size();
2374             b.index = used;
2375             used += SMP_BLOCKSIZE;
2376             uniqueBlocks.append(b);
2377         }
2378         blockMap.append(uniqueBlocks.at(index).index);
2379     }
2380     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2381
2382     // if the condition below doesn't hold anymore we need to modify our decomposition code
2383     Q_ASSERT(tableIndex < 0xffff);
2384
2385     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2386     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2387     int bmp_mem = bmp_block_data + bmp_trie;
2388     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2389     qDebug("        block data uses: %d bytes", bmp_block_data);
2390     qDebug("        trie data uses : %d bytes", bmp_trie);
2391
2392     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2393     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2394     int smp_mem = smp_block_data + smp_trie;
2395     qDebug("    %d unique blocks in SMP.", smp_blocks);
2396     qDebug("        block data uses: %d bytes", smp_block_data);
2397     qDebug("        trie data uses : %d bytes", smp_trie);
2398
2399     int decomposition_data = decompositions.size() * 2;
2400     qDebug("\n        decomposition data uses : %d bytes", decomposition_data);
2401     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + decomposition_data);
2402
2403     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2404
2405     QByteArray out;
2406
2407     out += "static const unsigned short uc_decomposition_trie[] = {\n";
2408     // first write the map
2409     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2410     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2411         if (!(i % 8)) {
2412             if (out.endsWith(' '))
2413                 out.chop(1);
2414             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2415                 out += "\n";
2416             out += "\n    ";
2417         }
2418         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2419         out += ", ";
2420     }
2421     if (out.endsWith(' '))
2422         out.chop(1);
2423     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2424     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2425         if (!(i % 8)) {
2426             if (out.endsWith(' '))
2427                 out.chop(1);
2428             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2429                 out += "\n";
2430             out += "\n    ";
2431         }
2432         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2433         out += ", ";
2434     }
2435     if (out.endsWith(' '))
2436         out.chop(1);
2437     out += "\n";
2438     // write the data
2439     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2440         if (out.endsWith(' '))
2441             out.chop(1);
2442         out += "\n";
2443         const UniqueBlock &b = uniqueBlocks.at(i);
2444         for (int j = 0; j < b.values.size(); ++j) {
2445             if (!(j % 8)) {
2446                 if (out.endsWith(' '))
2447                     out.chop(1);
2448                 out += "\n    ";
2449             }
2450             out += "0x" + QByteArray::number(b.values.at(j), 16);
2451             out += ", ";
2452         }
2453     }
2454     if (out.endsWith(' '))
2455         out.chop(2);
2456     out += "\n};\n\n";
2457
2458     out += "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2459            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2460            "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2461            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2462            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2463            "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2464            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2465            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2466            "           : 0xffff))\n\n";
2467
2468     out += "static const unsigned short uc_decomposition_map[] = {";
2469     for (int i = 0; i < decompositions.size(); ++i) {
2470         if (!(i % 8)) {
2471             if (out.endsWith(' '))
2472                 out.chop(1);
2473             out += "\n    ";
2474         }
2475         out += "0x" + QByteArray::number(decompositions.at(i), 16);
2476         out += ", ";
2477     }
2478     if (out.endsWith(' '))
2479         out.chop(2);
2480     out += "\n};\n\n";
2481
2482     return out;
2483 }
2484
2485 static QByteArray createLigatureInfo()
2486 {
2487     qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
2488
2489     for (int i = 0; i < ligatureHashes.size(); ++i) {
2490         const QList<Ligature> &l = ligatureHashes.value(i);
2491         for (int j = 0; j < l.size(); ++j) {
2492             // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2493             Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2494                      QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
2495         }
2496     }
2497
2498     const int BMP_BLOCKSIZE = 32;
2499     const int BMP_SHIFT = 5;
2500     const int BMP_END = 0x3100;
2501     const int SMP_END = 0x12000;
2502     const int SMP_BLOCKSIZE = 256;
2503     const int SMP_SHIFT = 8;
2504
2505     if (SMP_END <= highestLigature)
2506         qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
2507
2508     QList<unsigned short> ligatures;
2509     int tableIndex = 0;
2510
2511     QList<UniqueBlock> uniqueBlocks;
2512     QVector<int> blockMap;
2513     int used = 0;
2514
2515     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2516         UniqueBlock b;
2517         b.values.reserve(BMP_BLOCKSIZE);
2518         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2519             int uc = block*BMP_BLOCKSIZE + i;
2520             QList<Ligature> l = ligatureHashes.value(uc);
2521             if (!l.isEmpty()) {
2522                 Q_ASSERT(!QChar::requiresSurrogates(uc));
2523                 qSort(l); // needed for bsearch in ligatureHelper code
2524
2525                 ligatures.append(l.size());
2526                 for (int j = 0; j < l.size(); ++j) {
2527                     ligatures.append(l.at(j).u1);
2528                     ligatures.append(l.at(j).ligature);
2529                 }
2530                 b.values.append(tableIndex);
2531                 tableIndex += 2*l.size() + 1;
2532             } else {
2533                 b.values.append(0xffff);
2534             }
2535         }
2536         int index = uniqueBlocks.indexOf(b);
2537         if (index == -1) {
2538             index = uniqueBlocks.size();
2539             b.index = used;
2540             used += BMP_BLOCKSIZE;
2541             uniqueBlocks.append(b);
2542         }
2543         blockMap.append(uniqueBlocks.at(index).index);
2544     }
2545     int bmp_blocks = uniqueBlocks.size();
2546
2547     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2548         UniqueBlock b;
2549         b.values.reserve(SMP_BLOCKSIZE);
2550         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2551             int uc = block*SMP_BLOCKSIZE + i;
2552             QList<Ligature> l = ligatureHashes.value(uc);
2553             if (!l.isEmpty()) {
2554                 Q_ASSERT(QChar::requiresSurrogates(uc));
2555                 qSort(l); // needed for bsearch in ligatureHelper code
2556
2557                 ligatures.append(l.size());
2558                 for (int j = 0; j < l.size(); ++j) {
2559                     ligatures.append(QChar::highSurrogate(l.at(j).u1));
2560                     ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2561                     ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2562                     ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2563                 }
2564                 b.values.append(tableIndex);
2565                 tableIndex += 4*l.size() + 1;
2566             } else {
2567                 b.values.append(0xffff);
2568             }
2569         }
2570         int index = uniqueBlocks.indexOf(b);
2571         if (index == -1) {
2572             index = uniqueBlocks.size();
2573             b.index = used;
2574             used += SMP_BLOCKSIZE;
2575             uniqueBlocks.append(b);
2576         }
2577         blockMap.append(uniqueBlocks.at(index).index);
2578     }
2579     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2580
2581     // if the condition below doesn't hold anymore we need to modify our composition code
2582     Q_ASSERT(tableIndex < 0xffff);
2583
2584     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2585     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2586     int bmp_mem = bmp_block_data + bmp_trie;
2587     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2588     qDebug("        block data uses: %d bytes", bmp_block_data);
2589     qDebug("        trie data uses : %d bytes", bmp_trie);
2590
2591     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2592     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2593     int smp_mem = smp_block_data + smp_trie;
2594     qDebug("    %d unique blocks in SMP.", smp_blocks);
2595     qDebug("        block data uses: %d bytes", smp_block_data);
2596     qDebug("        trie data uses : %d bytes", smp_trie);
2597
2598     int ligature_data = ligatures.size() * 2;
2599     qDebug("\n        ligature data uses : %d bytes", ligature_data);
2600     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + ligature_data);
2601
2602     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2603
2604     QByteArray out;
2605
2606     out += "static const unsigned short uc_ligature_trie[] = {\n";
2607     // first write the map
2608     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2609     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2610         if (!(i % 8)) {
2611             if (out.endsWith(' '))
2612                 out.chop(1);
2613             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2614                 out += "\n";
2615             out += "\n    ";
2616         }
2617         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2618         out += ", ";
2619     }
2620     if (out.endsWith(' '))
2621         out.chop(1);
2622     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2623     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2624         if (!(i % 8)) {
2625             if (out.endsWith(' '))
2626                 out.chop(1);
2627             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2628                 out += "\n";
2629             out += "\n    ";
2630         }
2631         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2632         out += ", ";
2633     }
2634     if (out.endsWith(' '))
2635         out.chop(1);
2636     out += "\n";
2637     // write the data
2638     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2639         if (out.endsWith(' '))
2640             out.chop(1);
2641         out += "\n";
2642         const UniqueBlock &b = uniqueBlocks.at(i);
2643         for (int j = 0; j < b.values.size(); ++j) {
2644             if (!(j % 8)) {
2645                 if (out.endsWith(' '))
2646                     out.chop(1);
2647                 out += "\n    ";
2648             }
2649             out += "0x" + QByteArray::number(b.values.at(j), 16);
2650             out += ", ";
2651         }
2652     }
2653     if (out.endsWith(' '))
2654         out.chop(2);
2655     out += "\n};\n\n";
2656
2657     out += "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2658            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2659            "        ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2660            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2661            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2662            "           ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2663            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2664            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2665            "           : 0xffff))\n\n";
2666
2667     out += "static const unsigned short uc_ligature_map[] = {";
2668     for (int i = 0; i < ligatures.size(); ++i) {
2669         if (!(i % 8)) {
2670             if (out.endsWith(' '))
2671                 out.chop(1);
2672             out += "\n    ";
2673         }
2674         out += "0x" + QByteArray::number(ligatures.at(i), 16);
2675         out += ", ";
2676     }
2677     if (out.endsWith(' '))
2678         out.chop(2);
2679     out += "\n};\n\n";
2680
2681     return out;
2682 }
2683
2684 QByteArray createCasingInfo()
2685 {
2686     QByteArray out;
2687
2688     out += "struct CasingInfo {\n"
2689            "    uint codePoint : 16;\n"
2690            "    uint flags : 8;\n"
2691            "    uint offset : 8;\n"
2692            "};\n\n";
2693
2694     return out;
2695 }
2696
2697
2698 int main(int, char **)
2699 {
2700     initAgeMap();
2701     initCategoryMap();
2702     initDecompositionMap();
2703     initDirectionMap();
2704     initJoiningMap();
2705     initGraphemeBreak();
2706     initWordBreak();
2707     initSentenceBreak();
2708     initLineBreak();
2709
2710     readUnicodeData();
2711     readBidiMirroring();
2712     readArabicShaping();
2713     readDerivedAge();
2714     readDerivedNormalizationProps();
2715     readSpecialCasing();
2716     readCaseFolding();
2717     // readBlocks();
2718     readScripts();
2719     readGraphemeBreak();
2720     readWordBreak();
2721     readSentenceBreak();
2722     readLineBreak();
2723
2724     computeUniqueProperties();
2725     QByteArray properties = createPropertyInfo();
2726     QByteArray specialCases = createSpecialCaseMap();
2727     QByteArray compositions = createCompositionInfo();
2728     QByteArray ligatures = createLigatureInfo();
2729     QByteArray normalizationCorrections = createNormalizationCorrections();
2730     QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
2731
2732     QByteArray header =
2733         "/****************************************************************************\n"
2734         "**\n"
2735         "** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).\n"
2736         "** Contact: http://www.qt-project.org/\n"
2737         "**\n"
2738         "** This file is part of the QtCore module of the Qt Toolkit.\n"
2739         "**\n"
2740         "** $QT_BEGIN_LICENSE:LGPL$\n"
2741         "** GNU Lesser General Public License Usage\n"
2742         "** This file may be used under the terms of the GNU Lesser General Public\n"
2743         "** License version 2.1 as published by the Free Software Foundation and\n"
2744         "** appearing in the file LICENSE.LGPL included in the packaging of this\n"
2745         "** file. Please review the following information to ensure the GNU Lesser\n"
2746         "** General Public License version 2.1 requirements will be met:\n"
2747         "** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2748         "**\n"
2749         "** In addition, as a special exception, Nokia gives you certain additional\n"
2750         "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2751         "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2752         "**\n"
2753         "** GNU General Public License Usage\n"
2754         "** Alternatively, this file may be used under the terms of the GNU General\n"
2755         "** Public License version 3.0 as published by the Free Software Foundation\n"
2756         "** and appearing in the file LICENSE.GPL included in the packaging of this\n"
2757         "** file. Please review the following information to ensure the GNU General\n"
2758         "** Public License version 3.0 requirements will be met:\n"
2759         "** http://www.gnu.org/copyleft/gpl.html.\n"
2760         "**\n"
2761         "** Other Usage\n"
2762         "** Alternatively, this file may be used in accordance with the terms and\n"
2763         "** conditions contained in a signed written agreement between you and Nokia.\n"
2764         "**\n"
2765         "**\n"
2766         "**\n"
2767         "**\n"
2768         "**\n"
2769         "**\n"
2770         "** $QT_END_LICENSE$\n"
2771         "**\n"
2772         "****************************************************************************/\n\n";
2773
2774     QByteArray note =
2775         "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
2776
2777     QByteArray warning =
2778         "//\n"
2779         "//  W A R N I N G\n"
2780         "//  -------------\n"
2781         "//\n"
2782         "// This file is not part of the Qt API.  It exists for the convenience\n"
2783         "// of internal files.  This header file may change from version to version\n"
2784         "// without notice, or even be removed.\n"
2785         "//\n"
2786         "// We mean it.\n"
2787         "//\n\n";
2788
2789     QFile f("../../src/corelib/tools/qunicodetables.cpp");
2790     f.open(QFile::WriteOnly|QFile::Truncate);
2791     f.write(header);
2792     f.write(note);
2793     f.write("#include \"qunicodetables_p.h\"\n\n");
2794     f.write("QT_BEGIN_NAMESPACE\n\n");
2795     f.write("namespace QUnicodeTables {\n\n");
2796     f.write(properties);
2797     f.write("\n");
2798     f.write(specialCases);
2799     f.write("\n");
2800     f.write(compositions);
2801     f.write(ligatures);
2802     f.write("\n");
2803     f.write(normalizationCorrections);
2804     f.write("} // namespace QUnicodeTables\n\n");
2805     f.write("using namespace QUnicodeTables;\n\n");
2806     f.write("QT_END_NAMESPACE\n");
2807     f.close();
2808
2809     f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2810     f.open(QFile::WriteOnly | QFile::Truncate);
2811     f.write(header);
2812     f.write(note);
2813     f.write(warning);
2814     f.write("#ifndef QUNICODETABLES_P_H\n"
2815             "#define QUNICODETABLES_P_H\n\n"
2816             "#include <QtCore/qchar.h>\n\n"
2817             "QT_BEGIN_NAMESPACE\n\n");
2818     f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
2819     f.write("namespace QUnicodeTables {\n\n");
2820     f.write(property_string);
2821     f.write(scriptEnumDeclaration);
2822     f.write(grapheme_break_string);
2823     f.write(word_break_string);
2824     f.write(sentence_break_string);
2825     f.write(line_break_class_string);
2826     f.write(methods);
2827     f.write("} // namespace QUnicodeTables\n\n"
2828             "QT_END_NAMESPACE\n\n"
2829             "#endif // QUNICODETABLES_P_H\n");
2830     f.close();
2831
2832     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
2833     qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2834     qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2835     qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2836     qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
2837 #if 0
2838 //     dump(0, 0x7f);
2839 //     dump(0x620, 0x640);
2840 //     dump(0x10000, 0x10020);
2841 //     dump(0x10800, 0x10820);
2842
2843     qDebug("decompositionLength used:");
2844     int totalcompositions = 0;
2845     int sum = 0;
2846     for (int i = 1; i < 20; ++i) {
2847         qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
2848         totalcompositions += i*decompositionLength.value(i, 0);
2849         sum += decompositionLength.value(i, 0);
2850     }
2851     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
2852            totalcompositions, (float)totalcompositions/(float)sum, sum);
2853     qDebug("highest composed character %x", highestComposedCharacter);
2854     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
2855
2856     qBubbleSort(ligatures);
2857     for (int i = 0; i < ligatures.size(); ++i)
2858         qDebug("%s", ligatures.at(i).data());
2859
2860 //     qDebug("combiningClass usage:");
2861 //     int numClasses = 0;
2862 //     for (int i = 0; i < 255; ++i) {
2863 //         int num = combiningClassUsage.value(i, 0);
2864 //         if (num) {
2865 //             ++numClasses;
2866 //             qDebug("    combiningClass %d used %d times", i, num);
2867 //         }
2868 //     }
2869 //     qDebug("total of %d combining classes used", numClasses);
2870
2871 #endif
2872 }