util/unicode/main.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the utils of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include <qlist.h>
  43 #include <qhash.h>
  44 #include <qfile.h>
  45 #include <qbytearray.h>
  46 #include <qstring.h>
  47 #include <qchar.h>
  48 #include <qvector.h>
  49 #include <qdebug.h>
  50 #if 0
  51 #include <private/qunicodetables_p.h>
  52 #endif
  53
// Unicode data version handled by this tool: once as a plain version string
// and once as the spelling of the matching QChar::UnicodeVersion enumerator.
// NOTE(review): the code consuming these macros is outside this view.
#define DATA_VERSION_S "5.0"
#define DATA_VERSION_STR "QChar::Unicode_5_0"

// Highest code point accepted while parsing (checked with Q_ASSERT in
// readUnicodeData()), as a number and as text.
#define LAST_CODEPOINT 0x10ffff
#define LAST_CODEPOINT_STR "0x10ffff"
  59
  60
  61 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
  62
  63 static void initAgeMap()
  64 {
  65     struct AgeMap {
  66         const QChar::UnicodeVersion version;
  67         const char *age;
  68     } ageMap[] = {
  69         { QChar::Unicode_1_1,   "1.1" },
  70         { QChar::Unicode_2_0,   "2.0" },
  71         { QChar::Unicode_2_1_2, "2.1" },
  72         { QChar::Unicode_3_0,   "3.0" },
  73         { QChar::Unicode_3_1,   "3.1" },
  74         { QChar::Unicode_3_2,   "3.2" },
  75         { QChar::Unicode_4_0,   "4.0" },
  76         { QChar::Unicode_4_1,   "4.1" },
  77         { QChar::Unicode_5_0,   "5.0" },
  78         { QChar::Unicode_Unassigned, 0 }
  79     };
  80     AgeMap *d = ageMap;
  81     while (d->age) {
  82         age_map.insert(d->age, d->version);
  83         ++d;
  84     }
  85 }
  86
  87
  88 enum Joining {
  89     Joining_None,
  90     Joining_Left,
  91     Joining_Causing,
  92     Joining_Dual,
  93     Joining_Right,
  94     Joining_Transparent
  95
  96     , Joining_Unassigned
  97 };
  98
  99 static QHash<QByteArray, Joining> joining_map;
 100
 101 static void initJoiningMap()
 102 {
 103     struct JoiningList {
 104         Joining joining;
 105         const char *name;
 106     } joinings[] = {
 107         { Joining_None,        "U" },
 108         { Joining_Left,        "L" },
 109         { Joining_Causing,     "C" },
 110         { Joining_Dual,        "D" },
 111         { Joining_Right,       "R" },
 112         { Joining_Transparent, "T" },
 113         { Joining_Unassigned, 0 }
 114     };
 115     JoiningList *d = joinings;
 116     while (d->name) {
 117         joining_map.insert(d->name, d->joining);
 118         ++d;
 119     }
 120 }
 121
 122
// C++ source text declaring an enum GraphemeBreak with the same enumerators,
// in the same order, as the GraphemeBreak enum of this tool (without the
// tool-internal GraphemeBreak_Unassigned value).
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view. Keep it in sync with the enum.
static const char *grapheme_break_string =
    "    enum GraphemeBreak {\n"
    "        GraphemeBreakOther,\n"
    "        GraphemeBreakCR,\n"
    "        GraphemeBreakLF,\n"
    "        GraphemeBreakControl,\n"
    "        GraphemeBreakExtend,\n"
    "        GraphemeBreakL,\n"
    "        GraphemeBreakV,\n"
    "        GraphemeBreakT,\n"
    "        GraphemeBreakLV,\n"
    "        GraphemeBreakLVT\n"
    "    };\n\n";
 136
 137 enum GraphemeBreak {
 138     GraphemeBreakOther,
 139     GraphemeBreakCR,
 140     GraphemeBreakLF,
 141     GraphemeBreakControl,
 142     GraphemeBreakExtend,
 143     GraphemeBreakL,
 144     GraphemeBreakV,
 145     GraphemeBreakT,
 146     GraphemeBreakLV,
 147     GraphemeBreakLVT
 148
 149     , GraphemeBreak_Unassigned
 150 };
 151
 152 static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
 153
 154 static void initGraphemeBreak()
 155 {
 156     struct GraphemeBreakList {
 157         GraphemeBreak brk;
 158         const char *name;
 159     } breaks[] = {
 160         { GraphemeBreakOther, "Other" },
 161         { GraphemeBreakCR, "CR" },
 162         { GraphemeBreakLF, "LF" },
 163         { GraphemeBreakControl, "Control" },
 164         { GraphemeBreakExtend, "Extend" },
 165         { GraphemeBreakL, "L" },
 166         { GraphemeBreakV, "V" },
 167         { GraphemeBreakT, "T" },
 168         { GraphemeBreakLV, "LV" },
 169         { GraphemeBreakLVT, "LVT" },
 170         { GraphemeBreak_Unassigned, 0 }
 171     };
 172     GraphemeBreakList *d = breaks;
 173     while (d->name) {
 174         grapheme_break_map.insert(d->name, d->brk);
 175         ++d;
 176     }
 177 }
 178
 179
// C++ source text declaring an enum WordBreak with the same enumerators, in
// the same order, as the WordBreak enum of this tool (without the
// tool-internal WordBreak_Unassigned value).
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view. Keep it in sync with the enum.
static const char *word_break_string =
    "    enum WordBreak {\n"
    "        WordBreakOther,\n"
    "        WordBreakFormat,\n"
    "        WordBreakKatakana,\n"
    "        WordBreakALetter,\n"
    "        WordBreakMidLetter,\n"
    "        WordBreakMidNum,\n"
    "        WordBreakNumeric,\n"
    "        WordBreakExtendNumLet\n"
    "    };\n\n";
 191
 192 enum WordBreak {
 193     WordBreakOther,
 194     WordBreakFormat,
 195     WordBreakKatakana,
 196     WordBreakALetter,
 197     WordBreakMidLetter,
 198     WordBreakMidNum,
 199     WordBreakNumeric,
 200     WordBreakExtendNumLet
 201
 202     , WordBreak_Unassigned
 203 };
 204
 205 static QHash<QByteArray, WordBreak> word_break_map;
 206
 207 static void initWordBreak()
 208 {
 209     struct WordBreakList {
 210         WordBreak brk;
 211         const char *name;
 212     } breaks[] = {
 213         { WordBreakFormat, "Format" },
 214         { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
 215         { WordBreakKatakana, "Katakana" },
 216         { WordBreakALetter, "ALetter" },
 217         { WordBreakMidLetter, "MidLetter" },
 218         { WordBreakMidNum, "MidNum" },
 219         { WordBreakNumeric, "Numeric" },
 220         { WordBreakExtendNumLet, "ExtendNumLet" },
 221         { WordBreak_Unassigned, 0 }
 222     };
 223     WordBreakList *d = breaks;
 224     while (d->name) {
 225         word_break_map.insert(d->name, d->brk);
 226         ++d;
 227     }
 228 }
 229
 230
// C++ source text declaring an enum SentenceBreak with the same enumerators,
// in the same order, as the SentenceBreak enum of this tool (without the
// tool-internal SentenceBreak_Unassigned value).
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view. Keep it in sync with the enum.
static const char *sentence_break_string =
    "    enum SentenceBreak {\n"
    "        SentenceBreakOther,\n"
    "        SentenceBreakSep,\n"
    "        SentenceBreakFormat,\n"
    "        SentenceBreakSp,\n"
    "        SentenceBreakLower,\n"
    "        SentenceBreakUpper,\n"
    "        SentenceBreakOLetter,\n"
    "        SentenceBreakNumeric,\n"
    "        SentenceBreakATerm,\n"
    "        SentenceBreakSTerm,\n"
    "        SentenceBreakClose\n"
    "    };\n\n";
 245
 246 enum SentenceBreak {
 247     SentenceBreakOther,
 248     SentenceBreakSep,
 249     SentenceBreakFormat,
 250     SentenceBreakSp,
 251     SentenceBreakLower,
 252     SentenceBreakUpper,
 253     SentenceBreakOLetter,
 254     SentenceBreakNumeric,
 255     SentenceBreakATerm,
 256     SentenceBreakSTerm,
 257     SentenceBreakClose
 258
 259     , SentenceBreak_Unassigned
 260 };
 261
 262 static QHash<QByteArray, SentenceBreak> sentence_break_map;
 263
 264 static void initSentenceBreak()
 265 {
 266     struct SentenceBreakList {
 267         SentenceBreak brk;
 268         const char *name;
 269     } breaks[] = {
 270         { SentenceBreakOther, "Other" },
 271         { SentenceBreakSep, "Sep" },
 272         { SentenceBreakFormat, "Format" },
 273         { SentenceBreakSp, "Sp" },
 274         { SentenceBreakLower, "Lower" },
 275         { SentenceBreakUpper, "Upper" },
 276         { SentenceBreakOLetter, "OLetter" },
 277         { SentenceBreakNumeric, "Numeric" },
 278         { SentenceBreakATerm, "ATerm" },
 279         { SentenceBreakSTerm, "STerm" },
 280         { SentenceBreakClose, "Close" },
 281         { SentenceBreak_Unassigned, 0 }
 282     };
 283     SentenceBreakList *d = breaks;
 284     while (d->name) {
 285         sentence_break_map.insert(d->name, d->brk);
 286         ++d;
 287     }
 288 }
 289
 290
// C++ source text (including its own leading comments) declaring an enum
// LineBreakClass with the same enumerators, in the same order, as the
// LineBreakClass enum of this tool (without the tool-internal
// LineBreak_Unassigned value).
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view. Keep it in sync with the enum.
static const char *line_break_class_string =
    "    // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
    "    // we don't use the XX, AI and CB properties and map them to AL instead.\n"
    "    // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
    "    enum LineBreakClass {\n"
    "        LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
    "        LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
    "        LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
    "        LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
    "        LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
    "        LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
    "        LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
    "    };\n\n";
 304
 305 enum LineBreakClass {
 306     LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,
 307     LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,
 308     LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,
 309     LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,
 310     LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,
 311     LineBreak_JT, LineBreak_SA, LineBreak_SG,
 312     LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK
 313
 314     , LineBreak_Unassigned
 315 };
 316
 317 static QHash<QByteArray, LineBreakClass> line_break_map;
 318
 319 static void initLineBreak()
 320 {
 321     // ### Classes XX and AI are left out and mapped to AL for now;
 322     // ### Class NL is ignored and mapped to AL as well.
 323     struct LineBreakList {
 324         LineBreakClass brk;
 325         const char *name;
 326     } breaks[] = {
 327         { LineBreak_BK, "BK" },
 328         { LineBreak_CR, "CR" },
 329         { LineBreak_LF, "LF" },
 330         { LineBreak_CM, "CM" },
 331         { LineBreak_AL, "NL" },
 332         { LineBreak_SG, "SG" },
 333         { LineBreak_WJ, "WJ" },
 334         { LineBreak_ZW, "ZW" },
 335         { LineBreak_GL, "GL" },
 336         { LineBreak_SP, "SP" },
 337         { LineBreak_B2, "B2" },
 338         { LineBreak_BA, "BA" },
 339         { LineBreak_BB, "BB" },
 340         { LineBreak_HY, "HY" },
 341         { LineBreak_AL, "CB" }, // ###
 342         { LineBreak_CL, "CL" },
 343         { LineBreak_EX, "EX" },
 344         { LineBreak_IN, "IN" },
 345         { LineBreak_NS, "NS" },
 346         { LineBreak_OP, "OP" },
 347         { LineBreak_QU, "QU" },
 348         { LineBreak_IS, "IS" },
 349         { LineBreak_NU, "NU" },
 350         { LineBreak_PO, "PO" },
 351         { LineBreak_PR, "PR" },
 352         { LineBreak_SY, "SY" },
 353         { LineBreak_AL, "AI" },
 354         { LineBreak_AL, "AL" },
 355         { LineBreak_H2, "H2" },
 356         { LineBreak_H3, "H3" },
 357         { LineBreak_ID, "ID" },
 358         { LineBreak_JL, "JL" },
 359         { LineBreak_JV, "JV" },
 360         { LineBreak_JT, "JT" },
 361         { LineBreak_SA, "SA" },
 362         { LineBreak_AL, "XX" },
 363         { LineBreak_Unassigned, 0 }
 364     };
 365     LineBreakList *d = breaks;
 366     while (d->name) {
 367         line_break_map.insert(d->name, d->brk);
 368         ++d;
 369     }
 370 }
 371
 372
// C++ source text declaring the Properties bit-field struct and the two
// properties() lookup functions.
// Keep this one in sync with the code in createPropertyInfo, and with
// SizeOfPropertiesStruct, which records the size of the struct laid out here.
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view.
static const char *property_string =
    "    struct Properties {\n"
    "        ushort category         : 8; /* 5 needed */\n"
    "        ushort line_break_class : 8; /* 6 needed */\n"
    "        ushort direction        : 8; /* 5 needed */\n"
    "        ushort combiningClass   : 8;\n"
    "        ushort joining          : 2;\n"
    "        signed short digitValue : 6; /* 5 needed */\n"
    "        ushort unicodeVersion   : 4;\n"
    "        ushort lowerCaseSpecial : 1;\n"
    "        ushort upperCaseSpecial : 1;\n"
    "        ushort titleCaseSpecial : 1;\n"
    "        ushort caseFoldSpecial  : 1;\n"
    "        signed short mirrorDiff    : 16;\n"
    "        signed short lowerCaseDiff : 16;\n"
    "        signed short upperCaseDiff : 16;\n"
    "        signed short titleCaseDiff : 16;\n"
    "        signed short caseFoldDiff  : 16;\n"
    "        ushort graphemeBreak    : 8; /* 4 needed */\n"
    "        ushort wordBreak        : 8; /* 4 needed */\n"
    "        ushort sentenceBreak    : 8; /* 4 needed */\n"
    "    };\n"
    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
 398
// C++ source text declaring the lineBreakClass() and script() lookup
// functions together with inline QChar convenience overloads.
// NOTE(review): presumably written verbatim into the generated tables header;
// the emitting code is outside this view.
static const char *methods =
    "    Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
    "    inline int lineBreakClass(QChar ch)\n"
    "    { return lineBreakClass(ch.unicode()); }\n"
    "\n"
    "    Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
    "    inline int script(QChar ch)\n"
    "    { return script(ch.unicode()); }\n\n";
 407
// NOTE(review): presumably the size in bytes of the Properties struct declared
// in property_string: its bit-fields add up to 152 bits, i.e. 20 bytes when
// packed into 16-bit units. Update it whenever that struct changes.
static const int SizeOfPropertiesStruct = 20;
 409
 410 struct PropertyFlags {
 411     bool operator ==(const PropertyFlags &o) {
 412         return (combiningClass == o.combiningClass
 413                 && category == o.category
 414                 && direction == o.direction
 415                 && joining == o.joining
 416                 && age == o.age
 417                 && digitValue == o.digitValue
 418                 && line_break_class == o.line_break_class
 419                 && mirrorDiff == o.mirrorDiff
 420                 && lowerCaseDiff == o.lowerCaseDiff
 421                 && upperCaseDiff == o.upperCaseDiff
 422                 && titleCaseDiff == o.titleCaseDiff
 423                 && caseFoldDiff == o.caseFoldDiff
 424                 && lowerCaseSpecial == o.lowerCaseSpecial
 425                 && upperCaseSpecial == o.upperCaseSpecial
 426                 && titleCaseSpecial == o.titleCaseSpecial
 427                 && caseFoldSpecial == o.caseFoldSpecial
 428                 && graphemeBreak == o.graphemeBreak
 429                 && wordBreak == o.wordBreak
 430                 && sentenceBreak == o.sentenceBreak
 431             );
 432     }
 433     // from UnicodeData.txt
 434     uchar combiningClass : 8;
 435     QChar::Category category : 5;
 436     QChar::Direction direction : 5;
 437     // from ArabicShaping.txt
 438     QChar::Joining joining : 2;
 439     // from DerivedAge.txt
 440     QChar::UnicodeVersion age : 4;
 441     int digitValue;
 442     LineBreakClass line_break_class;
 443
 444     int mirrorDiff : 16;
 445
 446     int lowerCaseDiff;
 447     int upperCaseDiff;
 448     int titleCaseDiff;
 449     int caseFoldDiff;
 450     bool lowerCaseSpecial;
 451     bool upperCaseSpecial;
 452     bool titleCaseSpecial;
 453     bool caseFoldSpecial;
 454     GraphemeBreak graphemeBreak;
 455     WordBreak wordBreak;
 456     SentenceBreak sentenceBreak;
 457 };
 458
 459
 460 static QList<int> specialCaseMap;
 461
 462 static int appendToSpecialCaseMap(const QList<int> &map)
 463 {
 464     QList<int> utf16map;
 465     for (int i = 0; i < map.size(); ++i) {
 466         int val = map.at(i);
 467         if (QChar::requiresSurrogates(val)) {
 468             utf16map << QChar::highSurrogate(val);
 469             utf16map << QChar::lowSurrogate(val);
 470         } else {
 471             utf16map << val;
 472         }
 473     }
 474     int length = utf16map.size();
 475     utf16map.prepend(length);
 476
 477     if (specialCaseMap.isEmpty())
 478         specialCaseMap << 0; // placeholder
 479
 480     int i = 1;
 481     while (i < specialCaseMap.size()) {
 482         int n = specialCaseMap.at(i);
 483         if (n == length) {
 484             int j;
 485             for (j = 1; j <= n; ++j) {
 486                 if (specialCaseMap.at(i+j) != utf16map.at(j))
 487                     break;
 488             }
 489             if (j > n)
 490                 return i;
 491         }
 492         i += n + 1;
 493     }
 494
 495     int pos = specialCaseMap.size();
 496     specialCaseMap << utf16map;
 497     return pos;
 498 }
 499
 500 struct UnicodeData {
 501     UnicodeData(int codepoint = 0) {
 502         p.category = QChar::Other_NotAssigned; // Cn
 503         p.combiningClass = 0;
 504
 505         p.direction = QChar::DirL;
 506         // DerivedBidiClass.txt
 507         // DirR for:  U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
 508         if ((codepoint >= 0x590 && codepoint <= 0x5ff)
 509             || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
 510             || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
 511             || (codepoint >= 0x10800 && codepoint <= 0x10fff)) {
 512             p.direction = QChar::DirR;
 513         }
 514         // DirAL for:  U+0600..U+07BF, U+FB50..U+FDFF, U+FE70..U+FEFF
 515         //             minus noncharacter code points (intersects with U+FDD0..U+FDEF)
 516         if ((codepoint >= 0x600 && codepoint <= 0x7bf)
 517             || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
 518             || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
 519             || (codepoint >= 0xfe70 && codepoint <= 0xfeff)) {
 520             p.direction = QChar::DirAL;
 521         }
 522
 523         mirroredChar = 0;
 524         decompositionType = QChar::NoDecomposition;
 525         p.joining = QChar::OtherJoining;
 526         p.age = QChar::Unicode_Unassigned;
 527         p.mirrorDiff = 0;
 528         p.digitValue = -1;
 529         p.line_break_class = LineBreak_AL; // XX -> AL
 530         p.lowerCaseDiff = 0;
 531         p.upperCaseDiff = 0;
 532         p.titleCaseDiff = 0;
 533         p.caseFoldDiff = 0;
 534         p.lowerCaseSpecial = 0;
 535         p.upperCaseSpecial = 0;
 536         p.titleCaseSpecial = 0;
 537         p.caseFoldSpecial = 0;
 538         p.graphemeBreak = GraphemeBreakOther;
 539         p.wordBreak = WordBreakOther;
 540         p.sentenceBreak = SentenceBreakOther;
 541         propertyIndex = -1;
 542         excludedComposition = false;
 543     }
 544     PropertyFlags p;
 545
 546     // from UnicodeData.txt
 547     QChar::Decomposition decompositionType;
 548     QList<int> decomposition;
 549
 550     QList<int> specialFolding;
 551
 552     // from BidiMirroring.txt
 553     int mirroredChar;
 554
 555     // DerivedNormalizationProps.txt
 556     bool excludedComposition;
 557
 558     // computed position of unicode property set
 559     int propertyIndex;
 560 };
 561
 562 enum UniDataFields {
 563     UD_Value,
 564     UD_Name,
 565     UD_Category,
 566     UD_CombiningClass,
 567     UD_BidiCategory,
 568     UD_Decomposition,
 569     UD_DecimalDigitValue,
 570     UD_DigitValue,
 571     UD_NumericValue,
 572     UD_Mirrored,
 573     UD_OldName,
 574     UD_Comment,
 575     UD_UpperCase,
 576     UD_LowerCase,
 577     UD_TitleCase
 578 };
 579
 580
 581 static QHash<QByteArray, QChar::Category> categoryMap;
 582
 583 static void initCategoryMap()
 584 {
 585     struct Cat {
 586         QChar::Category cat;
 587         const char *name;
 588     } categories[] = {
 589         { QChar::Mark_NonSpacing,          "Mn" },
 590         { QChar::Mark_SpacingCombining,    "Mc" },
 591         { QChar::Mark_Enclosing,           "Me" },
 592
 593         { QChar::Number_DecimalDigit,      "Nd" },
 594         { QChar::Number_Letter,            "Nl" },
 595         { QChar::Number_Other,             "No" },
 596
 597         { QChar::Separator_Space,          "Zs" },
 598         { QChar::Separator_Line,           "Zl" },
 599         { QChar::Separator_Paragraph,      "Zp" },
 600
 601         { QChar::Other_Control,            "Cc" },
 602         { QChar::Other_Format,             "Cf" },
 603         { QChar::Other_Surrogate,          "Cs" },
 604         { QChar::Other_PrivateUse,         "Co" },
 605         { QChar::Other_NotAssigned,        "Cn" },
 606
 607         { QChar::Letter_Uppercase,         "Lu" },
 608         { QChar::Letter_Lowercase,         "Ll" },
 609         { QChar::Letter_Titlecase,         "Lt" },
 610         { QChar::Letter_Modifier,          "Lm" },
 611         { QChar::Letter_Other,             "Lo" },
 612
 613         { QChar::Punctuation_Connector,    "Pc" },
 614         { QChar::Punctuation_Dash,         "Pd" },
 615         { QChar::Punctuation_Open,         "Ps" },
 616         { QChar::Punctuation_Close,        "Pe" },
 617         { QChar::Punctuation_InitialQuote, "Pi" },
 618         { QChar::Punctuation_FinalQuote,   "Pf" },
 619         { QChar::Punctuation_Other,        "Po" },
 620
 621         { QChar::Symbol_Math,              "Sm" },
 622         { QChar::Symbol_Currency,          "Sc" },
 623         { QChar::Symbol_Modifier,          "Sk" },
 624         { QChar::Symbol_Other,             "So" },
 625         { QChar::Other_NotAssigned, 0 }
 626     };
 627     Cat *c = categories;
 628     while (c->name) {
 629         categoryMap.insert(c->name, c->cat);
 630         ++c;
 631     }
 632 }
 633
 634
 635 static QHash<QByteArray, QChar::Direction> directionMap;
 636
 637 static void initDirectionMap()
 638 {
 639     struct Dir {
 640         QChar::Direction dir;
 641         const char *name;
 642     } directions[] = {
 643         { QChar::DirL, "L" },
 644         { QChar::DirR, "R" },
 645         { QChar::DirEN, "EN" },
 646         { QChar::DirES, "ES" },
 647         { QChar::DirET, "ET" },
 648         { QChar::DirAN, "AN" },
 649         { QChar::DirCS, "CS" },
 650         { QChar::DirB, "B" },
 651         { QChar::DirS, "S" },
 652         { QChar::DirWS, "WS" },
 653         { QChar::DirON, "ON" },
 654         { QChar::DirLRE, "LRE" },
 655         { QChar::DirLRO, "LRO" },
 656         { QChar::DirAL, "AL" },
 657         { QChar::DirRLE, "RLE" },
 658         { QChar::DirRLO, "RLO" },
 659         { QChar::DirPDF, "PDF" },
 660         { QChar::DirNSM, "NSM" },
 661         { QChar::DirBN, "BN" },
 662         { QChar::DirL, 0 }
 663     };
 664     Dir *d = directions;
 665     while (d->name) {
 666         directionMap.insert(d->name, d->dir);
 667         ++d;
 668     }
 669 }
 670
 671
 672 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
 673
 674 static void initDecompositionMap()
 675 {
 676     struct Dec {
 677         QChar::Decomposition dec;
 678         const char *name;
 679     } decompositions[] = {
 680         { QChar::Canonical, "<canonical>" },
 681         { QChar::Font, "<font>" },
 682         { QChar::NoBreak, "<noBreak>" },
 683         { QChar::Initial, "<initial>" },
 684         { QChar::Medial, "<medial>" },
 685         { QChar::Final, "<final>" },
 686         { QChar::Isolated, "<isolated>" },
 687         { QChar::Circle, "<circle>" },
 688         { QChar::Super, "<super>" },
 689         { QChar::Sub, "<sub>" },
 690         { QChar::Vertical, "<vertical>" },
 691         { QChar::Wide, "<wide>" },
 692         { QChar::Narrow, "<narrow>" },
 693         { QChar::Small, "<small>" },
 694         { QChar::Square, "<square>" },
 695         { QChar::Compat, "<compat>" },
 696         { QChar::Fraction, "<fraction>" },
 697         { QChar::NoDecomposition, 0 }
 698     };
 699     Dec *d = decompositions;
 700     while (d->name) {
 701         decompositionMap.insert(d->name, d->dec);
 702         ++d;
 703     }
 704 }
 705
 706
// Per-code-point records keyed by code point; filled by readUnicodeData().
static QHash<int, UnicodeData> unicodeData;
// NOTE(review): presumably the de-duplicated property sets that
// UnicodeData::propertyIndex refers to; it is populated outside this view.
static QList<PropertyFlags> uniqueProperties;


// Histogram: decomposition length -> number of UnicodeData.txt entries whose
// decomposition has that many code points (updated in readUnicodeData()).
static QHash<int, int> decompositionLength;
// Largest code point seen with a non-empty decomposition field
// (updated in readUnicodeData()).
static int highestComposedCharacter = 0;
// NOTE(review): ligature statistics; the code maintaining these two counters
// is outside this view.
static int numLigatures = 0;
static int highestLigature = 0;
 715
 716 struct Ligature {
 717     int u1;
 718     int u2;
 719     int ligature;
 720 };
 721 // we need them sorted after the first component for fast lookup
 722 bool operator < (const Ligature &l1, const Ligature &l2)
 723 { return l1.u1 < l2.u1; }
 724
// NOTE(review): presumably the ligature records grouped by an integer key;
// the code filling this table is outside this view.
static QHash<int, QList<Ligature> > ligatureHashes;

// Combining class -> number of UnicodeData.txt entries that use it
// (counted in readUnicodeData()).
static QHash<int, int> combiningClassUsage;

// Largest absolute simple case-mapping offsets that were stored as plain
// diffs, i.e. not redirected to the special case map
// (tracked in readUnicodeData()).
static int maxLowerCaseDiff = 0;
static int maxUpperCaseDiff = 0;
static int maxTitleCaseDiff = 0;
 732
 733 static void readUnicodeData()
 734 {
 735     qDebug() << "Reading UnicodeData.txt";
 736     QFile f("data/UnicodeData.txt");
 737     if (!f.exists())
 738         qFatal("Couldn't find UnicodeData.txt");
 739
 740     f.open(QFile::ReadOnly);
 741
 742     while (!f.atEnd()) {
 743         QByteArray line;
 744         line.resize(1024);
 745         int len = f.readLine(line.data(), 1024);
 746         line.truncate(len-1);
 747
 748         int comment = line.indexOf('#');
 749         if (comment >= 0)
 750             line = line.left(comment);
 751         if (line.isEmpty())
 752             continue;
 753
 754         QList<QByteArray> properties = line.split(';');
 755         bool ok;
 756         int codepoint = properties[UD_Value].toInt(&ok, 16);
 757         Q_ASSERT(ok);
 758         Q_ASSERT(codepoint <= LAST_CODEPOINT);
 759         int lastCodepoint = codepoint;
 760
 761         QByteArray name = properties[UD_Name];
 762         if (name.startsWith('<') && name.contains("First")) {
 763             QByteArray nextLine;
 764             nextLine.resize(1024);
 765             f.readLine(nextLine.data(), 1024);
 766             QList<QByteArray> properties = nextLine.split(';');
 767             Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
 768             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
 769             Q_ASSERT(ok);
 770             Q_ASSERT(lastCodepoint <= LAST_CODEPOINT);
 771         }
 772
 773         UnicodeData data(codepoint);
 774         data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
 775         data.p.combiningClass = properties[UD_CombiningClass].toInt();
 776         if (!combiningClassUsage.contains(data.p.combiningClass))
 777             combiningClassUsage[data.p.combiningClass] = 1;
 778         else
 779             ++combiningClassUsage[data.p.combiningClass];
 780
 781         data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
 782
 783         if (!properties[UD_UpperCase].isEmpty()) {
 784             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
 785             Q_ASSERT(ok);
 786             int diff = upperCase - codepoint;
 787             if (qAbs(diff) >= (1<<14)) {
 788                 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
 789                 // if the condition below doesn't hold anymore we need to modify our special upper casing code in qchar.cpp
 790                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(upperCase));
 791                 data.p.upperCaseSpecial = true;
 792                 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
 793             } else {
 794                 data.p.upperCaseDiff = diff;
 795                 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
 796             }
 797             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(upperCase)) {
 798                 // if the conditions below doesn't hold anymore we need to modify our upper casing code
 799                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
 800                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
 801             }
 802         }
 803         if (!properties[UD_LowerCase].isEmpty()) {
 804             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
 805             Q_ASSERT(ok);
 806             int diff = lowerCase - codepoint;
 807             if (qAbs(diff) >= (1<<14)) {
 808                 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
 809                 // if the condition below doesn't hold anymore we need to modify our special lower casing code in qchar.cpp
 810                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(lowerCase));
 811                 data.p.lowerCaseSpecial = true;
 812                 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
 813             } else {
 814                 data.p.lowerCaseDiff = diff;
 815                 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
 816             }
 817             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(lowerCase)) {
 818                 // if the conditions below doesn't hold anymore we need to modify our lower casing code
 819                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
 820                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
 821             }
 822         }
 823         // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
 824         if (properties[UD_TitleCase].isEmpty())
 825             properties[UD_TitleCase] = properties[UD_UpperCase];
 826         if (!properties[UD_TitleCase].isEmpty()) {
 827             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
 828             Q_ASSERT(ok);
 829             int diff = titleCase - codepoint;
 830             if (qAbs(diff) >= (1<<14)) {
 831                 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
 832                 // if the condition below doesn't hold anymore we need to modify our special title casing code in qchar.cpp
 833                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(titleCase));
 834                 data.p.titleCaseSpecial = true;
 835                 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
 836             } else {
 837                 data.p.titleCaseDiff = diff;
 838                 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
 839             }
 840             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(titleCase)) {
 841                 // if the conditions below doesn't hold anymore we need to modify our title casing code
 842                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
 843                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
 844             }
 845         }
 846
 847         if (!properties[UD_DigitValue].isEmpty())
 848             data.p.digitValue = properties[UD_DigitValue].toInt();
 849
 850         // decompositition
 851         QByteArray decomposition = properties[UD_Decomposition];
 852         if (!decomposition.isEmpty()) {
 853             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
 854             QList<QByteArray> d = decomposition.split(' ');
 855             if (d[0].contains('<')) {
 856                 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
 857                 if (data.decompositionType == QChar::NoDecomposition)
 858                     qFatal("unassigned decomposition type: %s", d[0].constData());
 859                 d.takeFirst();
 860             } else {
 861                 data.decompositionType = QChar::Canonical;
 862             }
 863             for (int i = 0; i < d.size(); ++i) {
 864                 data.decomposition.append(d[i].toInt(&ok, 16));
 865                 Q_ASSERT(ok);
 866             }
 867             ++decompositionLength[data.decomposition.size()];
 868         }
 869
 870         for (int i = codepoint; i <= lastCodepoint; ++i)
 871             unicodeData.insert(i, data);
 872     }
 873
 874 }
 875
 876 static int maxMirroredDiff = 0;
 877
 878 static void readBidiMirroring()
 879 {
 880     qDebug() << "Reading BidiMirroring.txt";
 881     QFile f("data/BidiMirroring.txt");
 882     if (!f.exists())
 883         qFatal("Couldn't find BidiMirroring.txt");
 884
 885     f.open(QFile::ReadOnly);
 886
 887     while (!f.atEnd()) {
 888         QByteArray line;
 889         line.resize(1024);
 890         int len = f.readLine(line.data(), 1024);
 891         line.resize(len-1);
 892
 893         int comment = line.indexOf('#');
 894         if (comment >= 0)
 895             line = line.left(comment);
 896
 897         if (line.isEmpty())
 898             continue;
 899         line = line.replace(" ", "");
 900
 901         QList<QByteArray> pair = line.split(';');
 902         Q_ASSERT(pair.size() == 2);
 903
 904         bool ok;
 905         int codepoint = pair[0].toInt(&ok, 16);
 906         Q_ASSERT(ok);
 907         int mirror = pair[1].toInt(&ok, 16);
 908         Q_ASSERT(ok);
 909
 910         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 911         d.mirroredChar = mirror;
 912         d.p.mirrorDiff = d.mirroredChar - codepoint;
 913         maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
 914         unicodeData.insert(codepoint, d);
 915     }
 916 }
 917
 918 static void readArabicShaping()
 919 {
 920     qDebug() << "Reading ArabicShaping.txt";
 921     QFile f("data/ArabicShaping.txt");
 922     if (!f.exists())
 923         qFatal("Couldn't find ArabicShaping.txt");
 924
 925     f.open(QFile::ReadOnly);
 926
 927     while (!f.atEnd()) {
 928         QByteArray line;
 929         line.resize(1024);
 930         int len = f.readLine(line.data(), 1024);
 931         line.resize(len-1);
 932
 933         int comment = line.indexOf('#');
 934         if (comment >= 0)
 935             line = line.left(comment);
 936         line = line.trimmed();
 937
 938         if (line.isEmpty())
 939             continue;
 940
 941         QList<QByteArray> l = line.split(';');
 942         Q_ASSERT(l.size() == 4);
 943
 944         bool ok;
 945         int codepoint = l[0].toInt(&ok, 16);
 946         Q_ASSERT(ok);
 947
 948         Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
 949         if (joining == Joining_Unassigned)
 950             qFatal("unassigned or unhandled joining value: %s", l[2].constData());
 951
 952         if (joining == Joining_Left) {
 953             // There are currently no characters of joining type Left_Joining defined in Unicode.
 954             qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
 955         }
 956
 957         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 958         if (joining == Joining_Right)
 959             d.p.joining = QChar::Right;
 960         else if (joining == Joining_Dual)
 961             d.p.joining = QChar::Dual;
 962         else if (joining == Joining_Causing)
 963             d.p.joining = QChar::Center;
 964         else
 965             d.p.joining = QChar::OtherJoining;
 966         unicodeData.insert(codepoint, d);
 967     }
 968 }
 969
 970 static void readDerivedAge()
 971 {
 972     qDebug() << "Reading DerivedAge.txt";
 973     QFile f("data/DerivedAge.txt");
 974     if (!f.exists())
 975         qFatal("Couldn't find DerivedAge.txt");
 976
 977     f.open(QFile::ReadOnly);
 978
 979     while (!f.atEnd()) {
 980         QByteArray line;
 981         line.resize(1024);
 982         int len = f.readLine(line.data(), 1024);
 983         line.resize(len-1);
 984
 985         int comment = line.indexOf('#');
 986         if (comment >= 0)
 987             line = line.left(comment);
 988         line.replace(" ", "");
 989
 990         if (line.isEmpty())
 991             continue;
 992
 993         QList<QByteArray> l = line.split(';');
 994         Q_ASSERT(l.size() == 2);
 995
 996         QByteArray codes = l[0];
 997         codes.replace("..", ".");
 998         QList<QByteArray> cl = codes.split('.');
 999
1000         bool ok;
1001         int from = cl[0].toInt(&ok, 16);
1002         Q_ASSERT(ok);
1003         int to = from;
1004         if (cl.size() == 2) {
1005             to = cl[1].toInt(&ok, 16);
1006             Q_ASSERT(ok);
1007         }
1008
1009         QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1010         //qDebug() << hex << from << ".." << to << ba << age;
1011         if (age == QChar::Unicode_Unassigned)
1012             qFatal("unassigned or unhandled age value: %s", l[1].constData());
1013
1014         for (int codepoint = from; codepoint <= to; ++codepoint) {
1015             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1016             d.p.age = age;
1017             unicodeData.insert(codepoint, d);
1018         }
1019     }
1020 }
1021
1022
1023 static void readDerivedNormalizationProps()
1024 {
1025     qDebug() << "Reading DerivedNormalizationProps.txt";
1026     QFile f("data/DerivedNormalizationProps.txt");
1027     if (!f.exists())
1028         qFatal("Couldn't find DerivedNormalizationProps.txt");
1029
1030     f.open(QFile::ReadOnly);
1031
1032     while (!f.atEnd()) {
1033         QByteArray line;
1034         line.resize(1024);
1035         int len = f.readLine(line.data(), 1024);
1036         line.resize(len-1);
1037
1038         int comment = line.indexOf('#');
1039         if (comment >= 0)
1040             line = line.left(comment);
1041
1042         if (line.trimmed().isEmpty())
1043             continue;
1044
1045         QList<QByteArray> l = line.split(';');
1046         Q_ASSERT(l.size() >= 2);
1047
1048         QByteArray propName = l[1].trimmed();
1049         if (propName != "Full_Composition_Exclusion")
1050             // ###
1051             continue;
1052
1053         QByteArray codes = l[0].trimmed();
1054         codes.replace("..", ".");
1055         QList<QByteArray> cl = codes.split('.');
1056
1057         bool ok;
1058         int from = cl[0].toInt(&ok, 16);
1059         Q_ASSERT(ok);
1060         int to = from;
1061         if (cl.size() == 2) {
1062             to = cl[1].toInt(&ok, 16);
1063             Q_ASSERT(ok);
1064         }
1065
1066         for (int codepoint = from; codepoint <= to; ++codepoint) {
1067             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1068             d.excludedComposition = true;
1069             unicodeData.insert(codepoint, d);
1070         }
1071     }
1072
1073     for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) {
1074         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1075         if (!d.excludedComposition
1076             && d.decompositionType == QChar::Canonical
1077             && d.decomposition.size() > 1) {
1078             Q_ASSERT(d.decomposition.size() == 2);
1079
1080             int part1 = d.decomposition.at(0);
1081             int part2 = d.decomposition.at(1);
1082
1083             // all non-starters are listed in DerivedNormalizationProps.txt
1084             // and already excluded from composition
1085             Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0);
1086
1087             ++numLigatures;
1088             highestLigature = qMax(highestLigature, part1);
1089             Ligature l = { part1, part2, codepoint };
1090             ligatureHashes[part2].append(l);
1091         }
1092     }
1093 }
1094
1095
1096 struct NormalizationCorrection {
1097     uint codepoint;
1098     uint mapped;
1099     int version;
1100 };
1101
1102 static QByteArray createNormalizationCorrections()
1103 {
1104     qDebug() << "Reading NormalizationCorrections.txt";
1105     QFile f("data/NormalizationCorrections.txt");
1106     if (!f.exists())
1107         qFatal("Couldn't find NormalizationCorrections.txt");
1108
1109     f.open(QFile::ReadOnly);
1110
1111     QByteArray out;
1112
1113     out += "struct NormalizationCorrection {\n"
1114            "    uint ucs4;\n"
1115            "    uint old_mapping;\n"
1116            "    int version;\n"
1117            "};\n\n"
1118
1119            "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1120
1121     int maxVersion = 0;
1122     int numCorrections = 0;
1123     while (!f.atEnd()) {
1124         QByteArray line;
1125         line.resize(1024);
1126         int len = f.readLine(line.data(), 1024);
1127         line.resize(len-1);
1128
1129         int comment = line.indexOf('#');
1130         if (comment >= 0)
1131             line = line.left(comment);
1132         line.replace(" ", "");
1133
1134         if (line.isEmpty())
1135             continue;
1136
1137         Q_ASSERT(!line.contains(".."));
1138
1139         QList<QByteArray> fields = line.split(';');
1140         Q_ASSERT(fields.size() == 4);
1141
1142         NormalizationCorrection c = { 0, 0, 0 };
1143         bool ok;
1144         c.codepoint = fields.at(0).toInt(&ok, 16);
1145         Q_ASSERT(ok);
1146         c.mapped = fields.at(1).toInt(&ok, 16);
1147         Q_ASSERT(ok);
1148         if (fields.at(3) == "3.2.0")
1149             c.version = QChar::Unicode_3_2;
1150         else if (fields.at(3) == "4.0.0")
1151             c.version = QChar::Unicode_4_0;
1152         else
1153             qFatal("unknown unicode version in NormalizationCorrection.txt");
1154
1155         out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1156              + ", " + QString::number(c.version) + " },\n";
1157         ++numCorrections;
1158         maxVersion = qMax(c.version, maxVersion);
1159     }
1160
1161     out += "};\n\n"
1162
1163            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1164            "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
1165
1166     return out;
1167 }
1168
1169
1170 static void computeUniqueProperties()
1171 {
1172     qDebug("computeUniqueProperties:");
1173     for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) {
1174         UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1175
1176         int index = uniqueProperties.indexOf(d.p);
1177         if (index == -1) {
1178             index = uniqueProperties.size();
1179             uniqueProperties.append(d.p);
1180         }
1181         d.propertyIndex = index;
1182         unicodeData.insert(uc, d);
1183     }
1184     qDebug("    %d unique unicode properties found", uniqueProperties.size());
1185 }
1186
1187
1188 static void readLineBreak()
1189 {
1190     qDebug() << "Reading LineBreak.txt";
1191     QFile f("data/LineBreak.txt");
1192     if (!f.exists())
1193         qFatal("Couldn't find LineBreak.txt");
1194
1195     f.open(QFile::ReadOnly);
1196
1197     while (!f.atEnd()) {
1198         QByteArray line;
1199         line.resize(1024);
1200         int len = f.readLine(line.data(), 1024);
1201         line.resize(len-1);
1202
1203         int comment = line.indexOf('#');
1204         if (comment >= 0)
1205             line = line.left(comment);
1206         line.replace(" ", "");
1207
1208         if (line.isEmpty())
1209             continue;
1210
1211         QList<QByteArray> l = line.split(';');
1212         Q_ASSERT(l.size() == 2);
1213
1214         QByteArray codes = l[0];
1215         codes.replace("..", ".");
1216         QList<QByteArray> cl = codes.split('.');
1217
1218         bool ok;
1219         int from = cl[0].toInt(&ok, 16);
1220         Q_ASSERT(ok);
1221         int to = from;
1222         if (cl.size() == 2) {
1223             to = cl[1].toInt(&ok, 16);
1224             Q_ASSERT(ok);
1225         }
1226
1227         LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1228         if (lb == LineBreak_Unassigned)
1229             qFatal("unassigned line break class: %s", l[1].constData());
1230
1231         for (int codepoint = from; codepoint <= to; ++codepoint) {
1232             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1233             d.p.line_break_class = lb;
1234             unicodeData.insert(codepoint, d);
1235         }
1236     }
1237 }
1238
1239
1240 static void readSpecialCasing()
1241 {
1242     qDebug() << "Reading SpecialCasing.txt";
1243     QFile f("data/SpecialCasing.txt");
1244     if (!f.exists())
1245         qFatal("Couldn't find SpecialCasing.txt");
1246
1247     f.open(QFile::ReadOnly);
1248
1249     while (!f.atEnd()) {
1250         QByteArray line;
1251         line.resize(1024);
1252         int len = f.readLine(line.data(), 1024);
1253         line.resize(len-1);
1254
1255         int comment = line.indexOf('#');
1256         if (comment >= 0)
1257             line = line.left(comment);
1258
1259         if (line.isEmpty())
1260             continue;
1261
1262         QList<QByteArray> l = line.split(';');
1263
1264         QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1265         if (!condition.isEmpty())
1266             // #####
1267             continue;
1268
1269         bool ok;
1270         int codepoint = l[0].trimmed().toInt(&ok, 16);
1271         Q_ASSERT(ok);
1272
1273         // if the condition below doesn't hold anymore we need to modify our
1274         // lower/upper/title casing code and case folding code
1275         Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1276
1277 //         qDebug() << "codepoint" << hex << codepoint;
1278 //         qDebug() << line;
1279
1280         QList<QByteArray> lower = l[1].trimmed().split(' ');
1281         QList<int> lowerMap;
1282         for (int i = 0; i < lower.size(); ++i) {
1283             bool ok;
1284             lowerMap.append(lower.at(i).toInt(&ok, 16));
1285             Q_ASSERT(ok);
1286         }
1287
1288         QList<QByteArray> title = l[2].trimmed().split(' ');
1289         QList<int> titleMap;
1290         for (int i = 0; i < title.size(); ++i) {
1291             bool ok;
1292             titleMap.append(title.at(i).toInt(&ok, 16));
1293             Q_ASSERT(ok);
1294         }
1295
1296         QList<QByteArray> upper = l[3].trimmed().split(' ');
1297         QList<int> upperMap;
1298         for (int i = 0; i < upper.size(); ++i) {
1299             bool ok;
1300             upperMap.append(upper.at(i).toInt(&ok, 16));
1301             Q_ASSERT(ok);
1302         }
1303
1304
1305         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1306
1307         Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1308         Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1309         Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1310
1311         if (lowerMap.size() > 1) {
1312             ud.p.lowerCaseSpecial = true;
1313             ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1314         }
1315         if (titleMap.size() > 1) {
1316             ud.p.titleCaseSpecial = true;
1317             ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1318         }
1319         if (upperMap.size() > 1) {
1320             ud.p.upperCaseSpecial = true;
1321             ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
1322         }
1323
1324         unicodeData.insert(codepoint, ud);
1325     }
1326 }
1327
1328 static int maxCaseFoldDiff = 0;
1329
1330 static void readCaseFolding()
1331 {
1332     qDebug() << "Reading CaseFolding.txt";
1333     QFile f("data/CaseFolding.txt");
1334     if (!f.exists())
1335         qFatal("Couldn't find CaseFolding.txt");
1336
1337     f.open(QFile::ReadOnly);
1338
1339     while (!f.atEnd()) {
1340         QByteArray line;
1341         line.resize(1024);
1342         int len = f.readLine(line.data(), 1024);
1343         line.resize(len-1);
1344
1345         int comment = line.indexOf('#');
1346         if (comment >= 0)
1347             line = line.left(comment);
1348
1349         if (line.isEmpty())
1350             continue;
1351
1352         QList<QByteArray> l = line.split(';');
1353
1354         bool ok;
1355         int codepoint = l[0].trimmed().toInt(&ok, 16);
1356         Q_ASSERT(ok);
1357
1358
1359         l[1] = l[1].trimmed();
1360         if (l[1] == "F" || l[1] == "T")
1361             continue;
1362
1363 //         qDebug() << "codepoint" << hex << codepoint;
1364 //         qDebug() << line;
1365         QList<QByteArray> fold = l[2].trimmed().split(' ');
1366         QList<int> foldMap;
1367         for (int i = 0; i < fold.size(); ++i) {
1368             bool ok;
1369             foldMap.append(fold.at(i).toInt(&ok, 16));
1370             Q_ASSERT(ok);
1371         }
1372
1373         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1374         if (foldMap.size() == 1) {
1375             int caseFolded = foldMap.at(0);
1376             int diff = caseFolded - codepoint;
1377             if (qAbs(diff) >= (1<<14)) {
1378                 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1379                 // if the condition below doesn't hold anymore we need to modify our special case folding code in qchar.cpp
1380                 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(caseFolded));
1381                 ud.p.caseFoldSpecial = true;
1382                 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1383             } else {
1384                 ud.p.caseFoldDiff = diff;
1385                 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
1386             }
1387             if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(caseFolded)) {
1388                 // if the conditions below doesn't hold anymore we need to modify our case folding code
1389                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1390                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1391             }
1392 //            if (caseFolded != codepoint + ud.p.lowerCaseDiff)
1393 //                qDebug() << hex << codepoint;
1394         } else {
1395             qFatal("we currently don't support full case foldings");
1396 //             qDebug() << "special" << hex << foldMap;
1397             ud.p.caseFoldSpecial = true;
1398             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1399         }
1400         unicodeData.insert(codepoint, ud);
1401     }
1402 }
1403
1404 static void readGraphemeBreak()
1405 {
1406     qDebug() << "Reading GraphemeBreakProperty.txt";
1407     QFile f("data/GraphemeBreakProperty.txt");
1408     if (!f.exists())
1409         qFatal("Couldn't find GraphemeBreakProperty.txt");
1410
1411     f.open(QFile::ReadOnly);
1412
1413     while (!f.atEnd()) {
1414         QByteArray line;
1415         line.resize(1024);
1416         int len = f.readLine(line.data(), 1024);
1417         line.resize(len-1);
1418
1419         int comment = line.indexOf('#');
1420         if (comment >= 0)
1421             line = line.left(comment);
1422         line.replace(" ", "");
1423
1424         if (line.isEmpty())
1425             continue;
1426
1427         QList<QByteArray> l = line.split(';');
1428         Q_ASSERT(l.size() == 2);
1429
1430         QByteArray codes = l[0];
1431         codes.replace("..", ".");
1432         QList<QByteArray> cl = codes.split('.');
1433
1434         bool ok;
1435         int from = cl[0].toInt(&ok, 16);
1436         Q_ASSERT(ok);
1437         int to = from;
1438         if (cl.size() == 2) {
1439             to = cl[1].toInt(&ok, 16);
1440             Q_ASSERT(ok);
1441         }
1442
1443         GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1444         if (brk == GraphemeBreak_Unassigned)
1445             qFatal("unassigned grapheme break class: %s", l[1].constData());
1446
1447         for (int codepoint = from; codepoint <= to; ++codepoint) {
1448             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1449             ud.p.graphemeBreak = brk;
1450             unicodeData.insert(codepoint, ud);
1451         }
1452     }
1453 }
1454
1455 static void readWordBreak()
1456 {
1457     qDebug() << "Reading WordBreakProperty.txt";
1458     QFile f("data/WordBreakProperty.txt");
1459     if (!f.exists())
1460         qFatal("Couldn't find WordBreakProperty.txt");
1461
1462     f.open(QFile::ReadOnly);
1463
1464     while (!f.atEnd()) {
1465         QByteArray line;
1466         line.resize(1024);
1467         int len = f.readLine(line.data(), 1024);
1468         line.resize(len-1);
1469
1470         int comment = line.indexOf('#');
1471         if (comment >= 0)
1472             line = line.left(comment);
1473         line.replace(" ", "");
1474
1475         if (line.isEmpty())
1476             continue;
1477
1478         QList<QByteArray> l = line.split(';');
1479         Q_ASSERT(l.size() == 2);
1480
1481         QByteArray codes = l[0];
1482         codes.replace("..", ".");
1483         QList<QByteArray> cl = codes.split('.');
1484
1485         bool ok;
1486         int from = cl[0].toInt(&ok, 16);
1487         Q_ASSERT(ok);
1488         int to = from;
1489         if (cl.size() == 2) {
1490             to = cl[1].toInt(&ok, 16);
1491             Q_ASSERT(ok);
1492         }
1493
1494         WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned);
1495         if (brk == WordBreak_Unassigned)
1496             qFatal("unassigned word break class: %s", l[1].constData());
1497
1498         for (int codepoint = from; codepoint <= to; ++codepoint) {
1499             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1500             ud.p.wordBreak = brk;
1501             unicodeData.insert(codepoint, ud);
1502         }
1503     }
1504 }
1505
1506 static void readSentenceBreak()
1507 {
1508     qDebug() << "Reading SentenceBreakProperty.txt";
1509     QFile f("data/SentenceBreakProperty.txt");
1510     if (!f.exists())
1511         qFatal("Couldn't find SentenceBreakProperty.txt");
1512
1513     f.open(QFile::ReadOnly);
1514
1515     while (!f.atEnd()) {
1516         QByteArray line;
1517         line.resize(1024);
1518         int len = f.readLine(line.data(), 1024);
1519         line.resize(len-1);
1520
1521         int comment = line.indexOf('#');
1522         if (comment >= 0)
1523             line = line.left(comment);
1524         line.replace(" ", "");
1525
1526         if (line.isEmpty())
1527             continue;
1528
1529         QList<QByteArray> l = line.split(';');
1530         Q_ASSERT(l.size() == 2);
1531
1532         QByteArray codes = l[0];
1533         codes.replace("..", ".");
1534         QList<QByteArray> cl = codes.split('.');
1535
1536         bool ok;
1537         int from = cl[0].toInt(&ok, 16);
1538         Q_ASSERT(ok);
1539         int to = from;
1540         if (cl.size() == 2) {
1541             to = cl[1].toInt(&ok, 16);
1542             Q_ASSERT(ok);
1543         }
1544
1545         SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
1546         if (brk == SentenceBreak_Unassigned)
1547             qFatal("unassigned sentence break class: %s", l[1].constData());
1548
1549         for (int codepoint = from; codepoint <= to; ++codepoint) {
1550             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1551             ud.p.sentenceBreak = brk;
1552             unicodeData.insert(codepoint, ud);
1553         }
1554     }
1555 }
1556
#if 0
// this piece of code does full case folding and comparison. We currently
// don't use it, since this gives lots of issues with things such as case
// insensitive search and replace.

// Writes the case folded form of ch to out as a 0-terminated sequence of
// 16 bit units. The callers below pass buffers of SPECIAL_CASE_MAX_LEN + 1
// units.
static inline void foldCase(uint ch, ushort *out)
{
    const QUnicodeTables::Properties *p = qGetProp(ch);
    if (!p->caseFoldSpecial) {
        *(out++) = ch + p->caseFoldDiff;
    } else {
        // special foldings are stored as <length> followed by <length> units
        const ushort *folded = specialCaseMap + p->caseFoldDiff;
        ushort length = *folded++;
        while (length--)
            *out++ = *folded++;
    }
    *out = 0;
}

// Compares the ranges [a, ae) and [b, be) after case folding.
// Returns 0 if they are equal, a negative value if the first range sorts
// before the second one and a positive value otherwise. A null a yields 1,
// a null b yields -1 (unless both pointers are equal, which yields 0).
static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
{
    if (a == b)
        return 0;
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // fast path: simple foldings can be compared unit by unit
    while (a != ae && b != be) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp(*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (b == be)
            return 0;
        return -1;
    }
    return 1;
special:
    // slow path: fold one character at a time into a buffer on either side
    // and compare the buffered units
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && b == be)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (b == be)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}


// Same as above, but the second operand is a 0-terminated string of 8 bit
// characters.
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
{
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // fast path: simple foldings can be compared unit by unit
    while (a != ae && *b) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (!*b)
            return 0;
        return -1;
    }
    return 1;

special:
    // slow path: fold one character at a time into a buffer on either side
    // and compare the buffered units
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && !*b)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (!*b)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}
#endif
1686
#if 0
// Block names in order of first appearance in Blocks.txt.
static QList<QByteArray> blockNames;
// A code point range together with the index of its name in blockNames.
struct BlockInfo
{
    int blockIndex;
    int firstCodePoint;
    int lastCodePoint;
};
static QList<BlockInfo> blockInfoList;

// Reads data/Blocks.txt and appends one BlockInfo per data line to
// blockInfoList, registering block names in blockNames on first use.
static void readBlocks()
{
    qDebug() << "Reading Blocks.txt";
    QFile file("data/Blocks.txt");
    if (!file.exists())
        qFatal("Couldn't find Blocks.txt");

    file.open(QFile::ReadOnly);

    while (!file.atEnd()) {
        QByteArray text = file.readLine();
        text.resize(text.size() - 1);

        // everything from '#' onwards is a comment
        const int hashPos = text.indexOf("#");
        if (hashPos >= 0)
            text.truncate(hashPos);

        text.replace(" ", "");
        if (text.isEmpty())
            continue;

        const int separator = text.indexOf(';');
        Q_ASSERT(separator >= 0);
        QByteArray range = text.left(separator);
        const QByteArray name = text.mid(separator + 1);

        int nameIndex = blockNames.indexOf(name);
        if (nameIndex < 0) {
            blockNames.append(name);
            nameIndex = blockNames.size() - 1;
        }

        // "XXXX..YYYY" becomes "XXXX.YYYY", so one split yields both ends
        range.replace("..", ".");
        const QList<QByteArray> bounds = range.split('.');

        bool ok;
        const int first = bounds[0].toInt(&ok, 16);
        Q_ASSERT(ok);
        int last = first;
        if (bounds.size() == 2) {
            last = bounds[1].toInt(&ok, 16);
            Q_ASSERT(ok);
        }

        BlockInfo info = { nameIndex, first, last };
        blockInfoList.append(info);
    }
}
#endif
1747
// Script names (blanks and underscores removed) in order of first appearance;
// readScripts() puts "Common" first. A script's position in this list is the
// index stored in scriptAssignment.
static QList<QByteArray> scriptNames;
// Maps a code point to the index of its script in scriptNames; filled by
// readScripts().
static QHash<int, int> scriptAssignment;
// NOTE(review): not referenced in the visible part of the file; meaning to be
// confirmed where it is used.
static QHash<int, int> scriptHash;

// NOTE(review): users of this type are not visible here; presumably a block
// number together with the values belonging to that block - TODO confirm.
struct ExtraBlock {
    int block;
    QVector<int> vector;
};

static QList<ExtraBlock> extraBlockList;
1758
1759
1760 static void readScripts()
1761 {
1762     scriptNames.append("Common");
1763
1764     static const char *files[] = {
1765         "data/ScriptsInitial.txt",
1766         "data/Scripts.txt",
1767         "data/ScriptsCorrections.txt"
1768     };
1769     enum { fileCount = sizeof(files) / sizeof(const char *) };
1770
1771     for (int i = 0; i < fileCount; ++i) {
1772         QFile f(files[i]);
1773         if (!f.exists())
1774             qFatal("Couldn't find %s", files[i]);
1775
1776         f.open(QFile::ReadOnly);
1777
1778         while (!f.atEnd()) {
1779             QByteArray line = f.readLine();
1780             line.resize(line.size() - 1);
1781
1782             int comment = line.indexOf("#");
1783             if (comment >= 0)
1784                 line = line.left(comment);
1785
1786             line.replace(" ", "");
1787             line.replace("_", "");
1788
1789             if (line.isEmpty())
1790                 continue;
1791
1792             int semicolon = line.indexOf(';');
1793             Q_ASSERT(semicolon >= 0);
1794             QByteArray codePoints = line.left(semicolon);
1795             QByteArray scriptName = line.mid(semicolon + 1);
1796
1797             int scriptIndex = scriptNames.indexOf(scriptName);
1798             if (scriptIndex == -1) {
1799                 scriptIndex = scriptNames.size();
1800                 scriptNames.append(scriptName);
1801             }
1802
1803             codePoints.replace("..", ".");
1804             QList<QByteArray> cl = codePoints.split('.');
1805
1806             bool ok;
1807             int first = cl[0].toInt(&ok, 16);
1808             Q_ASSERT(ok);
1809             int last = first;
1810             if (cl.size() == 2) {
1811                 last = cl[1].toInt(&ok, 16);
1812                 Q_ASSERT(ok);
1813             }
1814
1815             for (int i = first; i <= last; ++i)
1816                 scriptAssignment[i] = scriptIndex;
1817         }
1818     }
1819 }
1820
1821
// Threshold written to the generated code as ScriptSentinel: table entries
// below it are script values, entries at or above it select an extra block.
// Computed by createScriptEnumDeclaration(); 0 until then.
static int scriptSentinel = 0;
1823
1824 QByteArray createScriptEnumDeclaration()
1825 {
1826     static const char *specialScripts[] = {
1827         "Common",
1828         "Arabic",
1829         "Armenian",
1830         "Bengali",
1831         "Cyrillic",
1832         "Devanagari",
1833         "Georgian",
1834         "Greek",
1835         "Gujarati",
1836         "Gurmukhi",
1837         "Hangul",
1838         "Hebrew",
1839         "Kannada",
1840         "Khmer",
1841         "Lao",
1842         "Malayalam",
1843         "Myanmar",
1844         "Nko",
1845         "Ogham",
1846         "Oriya",
1847         "Runic",
1848         "Sinhala",
1849         "Syriac",
1850         "Tamil",
1851         "Telugu",
1852         "Thaana",
1853         "Thai",
1854         "Tibetan",
1855         "Inherited"
1856     };
1857     const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
1858
1859     // generate script enum
1860     QByteArray declaration;
1861
1862     declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n";
1863     declaration += "    enum Script {\n        Common";
1864
1865     int uniqueScripts = 1; // Common
1866
1867     // output the ones with special processing first
1868     for (int i = 1; i < scriptNames.size(); ++i) {
1869         QByteArray scriptName = scriptNames.at(i);
1870         // does the script require special processing?
1871         bool special = false;
1872         for (int s = 0; s < specialScriptsCount; ++s) {
1873             if (scriptName == specialScripts[s]) {
1874                 special = true;
1875                 break;
1876             }
1877         }
1878         if (!special) {
1879             scriptHash[i] = 0; // alias for 'Common'
1880             continue;
1881         } else {
1882             ++uniqueScripts;
1883             scriptHash[i] = i;
1884         }
1885
1886         if (scriptName != "Inherited") {
1887             declaration += ",\n        ";
1888             declaration += scriptName;
1889         }
1890     }
1891     declaration += ",\n        Inherited";
1892     declaration += ",\n        ScriptCount = Inherited";
1893
1894     // output the ones that are an alias for 'Common'
1895     for (int i = 1; i < scriptNames.size(); ++i) {
1896         if (scriptHash.value(i) != 0)
1897             continue;
1898         declaration += ",\n        ";
1899         declaration += scriptNames.at(i);
1900         declaration += " = Common";
1901     }
1902
1903     declaration += "\n    };\n";
1904
1905     scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
1906     declaration += "    enum { ScriptSentinel = ";
1907     declaration += QByteArray::number(scriptSentinel);
1908     declaration += " };\n\n";
1909     return declaration;
1910 }
1911
1912 QByteArray createScriptTableDeclaration()
1913 {
1914     Q_ASSERT(scriptSentinel > 0);
1915
1916     QByteArray declaration;
1917
1918     const int unicodeBlockCount = 512; // number of unicode blocks
1919     const int unicodeBlockSize = 128; // size of each block
1920     declaration = "enum { UnicodeBlockCount = ";
1921     declaration += QByteArray::number(unicodeBlockCount);
1922     declaration += " }; // number of unicode blocks\n";
1923     declaration += "enum { UnicodeBlockSize = ";
1924     declaration += QByteArray::number(unicodeBlockSize);
1925     declaration += " }; // size of each block\n\n";
1926
1927     // script table
1928     declaration += "static const unsigned char uc_scripts[] = {\n";
1929     for (int i = 0; i < unicodeBlockCount; ++i) {
1930         int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
1931         int blockAssignment[unicodeBlockSize];
1932         for (int x = 0; x < unicodeBlockSize; ++x) {
1933             int codePoint = (i << 7) | x;
1934             blockAssignment[x] = scriptAssignment.value(codePoint, 0);
1935         }
1936         bool allTheSame = true;
1937         const int originalScript = blockAssignment[0];
1938         const int script = scriptHash.value(originalScript);
1939         for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
1940             const int s = scriptHash.value(blockAssignment[x]);
1941             if (s != script)
1942                 allTheSame = false;
1943         }
1944
1945         if (allTheSame) {
1946             declaration += "    ";
1947             declaration += scriptNames.value(originalScript);
1948             declaration += ", /* U+";
1949             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1950             declaration += '-';
1951             declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1952             declaration += " */\n";
1953         } else {
1954             const int value = extraBlockList.size() + scriptSentinel;
1955             const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1956
1957             declaration += "    ";
1958             declaration += QByteArray::number(value);
1959             declaration += ", /* U+";
1960             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1961             declaration += '-';
1962             declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1963             declaration += " at offset ";
1964             declaration += QByteArray::number(offset);
1965             declaration += " */\n";
1966
1967             ExtraBlock extraBlock;
1968             extraBlock.block = block;
1969             extraBlock.vector.resize(unicodeBlockSize);
1970             for (int x = 0; x < unicodeBlockSize; ++x)
1971                 extraBlock.vector[x] = blockAssignment[x];
1972
1973             extraBlockList.append(extraBlock);
1974         }
1975     }
1976
1977     for (int i = 0; i < extraBlockList.size(); ++i) {
1978         const int value = i + scriptSentinel;
1979         const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1980         const ExtraBlock &extraBlock = extraBlockList.at(i);
1981         const int block = extraBlock.block;
1982
1983         declaration += "\n\n    /* U+";
1984         declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1985         declaration += '-';
1986         declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1987         declaration += " at offset ";
1988         declaration += QByteArray::number(offset);
1989         declaration += " */\n    ";
1990
1991         for (int x = 0; x < extraBlock.vector.size(); ++x) {
1992             const int o = extraBlock.vector.at(x);
1993
1994             declaration += scriptNames.value(o);
1995             if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
1996                 declaration += ',';
1997             if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
1998                 declaration += "\n    ";
1999             else
2000                 declaration += ' ';
2001         }
2002         if (declaration.endsWith(' '))
2003             declaration.chop(1);
2004     }
2005     declaration += "\n};\n\n";
2006
2007     declaration +=
2008             "Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4)\n"
2009             "{\n"
2010             "    if (ucs4 > 0xffff)\n"
2011             "        return Common;\n"
2012             "    int script = uc_scripts[ucs4 >> 7];\n"
2013             "    if (script < ScriptSentinel)\n"
2014             "        return script;\n"
2015             "    script = (((script - ScriptSentinel) * UnicodeBlockSize) + UnicodeBlockCount);\n"
2016             "    script = uc_scripts[script + (ucs4 & 0x7f)];\n"
2017             "    return script;\n"
2018             "}\n\n";
2019
2020     qDebug("createScriptTableDeclaration:");
2021     qDebug("    memory usage: %d bytes", unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));
2022
2023     return declaration;
2024 }
2025
2026 #if 0
2027 static void dump(int from, int to)
2028 {
2029     for (int i = from; i <= to; ++i) {
2030         UnicodeData d = unicodeData.value(i, UnicodeData(i));
2031         qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
2032                i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
2033         if (d.decompositionType != QChar::NoDecomposition) {
2034             qDebug("    decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
2035                    d.decomposition[0]);
2036         }
2037     }
2038     qDebug(" ");
2039 }
2040 #endif
2041
2042 struct PropertyBlock {
2043     PropertyBlock() { index = -1; }
2044     int index;
2045     QList<int> properties;
2046     bool operator==(const PropertyBlock &other)
2047     { return properties == other.properties; }
2048 };
2049
2050 static QByteArray createPropertyInfo()
2051 {
2052     qDebug("createPropertyInfo:");
2053
2054     // we reserve one bit more than in the assert below for the sign
2055     Q_ASSERT(maxMirroredDiff < (1<<12));
2056     Q_ASSERT(maxLowerCaseDiff < (1<<14));
2057     Q_ASSERT(maxUpperCaseDiff < (1<<14));
2058     Q_ASSERT(maxTitleCaseDiff < (1<<14));
2059     Q_ASSERT(maxCaseFoldDiff < (1<<14));
2060
2061     const int BMP_BLOCKSIZE = 32;
2062     const int BMP_SHIFT = 5;
2063     const int BMP_END = 0x11000;
2064     const int SMP_END = 0x110000;
2065     const int SMP_BLOCKSIZE = 256;
2066     const int SMP_SHIFT = 8;
2067
2068     QList<PropertyBlock> blocks;
2069     QList<int> blockMap;
2070
2071     int used = 0;
2072
2073     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2074         PropertyBlock b;
2075         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2076             int uc = block*BMP_BLOCKSIZE + i;
2077             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2078             b.properties.append(d.propertyIndex);
2079         }
2080         int index = blocks.indexOf(b);
2081         if (index == -1) {
2082             index = blocks.size();
2083             b.index = used;
2084             used += BMP_BLOCKSIZE;
2085             blocks.append(b);
2086         }
2087         blockMap.append(blocks.at(index).index);
2088     }
2089
2090     int bmp_blocks = blocks.size();
2091     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2092
2093     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2094         PropertyBlock b;
2095         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2096             int uc = block*SMP_BLOCKSIZE + i;
2097             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2098             b.properties.append(d.propertyIndex);
2099         }
2100         int index = blocks.indexOf(b);
2101         if (index == -1) {
2102             index = blocks.size();
2103             b.index = used;
2104             used += SMP_BLOCKSIZE;
2105             blocks.append(b);
2106         }
2107         blockMap.append(blocks.at(index).index);
2108     }
2109
2110     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2111     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2112     int bmp_mem = bmp_block_data + bmp_trie;
2113     qDebug("    %d unique blocks in BMP.", blocks.size());
2114     qDebug("        block data uses: %d bytes", bmp_block_data);
2115     qDebug("        trie data uses : %d bytes", bmp_trie);
2116
2117     int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2118     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2119     int smp_mem = smp_block_data + smp_trie;
2120     qDebug("    %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2121     qDebug("        block data uses: %d bytes", smp_block_data);
2122     qDebug("        trie data uses : %d bytes", smp_trie);
2123
2124     qDebug("\n        properties uses : %d bytes", uniqueProperties.size() * SizeOfPropertiesStruct);
2125     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + uniqueProperties.size() * SizeOfPropertiesStruct);
2126
2127     QByteArray out;
2128     out += "static const unsigned short uc_property_trie[] = {\n";
2129
2130     // first write the map
2131     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2132     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2133         if (!(i % 8)) {
2134             if (out.endsWith(' '))
2135                 out.chop(1);
2136             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2137                 out += "\n";
2138             out += "\n    ";
2139         }
2140         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2141         out += ", ";
2142     }
2143     if (out.endsWith(' '))
2144         out.chop(1);
2145     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2146     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2147         if (!(i % 8)) {
2148             if (out.endsWith(' '))
2149                 out.chop(1);
2150             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2151                 out += "\n";
2152             out += "\n    ";
2153         }
2154         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2155         out += ", ";
2156     }
2157     if (out.endsWith(' '))
2158         out.chop(1);
2159     out += "\n";
2160     // write the data
2161     for (int i = 0; i < blocks.size(); ++i) {
2162         if (out.endsWith(' '))
2163             out.chop(1);
2164         out += "\n";
2165         const PropertyBlock &b = blocks.at(i);
2166         for (int j = 0; j < b.properties.size(); ++j) {
2167             if (!(j % 8)) {
2168                 if (out.endsWith(' '))
2169                     out.chop(1);
2170                 out += "\n    ";
2171             }
2172             out += QByteArray::number(b.properties.at(j));
2173             out += ", ";
2174         }
2175     }
2176     if (out.endsWith(' '))
2177         out.chop(1);
2178     out += "\n};\n\n"
2179
2180            "#define GET_PROP_INDEX(ucs4) \\\n"
2181            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2182            "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2183            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2184            "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2185            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2186            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
2187            "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
2188            "       (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
2189            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
2190
2191            "static const Properties uc_properties[] = {";
2192
2193     // keep in sync with the property declaration
2194     for (int i = 0; i < uniqueProperties.size(); ++i) {
2195         PropertyFlags p = uniqueProperties.at(i);
2196         out += "\n    { ";
2197 //     "        ushort category : 8;\n"
2198         out += QByteArray::number( p.category );
2199         out += ", ";
2200 //     "        ushort line_break_class : 8;\n"
2201         out += QByteArray::number( p.line_break_class );
2202         out += ", ";
2203 //     "        ushort direction : 8;\n"
2204         out += QByteArray::number( p.direction );
2205         out += ", ";
2206 //     "        ushort combiningClass :8;\n"
2207         out += QByteArray::number( p.combiningClass );
2208         out += ", ";
2209 //     "        ushort joining : 2;\n"
2210         out += QByteArray::number( p.joining );
2211         out += ", ";
2212 //     "        signed short digitValue : 6;\n /* 5 needed */"
2213         out += QByteArray::number( p.digitValue );
2214         out += ", ";
2215 //     "        ushort unicodeVersion : 4;\n"
2216         out += QByteArray::number( p.age );
2217         out += ", ";
2218 //     "        ushort lowerCaseSpecial : 1;\n"
2219 //     "        ushort upperCaseSpecial : 1;\n"
2220 //     "        ushort titleCaseSpecial : 1;\n"
2221 //     "        ushort caseFoldSpecial : 1;\n"
2222         out += QByteArray::number( p.lowerCaseSpecial );
2223         out += ", ";
2224         out += QByteArray::number( p.upperCaseSpecial );
2225         out += ", ";
2226         out += QByteArray::number( p.titleCaseSpecial );
2227         out += ", ";
2228         out += QByteArray::number( p.caseFoldSpecial );
2229         out += ", ";
2230 //     "        signed short mirrorDiff : 16;\n"
2231 //     "        signed short lowerCaseDiff : 16;\n"
2232 //     "        signed short upperCaseDiff : 16;\n"
2233 //     "        signed short titleCaseDiff : 16;\n"
2234 //     "        signed short caseFoldDiff : 16;\n"
2235         out += QByteArray::number( p.mirrorDiff );
2236         out += ", ";
2237         out += QByteArray::number( p.lowerCaseDiff );
2238         out += ", ";
2239         out += QByteArray::number( p.upperCaseDiff );
2240         out += ", ";
2241         out += QByteArray::number( p.titleCaseDiff );
2242         out += ", ";
2243         out += QByteArray::number( p.caseFoldDiff );
2244         out += ", ";
2245         out += QByteArray::number( p.graphemeBreak );
2246         out += ", ";
2247         out += QByteArray::number( p.wordBreak );
2248         out += ", ";
2249         out += QByteArray::number( p.sentenceBreak );
2250         out += " },";
2251     }
2252     out.chop(1);
2253     out += "\n};\n\n";
2254
2255
2256     out += "static inline const Properties *qGetProp(uint ucs4)\n"
2257            "{\n"
2258            "    const int index = GET_PROP_INDEX(ucs4);\n"
2259            "    return uc_properties + index;\n"
2260            "}\n"
2261            "\n"
2262            "static inline const Properties *qGetProp(ushort ucs2)\n"
2263            "{\n"
2264            "    const int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2265            "    return uc_properties + index;\n"
2266            "}\n"
2267            "\n"
2268            "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4)\n"
2269            "{\n"
2270            "    return qGetProp(ucs4);\n"
2271            "}\n"
2272            "\n"
2273            "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2)\n"
2274            "{\n"
2275            "    return qGetProp(ucs2);\n"
2276            "}\n\n";
2277
2278     out += "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4)\n"
2279            "{\n"
2280            "    return (LineBreakClass)qGetProp(ucs4)->line_break_class;\n"
2281            "}\n\n";
2282
2283     return out;
2284 }
2285
2286 static QByteArray createSpecialCaseMap()
2287 {
2288     qDebug("createSpecialCaseMap:");
2289
2290     QByteArray out;
2291     out += "static const ushort specialCaseMap[] = {\n"
2292            "    0x0, // placeholder";
2293     int i = 1;
2294     while (i < specialCaseMap.size()) {
2295         out += "\n   ";
2296         int n = specialCaseMap.at(i);
2297         for (int j = 0; j <= n; ++j) {
2298             out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
2299             out += ",";
2300         }
2301         i += n + 1;
2302     }
2303     out.chop(1);
2304     out += "\n};\n\n";
2305
2306     qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2);
2307
2308     return out;
2309 }
2310
2311
2312 struct DecompositionBlock {
2313     DecompositionBlock() { index = -1; }
2314     int index;
2315     QList<int> decompositionPositions;
2316     bool operator ==(const DecompositionBlock &other)
2317     { return decompositionPositions == other.decompositionPositions; }
2318 };
2319
2320 static QByteArray createCompositionInfo()
2321 {
2322     qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);
2323
2324     const int BMP_BLOCKSIZE = 16;
2325     const int BMP_SHIFT = 4;
2326     const int BMP_END = 0x3400; // start of Han
2327     const int SMP_END = 0x30000;
2328     const int SMP_BLOCKSIZE = 256;
2329     const int SMP_SHIFT = 8;
2330
2331     if (SMP_END <= highestComposedCharacter)
2332         qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);
2333
2334     QList<DecompositionBlock> blocks;
2335     QList<int> blockMap;
2336     QList<unsigned short> decompositions;
2337
2338     int used = 0;
2339     int tableIndex = 0;
2340
2341     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2342         DecompositionBlock b;
2343         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2344             int uc = block*BMP_BLOCKSIZE + i;
2345             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2346             if (!d.decomposition.isEmpty()) {
2347                 int utf16Length = 0;
2348                 decompositions.append(0);
2349                 for (int j = 0; j < d.decomposition.size(); ++j) {
2350                     int code = d.decomposition.at(j);
2351                     if (QChar::requiresSurrogates(code)) {
2352                         // save as surrogate pair
2353                         decompositions.append(QChar::highSurrogate(code));
2354                         decompositions.append(QChar::lowSurrogate(code));
2355                         utf16Length += 2;
2356                     } else {
2357                         decompositions.append(code);
2358                         utf16Length++;
2359                     }
2360                 }
2361                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2362                 b.decompositionPositions.append(tableIndex);
2363                 tableIndex += utf16Length + 1;
2364             } else {
2365                 b.decompositionPositions.append(0xffff);
2366             }
2367         }
2368         int index = blocks.indexOf(b);
2369         if (index == -1) {
2370             index = blocks.size();
2371             b.index = used;
2372             used += BMP_BLOCKSIZE;
2373             blocks.append(b);
2374         }
2375         blockMap.append(blocks.at(index).index);
2376     }
2377
2378     int bmp_blocks = blocks.size();
2379     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2380
2381     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2382         DecompositionBlock b;
2383         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2384             int uc = block*SMP_BLOCKSIZE + i;
2385             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2386             if (!d.decomposition.isEmpty()) {
2387                 int utf16Length = 0;
2388                 decompositions.append(0);
2389                 for (int j = 0; j < d.decomposition.size(); ++j) {
2390                     int code = d.decomposition.at(j);
2391                     if (QChar::requiresSurrogates(code)) {
2392                         // save as surrogate pair
2393                         decompositions.append(QChar::highSurrogate(code));
2394                         decompositions.append(QChar::lowSurrogate(code));
2395                         utf16Length += 2;
2396                     } else {
2397                         decompositions.append(code);
2398                         utf16Length++;
2399                     }
2400                 }
2401                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2402                 b.decompositionPositions.append(tableIndex);
2403                 tableIndex += utf16Length + 1;
2404             } else {
2405                 b.decompositionPositions.append(0xffff);
2406             }
2407         }
2408         int index = blocks.indexOf(b);
2409         if (index == -1) {
2410             index = blocks.size();
2411             b.index = used;
2412             used += SMP_BLOCKSIZE;
2413             blocks.append(b);
2414         }
2415         blockMap.append(blocks.at(index).index);
2416     }
2417
2418     // if the condition below doesn't hold anymore we need to modify our decomposition code
2419     Q_ASSERT(tableIndex < 0xffff);
2420
2421     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2422     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2423     int bmp_mem = bmp_block_data + bmp_trie;
2424     qDebug("    %d unique blocks in BMP.", blocks.size());
2425     qDebug("        block data uses: %d bytes", bmp_block_data);
2426     qDebug("        trie data uses : %d bytes", bmp_trie);
2427     qDebug("        memory usage: %d bytes", bmp_mem);
2428
2429     int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2430     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2431     int smp_mem = smp_block_data + smp_trie;
2432     qDebug("    %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2433     qDebug("        block data uses: %d bytes", smp_block_data);
2434     qDebug("        trie data uses : %d bytes", smp_trie);
2435
2436     qDebug("\n        decomposition table uses : %d bytes", decompositions.size()*2);
2437     qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
2438
2439     QByteArray out;
2440
2441     out += "static const unsigned short uc_decomposition_trie[] = {\n";
2442
2443     // first write the map
2444     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2445     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2446         if (!(i % 8)) {
2447             if (out.endsWith(' '))
2448                 out.chop(1);
2449             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2450                 out += "\n";
2451             out += "\n    ";
2452         }
2453         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2454         out += ", ";
2455     }
2456     if (out.endsWith(' '))
2457         out.chop(1);
2458     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2459     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2460         if (!(i % 8)) {
2461             if (out.endsWith(' '))
2462                 out.chop(1);
2463             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2464                 out += "\n";
2465             out += "\n    ";
2466         }
2467         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2468         out += ", ";
2469     }
2470     if (out.endsWith(' '))
2471         out.chop(1);
2472     out += "\n";
2473     // write the data
2474     for (int i = 0; i < blocks.size(); ++i) {
2475         if (out.endsWith(' '))
2476             out.chop(1);
2477         out += "\n";
2478         const DecompositionBlock &b = blocks.at(i);
2479         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2480             if (!(j % 8)) {
2481                 if (out.endsWith(' '))
2482                     out.chop(1);
2483                 out += "\n    ";
2484             }
2485             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2486             out += ", ";
2487         }
2488     }
2489     if (out.endsWith(' '))
2490         out.chop(2);
2491     out += "\n};\n\n"
2492
2493            "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2494            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2495            "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2496            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2497            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2498            "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2499            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2500            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2501            "           : 0xffff))\n\n"
2502
2503            "static const unsigned short uc_decomposition_map[] = {";
2504
2505     for (int i = 0; i < decompositions.size(); ++i) {
2506         if (!(i % 8)) {
2507             if (out.endsWith(' '))
2508                 out.chop(1);
2509             out += "\n    ";
2510         }
2511         out += "0x" + QByteArray::number(decompositions.at(i), 16);
2512         out += ", ";
2513     }
2514     if (out.endsWith(' '))
2515         out.chop(2);
2516     out += "\n};\n\n";
2517
2518     return out;
2519 }
2520
2521 static QByteArray createLigatureInfo()
2522 {
2523     qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
2524
2525     foreach (const QList<Ligature> &l, ligatureHashes) {
2526         for (int j = 0; j < l.size(); ++j) {
2527             // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2528             Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2529                      QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
2530         }
2531     }
2532
2533     const int BMP_BLOCKSIZE = 32;
2534     const int BMP_SHIFT = 5;
2535     const int BMP_END = 0x3100;
2536     const int SMP_END = 0x12000;
2537     const int SMP_BLOCKSIZE = 256;
2538     const int SMP_SHIFT = 8;
2539
2540     if (SMP_END <= highestLigature)
2541         qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
2542
2543     QList<DecompositionBlock> blocks;
2544     QList<int> blockMap;
2545     QList<unsigned short> ligatures;
2546
2547     int used = 0;
2548     int tableIndex = 0;
2549
2550     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2551         DecompositionBlock b;
2552         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2553             int uc = block*BMP_BLOCKSIZE + i;
2554             QList<Ligature> l = ligatureHashes.value(uc);
2555             if (!l.isEmpty()) {
2556                 Q_ASSERT(!QChar::requiresSurrogates(uc));
2557                 qSort(l); // needed for bsearch in ligatureHelper code
2558
2559                 ligatures.append(l.size());
2560                 for (int j = 0; j < l.size(); ++j) {
2561                     ligatures.append(l.at(j).u1);
2562                     ligatures.append(l.at(j).ligature);
2563                 }
2564                 b.decompositionPositions.append(tableIndex);
2565                 tableIndex += 2*l.size() + 1;
2566             } else {
2567                 b.decompositionPositions.append(0xffff);
2568             }
2569         }
2570         int index = blocks.indexOf(b);
2571         if (index == -1) {
2572             index = blocks.size();
2573             b.index = used;
2574             used += BMP_BLOCKSIZE;
2575             blocks.append(b);
2576         }
2577         blockMap.append(blocks.at(index).index);
2578     }
2579
2580     int bmp_blocks = blocks.size();
2581     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2582
2583     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2584         DecompositionBlock b;
2585         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2586             int uc = block*SMP_BLOCKSIZE + i;
2587             QList<Ligature> l = ligatureHashes.value(uc);
2588             if (!l.isEmpty()) {
2589                 Q_ASSERT(QChar::requiresSurrogates(uc));
2590                 qSort(l); // needed for bsearch in ligatureHelper code
2591
2592                 ligatures.append(l.size());
2593                 for (int j = 0; j < l.size(); ++j) {
2594                     ligatures.append(QChar::highSurrogate(l.at(j).u1));
2595                     ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2596                     ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2597                     ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2598                 }
2599                 b.decompositionPositions.append(tableIndex);
2600                 tableIndex += 4*l.size() + 1;
2601             } else {
2602                 b.decompositionPositions.append(0xffff);
2603             }
2604         }
2605         int index = blocks.indexOf(b);
2606         if (index == -1) {
2607             index = blocks.size();
2608             b.index = used;
2609             used += SMP_BLOCKSIZE;
2610             blocks.append(b);
2611         }
2612         blockMap.append(blocks.at(index).index);
2613     }
2614
2615     // if the condition below doesn't hold anymore we need to modify our composition code
2616     Q_ASSERT(tableIndex < 0xffff);
2617
2618     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2619     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2620     int bmp_mem = bmp_block_data + bmp_trie;
2621     qDebug("    %d unique blocks in BMP.", blocks.size());
2622     qDebug("        block data uses: %d bytes", bmp_block_data);
2623     qDebug("        trie data uses : %d bytes", bmp_trie);
2624     qDebug("        memory usage: %d bytes", bmp_mem);
2625
2626     int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2627     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2628     int smp_mem = smp_block_data + smp_trie;
2629     qDebug("    %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2630     qDebug("        block data uses: %d bytes", smp_block_data);
2631     qDebug("        trie data uses : %d bytes", smp_trie);
2632
2633     qDebug("\n        ligature data uses : %d bytes", ligatures.size()*2);
2634     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + ligatures.size() * 2);
2635
2636     QByteArray out;
2637
2638     out += "static const unsigned short uc_ligature_trie[] = {\n";
2639
2640     // first write the map
2641     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2642     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2643         if (!(i % 8)) {
2644             if (out.endsWith(' '))
2645                 out.chop(1);
2646             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2647                 out += "\n";
2648             out += "\n    ";
2649         }
2650         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2651         out += ", ";
2652     }
2653     if (out.endsWith(' '))
2654         out.chop(1);
2655     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2656     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2657         if (!(i % 8)) {
2658             if (out.endsWith(' '))
2659                 out.chop(1);
2660             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2661                 out += "\n";
2662             out += "\n    ";
2663         }
2664         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2665         out += ", ";
2666     }
2667     if (out.endsWith(' '))
2668         out.chop(1);
2669     out += "\n";
2670     // write the data
2671     for (int i = 0; i < blocks.size(); ++i) {
2672         if (out.endsWith(' '))
2673             out.chop(1);
2674         out += "\n";
2675         const DecompositionBlock &b = blocks.at(i);
2676         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2677             if (!(j % 8)) {
2678                 if (out.endsWith(' '))
2679                     out.chop(1);
2680                 out += "\n    ";
2681             }
2682             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2683             out += ", ";
2684         }
2685     }
2686     if (out.endsWith(' '))
2687         out.chop(2);
2688     out += "\n};\n\n"
2689
2690            "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2691            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2692            "        ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2693            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2694            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2695            "           ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2696            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2697            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2698            "           : 0xffff))\n\n"
2699
2700            "static const unsigned short uc_ligature_map[] = {";
2701
2702     for (int i = 0; i < ligatures.size(); ++i) {
2703         if (!(i % 8)) {
2704             if (out.endsWith(' '))
2705                 out.chop(1);
2706             out += "\n    ";
2707         }
2708         out += "0x" + QByteArray::number(ligatures.at(i), 16);
2709         out += ", ";
2710     }
2711     if (out.endsWith(' '))
2712         out.chop(2);
2713     out += "\n};\n\n";
2714
2715     return out;
2716 }
2717
2718 QByteArray createCasingInfo()
2719 {
2720     QByteArray out;
2721
2722     out += "struct CasingInfo {\n"
2723            "    uint codePoint : 16;\n"
2724            "    uint flags : 8;\n"
2725            "    uint offset : 8;\n"
2726            "};\n\n";
2727
2728     return out;
2729 }
2730
2731
2732 int main(int, char **)
2733 {
2734     initAgeMap();
2735     initCategoryMap();
2736     initDecompositionMap();
2737     initDirectionMap();
2738     initJoiningMap();
2739     initGraphemeBreak();
2740     initWordBreak();
2741     initSentenceBreak();
2742     initLineBreak();
2743
2744     readUnicodeData();
2745     readBidiMirroring();
2746     readArabicShaping();
2747     readDerivedAge();
2748     readDerivedNormalizationProps();
2749     readSpecialCasing();
2750     readCaseFolding();
2751     // readBlocks();
2752     readScripts();
2753     readGraphemeBreak();
2754     readWordBreak();
2755     readSentenceBreak();
2756     readLineBreak();
2757
2758     computeUniqueProperties();
2759     QByteArray properties = createPropertyInfo();
2760     QByteArray specialCases = createSpecialCaseMap();
2761     QByteArray compositions = createCompositionInfo();
2762     QByteArray ligatures = createLigatureInfo();
2763     QByteArray normalizationCorrections = createNormalizationCorrections();
2764     QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
2765     QByteArray scriptTableDeclaration = createScriptTableDeclaration();
2766
2767     QByteArray header =
2768         "/****************************************************************************\n"
2769         "**\n"
2770         "** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).\n"
2771         "** Contact: http://www.qt-project.org/\n"
2772         "**\n"
2773         "** This file is part of the QtCore module of the Qt Toolkit.\n"
2774         "**\n"
2775         "** $QT_BEGIN_LICENSE:LGPL$\n"
2776         "** GNU Lesser General Public License Usage\n"
2777         "** This file may be used under the terms of the GNU Lesser General Public\n"
2778         "** License version 2.1 as published by the Free Software Foundation and\n"
2779         "** appearing in the file LICENSE.LGPL included in the packaging of this\n"
2780         "** file. Please review the following information to ensure the GNU Lesser\n"
2781         "** General Public License version 2.1 requirements will be met:\n"
2782         "** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2783         "**\n"
2784         "** In addition, as a special exception, Nokia gives you certain additional\n"
2785         "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2786         "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2787         "**\n"
2788         "** GNU General Public License Usage\n"
2789         "** Alternatively, this file may be used under the terms of the GNU General\n"
2790         "** Public License version 3.0 as published by the Free Software Foundation\n"
2791         "** and appearing in the file LICENSE.GPL included in the packaging of this\n"
2792         "** file. Please review the following information to ensure the GNU General\n"
2793         "** Public License version 3.0 requirements will be met:\n"
2794         "** http://www.gnu.org/copyleft/gpl.html.\n"
2795         "**\n"
2796         "** Other Usage\n"
2797         "** Alternatively, this file may be used in accordance with the terms and\n"
2798         "** conditions contained in a signed written agreement between you and Nokia.\n"
2799         "**\n"
2800         "**\n"
2801         "**\n"
2802         "**\n"
2803         "**\n"
2804         "**\n"
2805         "** $QT_END_LICENSE$\n"
2806         "**\n"
2807         "****************************************************************************/\n\n";
2808
2809     QByteArray note =
2810         "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
2811
2812     QByteArray warning =
2813         "//\n"
2814         "//  W A R N I N G\n"
2815         "//  -------------\n"
2816         "//\n"
2817         "// This file is not part of the Qt API.  It exists for the convenience\n"
2818         "// of internal files.  This header file may change from version to version\n"
2819         "// without notice, or even be removed.\n"
2820         "//\n"
2821         "// We mean it.\n"
2822         "//\n\n";
2823
2824     QFile f("../../src/corelib/tools/qunicodetables.cpp");
2825     f.open(QFile::WriteOnly|QFile::Truncate);
2826     f.write(header);
2827     f.write(note);
2828     f.write("#include \"qunicodetables_p.h\"\n\n");
2829     f.write("QT_BEGIN_NAMESPACE\n\n");
2830     f.write("namespace QUnicodeTables {\n\n");
2831     f.write(properties);
2832     f.write("\n");
2833     f.write(specialCases);
2834     f.write("\n");
2835     f.write(compositions);
2836     f.write(ligatures);
2837     f.write("\n");
2838     f.write(normalizationCorrections);
2839     f.write("\n");
2840     f.write(scriptTableDeclaration);
2841     f.write("} // namespace QUnicodeTables\n\n");
2842     f.write("using namespace QUnicodeTables;\n\n");
2843     f.write("QT_END_NAMESPACE\n");
2844     f.close();
2845
2846     f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2847     f.open(QFile::WriteOnly | QFile::Truncate);
2848     f.write(header);
2849     f.write(note);
2850     f.write(warning);
2851     f.write("#ifndef QUNICODETABLES_P_H\n"
2852             "#define QUNICODETABLES_P_H\n\n"
2853             "#include <QtCore/qchar.h>\n\n"
2854             "QT_BEGIN_NAMESPACE\n\n");
2855     f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
2856     f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n");
2857     f.write("namespace QUnicodeTables {\n\n");
2858     f.write(property_string);
2859     f.write("\n");
2860     f.write(scriptEnumDeclaration);
2861     f.write("\n");
2862     f.write(grapheme_break_string);
2863     f.write("\n");
2864     f.write(word_break_string);
2865     f.write("\n");
2866     f.write(sentence_break_string);
2867     f.write("\n");
2868     f.write(line_break_class_string);
2869     f.write("\n");
2870     f.write(methods);
2871     f.write("} // namespace QUnicodeTables\n\n"
2872             "QT_END_NAMESPACE\n\n"
2873             "#endif // QUNICODETABLES_P_H\n");
2874     f.close();
2875
2876     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
2877     qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2878     qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2879     qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2880     qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
2881 #if 0
2882 //     dump(0, 0x7f);
2883 //     dump(0x620, 0x640);
2884 //     dump(0x10000, 0x10020);
2885 //     dump(0x10800, 0x10820);
2886
2887     qDebug("decompositionLength used:");
2888     int totalcompositions = 0;
2889     int sum = 0;
2890     for (int i = 1; i < 20; ++i) {
2891         qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
2892         totalcompositions += i*decompositionLength.value(i, 0);
2893         sum += decompositionLength.value(i, 0);
2894     }
2895     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
2896            totalcompositions, (float)totalcompositions/(float)sum, sum);
2897     qDebug("highest composed character %x", highestComposedCharacter);
2898     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
2899
2900     qBubbleSort(ligatures);
2901     for (int i = 0; i < ligatures.size(); ++i)
2902         qDebug("%s", ligatures.at(i).data());
2903
2904 //     qDebug("combiningClass usage:");
2905 //     int numClasses = 0;
2906 //     for (int i = 0; i < 255; ++i) {
2907 //         int num = combiningClassUsage.value(i, 0);
2908 //         if (num) {
2909 //             ++numClasses;
2910 //             qDebug("    combiningClass %d used %d times", i, num);
2911 //         }
2912 //     }
2913 //     qDebug("total of %d combining classes used", numClasses);
2914
2915 #endif
2916 }