1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the utils of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
45 #include <qbytearray.h>
51 #include <private/qunicodetables_p.h>
54 #define DATA_VERSION_S "5.0"
55 #define DATA_VERSION_STR "QChar::Unicode_5_0"
57 #define LAST_CODEPOINT 0x10ffff
58 #define LAST_CODEPOINT_STR "0x10ffff"
61 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
63 static void initAgeMap()
66 const QChar::UnicodeVersion version;
69 { QChar::Unicode_1_1, "1.1" },
70 { QChar::Unicode_2_0, "2.0" },
71 { QChar::Unicode_2_1_2, "2.1" },
72 { QChar::Unicode_3_0, "3.0" },
73 { QChar::Unicode_3_1, "3.1" },
74 { QChar::Unicode_3_2, "3.2" },
75 { QChar::Unicode_4_0, "4.0" },
76 { QChar::Unicode_4_1, "4.1" },
77 { QChar::Unicode_5_0, "5.0" },
78 { QChar::Unicode_Unassigned, 0 }
82 age_map.insert(d->age, d->version);
99 static QHash<QByteArray, Joining> joining_map;
101 static void initJoiningMap()
107 { Joining_None, "U" },
108 { Joining_Left, "L" },
109 { Joining_Causing, "C" },
110 { Joining_Dual, "D" },
111 { Joining_Right, "R" },
112 { Joining_Transparent, "T" },
113 { Joining_Unassigned, 0 }
115 JoiningList *d = joinings;
117 joining_map.insert(d->name, d->joining);
123 static const char *grapheme_break_string =
124 " enum GraphemeBreak {\n"
125 " GraphemeBreakOther,\n"
126 " GraphemeBreakCR,\n"
127 " GraphemeBreakLF,\n"
128 " GraphemeBreakControl,\n"
129 " GraphemeBreakExtend,\n"
133 " GraphemeBreakLV,\n"
134 " GraphemeBreakLVT\n"
141 GraphemeBreakControl,
149 , GraphemeBreak_Unassigned
152 static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
154 static void initGraphemeBreak()
156 struct GraphemeBreakList {
160 { GraphemeBreakOther, "Other" },
161 { GraphemeBreakCR, "CR" },
162 { GraphemeBreakLF, "LF" },
163 { GraphemeBreakControl, "Control" },
164 { GraphemeBreakExtend, "Extend" },
165 { GraphemeBreakL, "L" },
166 { GraphemeBreakV, "V" },
167 { GraphemeBreakT, "T" },
168 { GraphemeBreakLV, "LV" },
169 { GraphemeBreakLVT, "LVT" },
170 { GraphemeBreak_Unassigned, 0 }
172 GraphemeBreakList *d = breaks;
174 grapheme_break_map.insert(d->name, d->brk);
180 static const char *word_break_string =
181 " enum WordBreak {\n"
183 " WordBreakFormat,\n"
184 " WordBreakKatakana,\n"
185 " WordBreakALetter,\n"
186 " WordBreakMidLetter,\n"
187 " WordBreakMidNum,\n"
188 " WordBreakNumeric,\n"
189 " WordBreakExtendNumLet\n"
200 WordBreakExtendNumLet
202 , WordBreak_Unassigned
205 static QHash<QByteArray, WordBreak> word_break_map;
207 static void initWordBreak()
209 struct WordBreakList {
213 { WordBreakFormat, "Format" },
214 { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
215 { WordBreakKatakana, "Katakana" },
216 { WordBreakALetter, "ALetter" },
217 { WordBreakMidLetter, "MidLetter" },
218 { WordBreakMidNum, "MidNum" },
219 { WordBreakNumeric, "Numeric" },
220 { WordBreakExtendNumLet, "ExtendNumLet" },
221 { WordBreak_Unassigned, 0 }
223 WordBreakList *d = breaks;
225 word_break_map.insert(d->name, d->brk);
231 static const char *sentence_break_string =
232 " enum SentenceBreak {\n"
233 " SentenceBreakOther,\n"
234 " SentenceBreakSep,\n"
235 " SentenceBreakFormat,\n"
236 " SentenceBreakSp,\n"
237 " SentenceBreakLower,\n"
238 " SentenceBreakUpper,\n"
239 " SentenceBreakOLetter,\n"
240 " SentenceBreakNumeric,\n"
241 " SentenceBreakATerm,\n"
242 " SentenceBreakSTerm,\n"
243 " SentenceBreakClose\n"
253 SentenceBreakOLetter,
254 SentenceBreakNumeric,
259 , SentenceBreak_Unassigned
262 static QHash<QByteArray, SentenceBreak> sentence_break_map;
264 static void initSentenceBreak()
266 struct SentenceBreakList {
270 { SentenceBreakOther, "Other" },
271 { SentenceBreakSep, "Sep" },
272 { SentenceBreakFormat, "Format" },
273 { SentenceBreakSp, "Sp" },
274 { SentenceBreakLower, "Lower" },
275 { SentenceBreakUpper, "Upper" },
276 { SentenceBreakOLetter, "OLetter" },
277 { SentenceBreakNumeric, "Numeric" },
278 { SentenceBreakATerm, "ATerm" },
279 { SentenceBreakSTerm, "STerm" },
280 { SentenceBreakClose, "Close" },
281 { SentenceBreak_Unassigned, 0 }
283 SentenceBreakList *d = breaks;
285 sentence_break_map.insert(d->name, d->brk);
291 static const char *line_break_class_string =
292 " // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
293 " // we don't use the XX, AI and CB properties and map them to AL instead.\n"
294 " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
295 " enum LineBreakClass {\n"
296 " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
297 " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
298 " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
299 " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
300 " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
301 " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
302 " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
305 enum LineBreakClass {
306 LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,
307 LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,
308 LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,
309 LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,
310 LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,
311 LineBreak_JT, LineBreak_SA, LineBreak_SG,
312 LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK
314 , LineBreak_Unassigned
317 static QHash<QByteArray, LineBreakClass> line_break_map;
319 static void initLineBreak()
321 // ### Classes XX and AI are left out and mapped to AL for now;
322 // ### Class NL is ignored and mapped to AL as well.
323 struct LineBreakList {
327 { LineBreak_BK, "BK" },
328 { LineBreak_CR, "CR" },
329 { LineBreak_LF, "LF" },
330 { LineBreak_CM, "CM" },
331 { LineBreak_AL, "NL" },
332 { LineBreak_SG, "SG" },
333 { LineBreak_WJ, "WJ" },
334 { LineBreak_ZW, "ZW" },
335 { LineBreak_GL, "GL" },
336 { LineBreak_SP, "SP" },
337 { LineBreak_B2, "B2" },
338 { LineBreak_BA, "BA" },
339 { LineBreak_BB, "BB" },
340 { LineBreak_HY, "HY" },
341 { LineBreak_AL, "CB" }, // ###
342 { LineBreak_CL, "CL" },
343 { LineBreak_EX, "EX" },
344 { LineBreak_IN, "IN" },
345 { LineBreak_NS, "NS" },
346 { LineBreak_OP, "OP" },
347 { LineBreak_QU, "QU" },
348 { LineBreak_IS, "IS" },
349 { LineBreak_NU, "NU" },
350 { LineBreak_PO, "PO" },
351 { LineBreak_PR, "PR" },
352 { LineBreak_SY, "SY" },
353 { LineBreak_AL, "AI" },
354 { LineBreak_AL, "AL" },
355 { LineBreak_H2, "H2" },
356 { LineBreak_H3, "H3" },
357 { LineBreak_ID, "ID" },
358 { LineBreak_JL, "JL" },
359 { LineBreak_JV, "JV" },
360 { LineBreak_JT, "JT" },
361 { LineBreak_SA, "SA" },
362 { LineBreak_AL, "XX" },
363 { LineBreak_Unassigned, 0 }
365 LineBreakList *d = breaks;
367 line_break_map.insert(d->name, d->brk);
373 // Keep this one in sync with the code in createPropertyInfo
374 static const char *property_string =
375 " struct Properties {\n"
376 " ushort category : 8; /* 5 needed */\n"
377 " ushort line_break_class : 8; /* 6 needed */\n"
378 " ushort direction : 8; /* 5 needed */\n"
379 " ushort combiningClass : 8;\n"
380 " ushort joining : 2;\n"
381 " signed short digitValue : 6; /* 5 needed */\n"
382 " ushort unicodeVersion : 4;\n"
383 " ushort lowerCaseSpecial : 1;\n"
384 " ushort upperCaseSpecial : 1;\n"
385 " ushort titleCaseSpecial : 1;\n"
386 " ushort caseFoldSpecial : 1;\n"
387 " signed short mirrorDiff : 16;\n"
388 " signed short lowerCaseDiff : 16;\n"
389 " signed short upperCaseDiff : 16;\n"
390 " signed short titleCaseDiff : 16;\n"
391 " signed short caseFoldDiff : 16;\n"
392 " ushort graphemeBreak : 8; /* 4 needed */\n"
393 " ushort wordBreak : 8; /* 4 needed */\n"
394 " ushort sentenceBreak : 8; /* 4 needed */\n"
396 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
397 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
399 static const char *methods =
400 " Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
401 " inline int lineBreakClass(QChar ch)\n"
402 " { return lineBreakClass(ch.unicode()); }\n"
404 " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
405 " inline int script(QChar ch)\n"
406 " { return script(ch.unicode()); }\n\n";
408 static const int SizeOfPropertiesStruct = 20;
410 struct PropertyFlags {
411 bool operator ==(const PropertyFlags &o) {
412 return (combiningClass == o.combiningClass
413 && category == o.category
414 && direction == o.direction
415 && joining == o.joining
417 && digitValue == o.digitValue
418 && line_break_class == o.line_break_class
419 && mirrorDiff == o.mirrorDiff
420 && lowerCaseDiff == o.lowerCaseDiff
421 && upperCaseDiff == o.upperCaseDiff
422 && titleCaseDiff == o.titleCaseDiff
423 && caseFoldDiff == o.caseFoldDiff
424 && lowerCaseSpecial == o.lowerCaseSpecial
425 && upperCaseSpecial == o.upperCaseSpecial
426 && titleCaseSpecial == o.titleCaseSpecial
427 && caseFoldSpecial == o.caseFoldSpecial
428 && graphemeBreak == o.graphemeBreak
429 && wordBreak == o.wordBreak
430 && sentenceBreak == o.sentenceBreak
433 // from UnicodeData.txt
434 uchar combiningClass : 8;
435 QChar::Category category : 5;
436 QChar::Direction direction : 5;
437 // from ArabicShaping.txt
438 QChar::Joining joining : 2;
439 // from DerivedAge.txt
440 QChar::UnicodeVersion age : 4;
442 LineBreakClass line_break_class;
450 bool lowerCaseSpecial;
451 bool upperCaseSpecial;
452 bool titleCaseSpecial;
453 bool caseFoldSpecial;
454 GraphemeBreak graphemeBreak;
456 SentenceBreak sentenceBreak;
460 static QList<int> specialCaseMap;
462 static int appendToSpecialCaseMap(const QList<int> &map)
465 for (int i = 0; i < map.size(); ++i) {
467 if (QChar::requiresSurrogates(val)) {
468 utf16map << QChar::highSurrogate(val);
469 utf16map << QChar::lowSurrogate(val);
474 int length = utf16map.size();
475 utf16map.prepend(length);
477 if (specialCaseMap.isEmpty())
478 specialCaseMap << 0; // placeholder
481 while (i < specialCaseMap.size()) {
482 int n = specialCaseMap.at(i);
485 for (j = 1; j <= n; ++j) {
486 if (specialCaseMap.at(i+j) != utf16map.at(j))
495 int pos = specialCaseMap.size();
496 specialCaseMap << utf16map;
501 UnicodeData(int codepoint = 0) {
502 p.category = QChar::Other_NotAssigned; // Cn
503 p.combiningClass = 0;
505 p.direction = QChar::DirL;
506 // DerivedBidiClass.txt
507 // DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
508 if ((codepoint >= 0x590 && codepoint <= 0x5ff)
509 || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
510 || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
511 || (codepoint >= 0x10800 && codepoint <= 0x10fff)) {
512 p.direction = QChar::DirR;
514 // DirAL for: U+0600..U+07BF, U+FB50..U+FDFF, U+FE70..U+FEFF
515 // minus noncharacter code points (intersects with U+FDD0..U+FDEF)
516 if ((codepoint >= 0x600 && codepoint <= 0x7bf)
517 || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
518 || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
519 || (codepoint >= 0xfe70 && codepoint <= 0xfeff)) {
520 p.direction = QChar::DirAL;
524 decompositionType = QChar::NoDecomposition;
525 p.joining = QChar::OtherJoining;
526 p.age = QChar::Unicode_Unassigned;
529 p.line_break_class = LineBreak_AL; // XX -> AL
534 p.lowerCaseSpecial = 0;
535 p.upperCaseSpecial = 0;
536 p.titleCaseSpecial = 0;
537 p.caseFoldSpecial = 0;
538 p.graphemeBreak = GraphemeBreakOther;
539 p.wordBreak = WordBreakOther;
540 p.sentenceBreak = SentenceBreakOther;
542 excludedComposition = false;
546 // from UnicodeData.txt
547 QChar::Decomposition decompositionType;
548 QList<int> decomposition;
550 QList<int> specialFolding;
552 // from BidiMirroring.txt
555 // DerivedNormalizationProps.txt
556 bool excludedComposition;
558 // computed position of unicode property set
569 UD_DecimalDigitValue,
581 static QHash<QByteArray, QChar::Category> categoryMap;
583 static void initCategoryMap()
589 { QChar::Mark_NonSpacing, "Mn" },
590 { QChar::Mark_SpacingCombining, "Mc" },
591 { QChar::Mark_Enclosing, "Me" },
593 { QChar::Number_DecimalDigit, "Nd" },
594 { QChar::Number_Letter, "Nl" },
595 { QChar::Number_Other, "No" },
597 { QChar::Separator_Space, "Zs" },
598 { QChar::Separator_Line, "Zl" },
599 { QChar::Separator_Paragraph, "Zp" },
601 { QChar::Other_Control, "Cc" },
602 { QChar::Other_Format, "Cf" },
603 { QChar::Other_Surrogate, "Cs" },
604 { QChar::Other_PrivateUse, "Co" },
605 { QChar::Other_NotAssigned, "Cn" },
607 { QChar::Letter_Uppercase, "Lu" },
608 { QChar::Letter_Lowercase, "Ll" },
609 { QChar::Letter_Titlecase, "Lt" },
610 { QChar::Letter_Modifier, "Lm" },
611 { QChar::Letter_Other, "Lo" },
613 { QChar::Punctuation_Connector, "Pc" },
614 { QChar::Punctuation_Dash, "Pd" },
615 { QChar::Punctuation_Open, "Ps" },
616 { QChar::Punctuation_Close, "Pe" },
617 { QChar::Punctuation_InitialQuote, "Pi" },
618 { QChar::Punctuation_FinalQuote, "Pf" },
619 { QChar::Punctuation_Other, "Po" },
621 { QChar::Symbol_Math, "Sm" },
622 { QChar::Symbol_Currency, "Sc" },
623 { QChar::Symbol_Modifier, "Sk" },
624 { QChar::Symbol_Other, "So" },
625 { QChar::Other_NotAssigned, 0 }
629 categoryMap.insert(c->name, c->cat);
635 static QHash<QByteArray, QChar::Direction> directionMap;
637 static void initDirectionMap()
640 QChar::Direction dir;
643 { QChar::DirL, "L" },
644 { QChar::DirR, "R" },
645 { QChar::DirEN, "EN" },
646 { QChar::DirES, "ES" },
647 { QChar::DirET, "ET" },
648 { QChar::DirAN, "AN" },
649 { QChar::DirCS, "CS" },
650 { QChar::DirB, "B" },
651 { QChar::DirS, "S" },
652 { QChar::DirWS, "WS" },
653 { QChar::DirON, "ON" },
654 { QChar::DirLRE, "LRE" },
655 { QChar::DirLRO, "LRO" },
656 { QChar::DirAL, "AL" },
657 { QChar::DirRLE, "RLE" },
658 { QChar::DirRLO, "RLO" },
659 { QChar::DirPDF, "PDF" },
660 { QChar::DirNSM, "NSM" },
661 { QChar::DirBN, "BN" },
666 directionMap.insert(d->name, d->dir);
672 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
674 static void initDecompositionMap()
677 QChar::Decomposition dec;
679 } decompositions[] = {
680 { QChar::Canonical, "<canonical>" },
681 { QChar::Font, "<font>" },
682 { QChar::NoBreak, "<noBreak>" },
683 { QChar::Initial, "<initial>" },
684 { QChar::Medial, "<medial>" },
685 { QChar::Final, "<final>" },
686 { QChar::Isolated, "<isolated>" },
687 { QChar::Circle, "<circle>" },
688 { QChar::Super, "<super>" },
689 { QChar::Sub, "<sub>" },
690 { QChar::Vertical, "<vertical>" },
691 { QChar::Wide, "<wide>" },
692 { QChar::Narrow, "<narrow>" },
693 { QChar::Small, "<small>" },
694 { QChar::Square, "<square>" },
695 { QChar::Compat, "<compat>" },
696 { QChar::Fraction, "<fraction>" },
697 { QChar::NoDecomposition, 0 }
699 Dec *d = decompositions;
701 decompositionMap.insert(d->name, d->dec);
707 static QHash<int, UnicodeData> unicodeData;
708 static QList<PropertyFlags> uniqueProperties;
711 static QHash<int, int> decompositionLength;
712 static int highestComposedCharacter = 0;
713 static int numLigatures = 0;
714 static int highestLigature = 0;
721 // we need them sorted after the first component for fast lookup
722 bool operator < (const Ligature &l1, const Ligature &l2)
723 { return l1.u1 < l2.u1; }
725 static QHash<int, QList<Ligature> > ligatureHashes;
727 static QHash<int, int> combiningClassUsage;
729 static int maxLowerCaseDiff = 0;
730 static int maxUpperCaseDiff = 0;
731 static int maxTitleCaseDiff = 0;
733 static void readUnicodeData()
735 qDebug() << "Reading UnicodeData.txt";
736 QFile f("data/UnicodeData.txt");
738 qFatal("Couldn't find UnicodeData.txt");
740 f.open(QFile::ReadOnly);
745 int len = f.readLine(line.data(), 1024);
746 line.truncate(len-1);
748 int comment = line.indexOf('#');
750 line = line.left(comment);
754 QList<QByteArray> properties = line.split(';');
756 int codepoint = properties[UD_Value].toInt(&ok, 16);
758 Q_ASSERT(codepoint <= LAST_CODEPOINT);
759 int lastCodepoint = codepoint;
761 QByteArray name = properties[UD_Name];
762 if (name.startsWith('<') && name.contains("First")) {
764 nextLine.resize(1024);
765 f.readLine(nextLine.data(), 1024);
766 QList<QByteArray> properties = nextLine.split(';');
767 Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
768 lastCodepoint = properties[UD_Value].toInt(&ok, 16);
770 Q_ASSERT(lastCodepoint <= LAST_CODEPOINT);
773 UnicodeData data(codepoint);
774 data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
775 data.p.combiningClass = properties[UD_CombiningClass].toInt();
776 if (!combiningClassUsage.contains(data.p.combiningClass))
777 combiningClassUsage[data.p.combiningClass] = 1;
779 ++combiningClassUsage[data.p.combiningClass];
781 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
783 if (!properties[UD_UpperCase].isEmpty()) {
784 int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
786 int diff = upperCase - codepoint;
787 if (qAbs(diff) >= (1<<14)) {
788 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
789 // if the condition below doesn't hold anymore we need to modify our special upper casing code in qchar.cpp
790 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(upperCase));
791 data.p.upperCaseSpecial = true;
792 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
794 data.p.upperCaseDiff = diff;
795 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
797 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(upperCase)) {
798 // if the conditions below don't hold anymore we need to modify our upper casing code
799 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
800 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
803 if (!properties[UD_LowerCase].isEmpty()) {
804 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
806 int diff = lowerCase - codepoint;
807 if (qAbs(diff) >= (1<<14)) {
808 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
809 // if the condition below doesn't hold anymore we need to modify our special lower casing code in qchar.cpp
810 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(lowerCase));
811 data.p.lowerCaseSpecial = true;
812 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
814 data.p.lowerCaseDiff = diff;
815 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
817 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(lowerCase)) {
818 // if the conditions below don't hold anymore we need to modify our lower casing code
819 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
820 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
823 // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
824 if (properties[UD_TitleCase].isEmpty())
825 properties[UD_TitleCase] = properties[UD_UpperCase];
826 if (!properties[UD_TitleCase].isEmpty()) {
827 int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
829 int diff = titleCase - codepoint;
830 if (qAbs(diff) >= (1<<14)) {
831 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
832 // if the condition below doesn't hold anymore we need to modify our special title casing code in qchar.cpp
833 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(titleCase));
834 data.p.titleCaseSpecial = true;
835 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
837 data.p.titleCaseDiff = diff;
838 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
840 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(titleCase)) {
841 // if the conditions below don't hold anymore we need to modify our title casing code
842 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
843 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
847 if (!properties[UD_DigitValue].isEmpty())
848 data.p.digitValue = properties[UD_DigitValue].toInt();
851 QByteArray decomposition = properties[UD_Decomposition];
852 if (!decomposition.isEmpty()) {
853 highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
854 QList<QByteArray> d = decomposition.split(' ');
855 if (d[0].contains('<')) {
856 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
857 if (data.decompositionType == QChar::NoDecomposition)
858 qFatal("unassigned decomposition type: %s", d[0].constData());
861 data.decompositionType = QChar::Canonical;
863 for (int i = 0; i < d.size(); ++i) {
864 data.decomposition.append(d[i].toInt(&ok, 16));
867 ++decompositionLength[data.decomposition.size()];
870 for (int i = codepoint; i <= lastCodepoint; ++i)
871 unicodeData.insert(i, data);
876 static int maxMirroredDiff = 0;
878 static void readBidiMirroring()
880 qDebug() << "Reading BidiMirroring.txt";
881 QFile f("data/BidiMirroring.txt");
883 qFatal("Couldn't find BidiMirroring.txt");
885 f.open(QFile::ReadOnly);
890 int len = f.readLine(line.data(), 1024);
893 int comment = line.indexOf('#');
895 line = line.left(comment);
899 line = line.replace(" ", "");
901 QList<QByteArray> pair = line.split(';');
902 Q_ASSERT(pair.size() == 2);
905 int codepoint = pair[0].toInt(&ok, 16);
907 int mirror = pair[1].toInt(&ok, 16);
910 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
911 d.mirroredChar = mirror;
912 d.p.mirrorDiff = d.mirroredChar - codepoint;
913 maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
914 unicodeData.insert(codepoint, d);
918 static void readArabicShaping()
920 qDebug() << "Reading ArabicShaping.txt";
921 QFile f("data/ArabicShaping.txt");
923 qFatal("Couldn't find ArabicShaping.txt");
925 f.open(QFile::ReadOnly);
930 int len = f.readLine(line.data(), 1024);
933 int comment = line.indexOf('#');
935 line = line.left(comment);
936 line = line.trimmed();
941 QList<QByteArray> l = line.split(';');
942 Q_ASSERT(l.size() == 4);
945 int codepoint = l[0].toInt(&ok, 16);
948 Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
949 if (joining == Joining_Unassigned)
950 qFatal("unassigned or unhandled joining value: %s", l[2].constData());
952 if (joining == Joining_Left) {
953 // There are currently no characters of joining type Left_Joining defined in Unicode.
954 qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
957 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
958 if (joining == Joining_Right)
959 d.p.joining = QChar::Right;
960 else if (joining == Joining_Dual)
961 d.p.joining = QChar::Dual;
962 else if (joining == Joining_Causing)
963 d.p.joining = QChar::Center;
965 d.p.joining = QChar::OtherJoining;
966 unicodeData.insert(codepoint, d);
970 static void readDerivedAge()
972 qDebug() << "Reading DerivedAge.txt";
973 QFile f("data/DerivedAge.txt");
975 qFatal("Couldn't find DerivedAge.txt");
977 f.open(QFile::ReadOnly);
982 int len = f.readLine(line.data(), 1024);
985 int comment = line.indexOf('#');
987 line = line.left(comment);
988 line.replace(" ", "");
993 QList<QByteArray> l = line.split(';');
994 Q_ASSERT(l.size() == 2);
996 QByteArray codes = l[0];
997 codes.replace("..", ".");
998 QList<QByteArray> cl = codes.split('.');
1001 int from = cl[0].toInt(&ok, 16);
1004 if (cl.size() == 2) {
1005 to = cl[1].toInt(&ok, 16);
1009 QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1010 //qDebug() << hex << from << ".." << to << ba << age;
1011 if (age == QChar::Unicode_Unassigned)
1012 qFatal("unassigned or unhandled age value: %s", l[1].constData());
1014 for (int codepoint = from; codepoint <= to; ++codepoint) {
1015 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1017 unicodeData.insert(codepoint, d);
1023 static void readDerivedNormalizationProps()
1025 qDebug() << "Reading DerivedNormalizationProps.txt";
1026 QFile f("data/DerivedNormalizationProps.txt");
1028 qFatal("Couldn't find DerivedNormalizationProps.txt");
1030 f.open(QFile::ReadOnly);
1032 while (!f.atEnd()) {
1035 int len = f.readLine(line.data(), 1024);
1038 int comment = line.indexOf('#');
1040 line = line.left(comment);
1042 if (line.trimmed().isEmpty())
1045 QList<QByteArray> l = line.split(';');
1046 Q_ASSERT(l.size() >= 2);
1048 QByteArray propName = l[1].trimmed();
1049 if (propName != "Full_Composition_Exclusion")
1053 QByteArray codes = l[0].trimmed();
1054 codes.replace("..", ".");
1055 QList<QByteArray> cl = codes.split('.');
1058 int from = cl[0].toInt(&ok, 16);
1061 if (cl.size() == 2) {
1062 to = cl[1].toInt(&ok, 16);
1066 for (int codepoint = from; codepoint <= to; ++codepoint) {
1067 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1068 d.excludedComposition = true;
1069 unicodeData.insert(codepoint, d);
1073 for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) {
1074 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1075 if (!d.excludedComposition
1076 && d.decompositionType == QChar::Canonical
1077 && d.decomposition.size() > 1) {
1078 Q_ASSERT(d.decomposition.size() == 2);
1080 int part1 = d.decomposition.at(0);
1081 int part2 = d.decomposition.at(1);
1083 // all non-starters are listed in DerivedNormalizationProps.txt
1084 // and already excluded from composition
1085 Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0);
1088 highestLigature = qMax(highestLigature, part1);
1089 Ligature l = { part1, part2, codepoint };
1090 ligatureHashes[part2].append(l);
1096 struct NormalizationCorrection {
1102 static QByteArray createNormalizationCorrections()
1104 qDebug() << "Reading NormalizationCorrections.txt";
1105 QFile f("data/NormalizationCorrections.txt");
1107 qFatal("Couldn't find NormalizationCorrections.txt");
1109 f.open(QFile::ReadOnly);
1113 out += "struct NormalizationCorrection {\n"
1115 " uint old_mapping;\n"
1119 "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1122 int numCorrections = 0;
1123 while (!f.atEnd()) {
1126 int len = f.readLine(line.data(), 1024);
1129 int comment = line.indexOf('#');
1131 line = line.left(comment);
1132 line.replace(" ", "");
1137 Q_ASSERT(!line.contains(".."));
1139 QList<QByteArray> fields = line.split(';');
1140 Q_ASSERT(fields.size() == 4);
1142 NormalizationCorrection c = { 0, 0, 0 };
1144 c.codepoint = fields.at(0).toInt(&ok, 16);
1146 c.mapped = fields.at(1).toInt(&ok, 16);
1148 if (fields.at(3) == "3.2.0")
1149 c.version = QChar::Unicode_3_2;
1150 else if (fields.at(3) == "4.0.0")
1151 c.version = QChar::Unicode_4_0;
1153 qFatal("unknown unicode version in NormalizationCorrection.txt");
1155 out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1156 + ", " + QString::number(c.version) + " },\n";
1158 maxVersion = qMax(c.version, maxVersion);
1163 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1164 "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
1170 static void computeUniqueProperties()
1172 qDebug("computeUniqueProperties:");
1173 for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) {
1174 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1176 int index = uniqueProperties.indexOf(d.p);
1178 index = uniqueProperties.size();
1179 uniqueProperties.append(d.p);
1181 d.propertyIndex = index;
1182 unicodeData.insert(uc, d);
1184 qDebug(" %d unique unicode properties found", uniqueProperties.size());
1188 static void readLineBreak()
1190 qDebug() << "Reading LineBreak.txt";
1191 QFile f("data/LineBreak.txt");
1193 qFatal("Couldn't find LineBreak.txt");
1195 f.open(QFile::ReadOnly);
1197 while (!f.atEnd()) {
1200 int len = f.readLine(line.data(), 1024);
1203 int comment = line.indexOf('#');
1205 line = line.left(comment);
1206 line.replace(" ", "");
1211 QList<QByteArray> l = line.split(';');
1212 Q_ASSERT(l.size() == 2);
1214 QByteArray codes = l[0];
1215 codes.replace("..", ".");
1216 QList<QByteArray> cl = codes.split('.');
1219 int from = cl[0].toInt(&ok, 16);
1222 if (cl.size() == 2) {
1223 to = cl[1].toInt(&ok, 16);
1227 LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1228 if (lb == LineBreak_Unassigned)
1229 qFatal("unassigned line break class: %s", l[1].constData());
1231 for (int codepoint = from; codepoint <= to; ++codepoint) {
1232 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1233 d.p.line_break_class = lb;
1234 unicodeData.insert(codepoint, d);
// Parses data/SpecialCasing.txt and records multi-code-point case mappings.
// Each line is cut at the first '#' and split on ';' into: the code point
// (hex), its lower, title and upper case mappings (space separated hex
// values) and an optional fifth field. A mapping with more than one code
// point is stored through appendToSpecialCaseMap() and flagged with the
// matching *CaseSpecial bit; the updated entry is written back to unicodeData.
1240 static void readSpecialCasing()
1242 qDebug() << "Reading SpecialCasing.txt";
1243 QFile f("data/SpecialCasing.txt");
1245 qFatal("Couldn't find SpecialCasing.txt");
// NOTE(review): the result of open() is not checked here.
1247 f.open(QFile::ReadOnly);
1249 while (!f.atEnd()) {
1252 int len = f.readLine(line.data(), 1024);
// everything from the first '#' on is a comment in the data file
1255 int comment = line.indexOf('#');
1257 line = line.left(comment);
1262 QList<QByteArray> l = line.split(';');
// optional fifth field; left empty when the line has fewer than five fields
1264 QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
// presumably lines that carry a condition are skipped - TODO confirm
1265 if (!condition.isEmpty())
1270 int codepoint = l[0].trimmed().toInt(&ok, 16);
1273 // if the condition below doesn't hold anymore we need to modify our
1274 // lower/upper/title casing code and case folding code
1275 Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1277 // qDebug() << "codepoint" << hex << codepoint;
1278 // qDebug() << line;
// field 1: lower case mapping, one hex code point per space separated token
1280 QList<QByteArray> lower = l[1].trimmed().split(' ');
1281 QList<int> lowerMap;
1282 for (int i = 0; i < lower.size(); ++i) {
1284 lowerMap.append(lower.at(i).toInt(&ok, 16));
// field 2: title case mapping
1288 QList<QByteArray> title = l[2].trimmed().split(' ');
1289 QList<int> titleMap;
1290 for (int i = 0; i < title.size(); ++i) {
1292 titleMap.append(title.at(i).toInt(&ok, 16));
// field 3: upper case mapping
1296 QList<QByteArray> upper = l[3].trimmed().split(' ');
1297 QList<int> upperMap;
1298 for (int i = 0; i < upper.size(); ++i) {
1300 upperMap.append(upper.at(i).toInt(&ok, 16));
1305 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
// a single-code-point mapping must agree with the simple case offset that is
// already stored for this code point
1307 Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1308 Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1309 Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
// for multi-code-point mappings the *CaseDiff field is reused to hold the
// value returned by appendToSpecialCaseMap() instead of an offset
1311 if (lowerMap.size() > 1) {
1312 ud.p.lowerCaseSpecial = true;
1313 ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1315 if (titleMap.size() > 1) {
1316 ud.p.titleCaseSpecial = true;
1317 ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1319 if (upperMap.size() > 1) {
1320 ud.p.upperCaseSpecial = true;
1321 ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
1324 unicodeData.insert(codepoint, ud);
// Largest absolute simple case folding offset recorded by readCaseFolding();
// createPropertyInfo() asserts that it stays below 1<<14.
1328 static int maxCaseFoldDiff = 0;
// Parses data/CaseFolding.txt and stores the case folding of each code point.
// Fields per line (split on ';'): code point (hex), status, folded code
// point(s) as space separated hex values. A single-code-point folding is
// stored as the offset caseFolded - codepoint, or through
// appendToSpecialCaseMap() when the offset does not fit; the entry is written
// back to unicodeData.
1330 static void readCaseFolding()
1332 qDebug() << "Reading CaseFolding.txt";
1333 QFile f("data/CaseFolding.txt");
1335 qFatal("Couldn't find CaseFolding.txt");
// NOTE(review): the result of open() is not checked here.
1337 f.open(QFile::ReadOnly);
1339 while (!f.atEnd()) {
1342 int len = f.readLine(line.data(), 1024);
// everything from the first '#' on is a comment in the data file
1345 int comment = line.indexOf('#');
1347 line = line.left(comment);
1352 QList<QByteArray> l = line.split(';');
1355 int codepoint = l[0].trimmed().toInt(&ok, 16);
1359 l[1] = l[1].trimmed();
// presumably entries with status "F" or "T" are skipped - TODO confirm
1360 if (l[1] == "F" || l[1] == "T")
1363 // qDebug() << "codepoint" << hex << codepoint;
1364 // qDebug() << line;
1365 QList<QByteArray> fold = l[2].trimmed().split(' ');
1367 for (int i = 0; i < fold.size(); ++i) {
1369 foldMap.append(fold.at(i).toInt(&ok, 16));
1373 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1374 if (foldMap.size() == 1) {
1375 int caseFolded = foldMap.at(0);
1376 int diff = caseFolded - codepoint;
// offsets of 1<<14 or more are routed through the special case map
1377 if (qAbs(diff) >= (1<<14)) {
1378 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1379 // if the condition below doesn't hold anymore we need to modify our special case folding code in qchar.cpp
1380 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(caseFolded));
1381 ud.p.caseFoldSpecial = true;
1382 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
// otherwise (presumably the else branch of the range check above) the offset
// is stored directly and tracked for the size assertion in createPropertyInfo()
1384 ud.p.caseFoldDiff = diff;
1385 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
// for code points outside the BMP the folding must keep the high surrogate
// and shift only the low surrogate by diff
1387 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(caseFolded)) {
1388 // if the conditions below doesn't hold anymore we need to modify our case folding code
1389 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1390 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1392 // if (caseFolded != codepoint + ud.p.lowerCaseDiff)
1393 // qDebug() << hex << codepoint;
// presumably reached when the folding has more than one code point; qFatal()
// is issued before the assignments that follow - TODO confirm they are
// intentionally kept
1395 qFatal("we currently don't support full case foldings");
1396 // qDebug() << "special" << hex << foldMap;
1397 ud.p.caseFoldSpecial = true;
1398 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1400 unicodeData.insert(codepoint, ud);
// Parses data/GraphemeBreakProperty.txt. After comments and spaces are
// removed each line has the form "<code>[..<code>];<class>"; the class name
// is looked up in grapheme_break_map and stored in the graphemeBreak property
// of every code point from 'from' to 'to' inclusive. An unknown class name
// aborts through qFatal().
1404 static void readGraphemeBreak()
1406 qDebug() << "Reading GraphemeBreakProperty.txt";
1407 QFile f("data/GraphemeBreakProperty.txt");
1409 qFatal("Couldn't find GraphemeBreakProperty.txt");
// NOTE(review): the result of open() is not checked here.
1411 f.open(QFile::ReadOnly);
1413 while (!f.atEnd()) {
1416 int len = f.readLine(line.data(), 1024);
// strip the trailing comment and all spaces before splitting into fields
1419 int comment = line.indexOf('#');
1421 line = line.left(comment);
1422 line.replace(" ", "");
1427 QList<QByteArray> l = line.split(';');
1428 Q_ASSERT(l.size() == 2);
// turn the ".." range separator into a single '.' so that a range splits
// into exactly two entries
1430 QByteArray codes = l[0];
1431 codes.replace("..", ".");
1432 QList<QByteArray> cl = codes.split('.');
1435 int from = cl[0].toInt(&ok, 16);
// a second entry is the upper bound of the range
1438 if (cl.size() == 2) {
1439 to = cl[1].toInt(&ok, 16);
1443 GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1444 if (brk == GraphemeBreak_Unassigned)
1445 qFatal("unassigned grapheme break class: %s", l[1].constData());
1447 for (int codepoint = from; codepoint <= to; ++codepoint) {
1448 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1449 ud.p.graphemeBreak = brk;
1450 unicodeData.insert(codepoint, ud);
// Parses data/WordBreakProperty.txt. After comments and spaces are removed
// each line has the form "<code>[..<code>];<class>"; the class name is looked
// up in word_break_map and stored in the wordBreak property of every code
// point from 'from' to 'to' inclusive. An unknown class name aborts through
// qFatal().
1455 static void readWordBreak()
1457 qDebug() << "Reading WordBreakProperty.txt";
1458 QFile f("data/WordBreakProperty.txt");
1460 qFatal("Couldn't find WordBreakProperty.txt");
// NOTE(review): the result of open() is not checked here.
1462 f.open(QFile::ReadOnly);
1464 while (!f.atEnd()) {
1467 int len = f.readLine(line.data(), 1024);
// strip the trailing comment and all spaces before splitting into fields
1470 int comment = line.indexOf('#');
1472 line = line.left(comment);
1473 line.replace(" ", "");
1478 QList<QByteArray> l = line.split(';');
1479 Q_ASSERT(l.size() == 2);
// turn the ".." range separator into a single '.' so that a range splits
// into exactly two entries
1481 QByteArray codes = l[0];
1482 codes.replace("..", ".");
1483 QList<QByteArray> cl = codes.split('.');
1486 int from = cl[0].toInt(&ok, 16);
// a second entry is the upper bound of the range
1489 if (cl.size() == 2) {
1490 to = cl[1].toInt(&ok, 16);
1494 WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned);
1495 if (brk == WordBreak_Unassigned)
1496 qFatal("unassigned word break class: %s", l[1].constData());
1498 for (int codepoint = from; codepoint <= to; ++codepoint) {
1499 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1500 ud.p.wordBreak = brk;
1501 unicodeData.insert(codepoint, ud);
// Parses data/SentenceBreakProperty.txt. After comments and spaces are
// removed each line has the form "<code>[..<code>];<class>"; the class name
// is looked up in sentence_break_map and stored in the sentenceBreak property
// of every code point from 'from' to 'to' inclusive. An unknown class name
// aborts through qFatal().
1506 static void readSentenceBreak()
1508 qDebug() << "Reading SentenceBreakProperty.txt";
1509 QFile f("data/SentenceBreakProperty.txt");
1511 qFatal("Couldn't find SentenceBreakProperty.txt");
// NOTE(review): the result of open() is not checked here.
1513 f.open(QFile::ReadOnly);
1515 while (!f.atEnd()) {
1518 int len = f.readLine(line.data(), 1024);
// strip the trailing comment and all spaces before splitting into fields
1521 int comment = line.indexOf('#');
1523 line = line.left(comment);
1524 line.replace(" ", "");
1529 QList<QByteArray> l = line.split(';');
1530 Q_ASSERT(l.size() == 2);
// turn the ".." range separator into a single '.' so that a range splits
// into exactly two entries
1532 QByteArray codes = l[0];
1533 codes.replace("..", ".");
1534 QList<QByteArray> cl = codes.split('.');
1537 int from = cl[0].toInt(&ok, 16);
// a second entry is the upper bound of the range
1540 if (cl.size() == 2) {
1541 to = cl[1].toInt(&ok, 16);
1545 SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
1546 if (brk == SentenceBreak_Unassigned)
1547 qFatal("unassigned sentence break class: %s", l[1].constData());
1549 for (int codepoint = from; codepoint <= to; ++codepoint) {
1550 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1551 ud.p.sentenceBreak = brk;
1552 unicodeData.insert(codepoint, ud);
1558 // this piece of code does full case folding and comparison. We currently
1559 // don't use it, since this gives lots of issues with things as case insensitive
1560 // search and replace.
//
// foldCase() writes the case folded form of ch to out. Without
// caseFoldSpecial that is the single unit ch + caseFoldDiff; otherwise
// caseFoldDiff is used as an offset into specialCaseMap, where the first
// element of the entry is its length.
1561 static inline void foldCase(uint ch, ushort *out)
1563 const QUnicodeTables::Properties *p = qGetProp(ch);
1564 if (!p->caseFoldSpecial) {
1565 *(out++) = ch + p->caseFoldDiff;
// special folding: 'folded' points at the length element, which is consumed
// here so that 'folded' then points at the first folded unit
1567 const ushort *folded = specialCaseMap + p->caseFoldDiff;
1568 ushort length = *folded++;
// Compares the UTF-16 ranges [a, ae) and [b, be) using the case folding
// properties of each unit and returns a difference value. As long as neither
// unit is flagged caseFoldSpecial the units are compared after adding their
// caseFoldDiff; the buffers below are filled through foldCase() for units
// that need special folding.
// NOTE(review): the transition between the two phases and the handling of
// 'diff' should be confirmed against the complete function body.
1575 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
1584 while (a != ae && b != be) {
1585 const QUnicodeTables::Properties *pa = qGetProp(*a);
1586 const QUnicodeTables::Properties *pb = qGetProp(*b);
// bitwise '|' on the two flags: true when either unit needs special folding
1587 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1589 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
// scratch buffers for one folded character each; both start out empty
// (first element 0)
1603 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1604 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1605 abuf[0] = bbuf[0] = 0;
1611 if (!*bp && b == be)
// refill a buffer from the next input unit, advancing the input pointer
1615 foldCase(*(a++), abuf);
1621 foldCase(*(b++), bbuf);
1625 return (int)*ap - (int)*bp;
// Overload that compares the UTF-16 range [a, ae) against the 8-bit string b;
// the loop stops at a zero byte in b, so b is treated as zero terminated.
// Each byte of b is widened to ushort for the property lookup. As long as
// neither unit is flagged caseFoldSpecial the units are compared after adding
// their caseFoldDiff; the buffers below are filled through foldCase() for
// units that need special folding.
// NOTE(review): the transition between the two phases and the handling of
// 'diff' should be confirmed against the complete function body.
1632 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
1639 while (a != ae && *b) {
1640 const QUnicodeTables::Properties *pa = qGetProp(*a);
1641 const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
// bitwise '|' on the two flags: true when either unit needs special folding
1642 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1644 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
// scratch buffers for one folded character each; both start out empty
// (first element 0)
1658 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1659 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1660 abuf[0] = bbuf[0] = 0;
// refill a buffer from the next input unit, advancing the input pointer
1670 foldCase(*(a++), abuf);
1676 foldCase(*(b++), bbuf);
1680 return (int)*ap - (int)*bp;
// Distinct block names in order of first appearance in Blocks.txt; the
// position of a name in this list is the block index used by readBlocks().
1688 static QList<QByteArray> blockNames;
// One entry per range parsed by readBlocks(), initialized as
// { blockIndex, first, last }.
1695 static QList<BlockInfo> blockInfoList;
// Parses data/Blocks.txt. After comments and spaces are removed each line has
// the form "<first>[..<last>];<name>". The name is registered in blockNames
// (if not already present) and a BlockInfo { blockIndex, first, last } is
// appended to blockInfoList.
1697 static void readBlocks()
1699 qDebug() << "Reading Blocks.txt";
1700 QFile f("data/Blocks.txt");
1702 qFatal("Couldn't find Blocks.txt");
// NOTE(review): the result of open() is not checked here.
1704 f.open(QFile::ReadOnly);
1706 while (!f.atEnd()) {
1707 QByteArray line = f.readLine();
// drops the last byte of the line, i.e. the line terminator.
// NOTE(review): this is unconditional, so a final line without a trailing
// newline would lose a real character - confirm the data file ends with one.
1708 line.resize(line.size() - 1);
1710 int comment = line.indexOf("#");
1712 line = line.left(comment);
// spaces are removed everywhere, including inside the block name
1714 line.replace(" ", "");
1719 int semicolon = line.indexOf(';');
1720 Q_ASSERT(semicolon >= 0);
1721 QByteArray codePoints = line.left(semicolon);
1722 QByteArray blockName = line.mid(semicolon + 1);
// a name seen for the first time gets the next free index
1724 int blockIndex = blockNames.indexOf(blockName);
1725 if (blockIndex == -1) {
1726 blockIndex = blockNames.size();
1727 blockNames.append(blockName);
// turn the ".." range separator into a single '.' so that a range splits
// into exactly two entries
1730 codePoints.replace("..", ".");
1731 QList<QByteArray> cl = codePoints.split('.');
1734 int first = cl[0].toInt(&ok, 16);
// a second entry is the upper bound of the range
1737 if (cl.size() == 2) {
1738 last = cl[1].toInt(&ok, 16);
1742 BlockInfo blockInfo = { blockIndex, first, last };
1743 blockInfoList.append(blockInfo);
// Script names in order of first appearance; readScripts() puts "Common" at
// index 0.
1748 static QList<QByteArray> scriptNames;
// Maps a code point to the index of its script in scriptNames (filled by
// readScripts()).
1749 static QHash<int, int> scriptAssignment;
// Keyed by index into scriptNames; createScriptEnumDeclaration() stores 0 for
// scripts that are emitted as an alias for 'Common'.
1750 static QHash<int, int> scriptHash;
// Per code point script indices of one block; createScriptTableDeclaration()
// sizes it to the block size.
1754 QVector<int> vector;
// Blocks that createScriptTableDeclaration() writes out code point by code
// point after the per-block part of the table.
1757 static QList<ExtraBlock> extraBlockList;
// Reads the script data files listed in 'files', in order. After comments,
// spaces and underscores are removed each line has the form
// "<first>[..<last>];<script>". The script name is registered in scriptNames
// (if not already present) and every code point of the range is assigned the
// script's index in scriptAssignment. Because the assignment overwrites any
// existing value, a later file takes precedence over an earlier one.
1760 static void readScripts()
// index 0 is always "Common"
1762 scriptNames.append("Common");
1764 static const char *files[] = {
1765 "data/ScriptsInitial.txt",
1767 "data/ScriptsCorrections.txt"
1769 enum { fileCount = sizeof(files) / sizeof(const char *) };
1771 for (int i = 0; i < fileCount; ++i) {
1774 qFatal("Couldn't find %s", files[i]);
// NOTE(review): the result of open() is not checked here.
1776 f.open(QFile::ReadOnly);
1778 while (!f.atEnd()) {
1779 QByteArray line = f.readLine();
// drops the last byte of the line, i.e. the line terminator.
// NOTE(review): this is unconditional, so a final line without a trailing
// newline would lose a real character - confirm the data files end with one.
1780 line.resize(line.size() - 1);
1782 int comment = line.indexOf("#");
1784 line = line.left(comment);
// spaces and underscores are removed everywhere, including inside the script
// name, so the name can be emitted as an enum identifier
1786 line.replace(" ", "");
1787 line.replace("_", "");
1792 int semicolon = line.indexOf(';');
1793 Q_ASSERT(semicolon >= 0);
1794 QByteArray codePoints = line.left(semicolon);
1795 QByteArray scriptName = line.mid(semicolon + 1);
// a name seen for the first time gets the next free index
1797 int scriptIndex = scriptNames.indexOf(scriptName);
1798 if (scriptIndex == -1) {
1799 scriptIndex = scriptNames.size();
1800 scriptNames.append(scriptName);
// turn the ".." range separator into a single '.' so that a range splits
// into exactly two entries
1803 codePoints.replace("..", ".");
1804 QList<QByteArray> cl = codePoints.split('.');
1807 int first = cl[0].toInt(&ok, 16);
// a second entry is the upper bound of the range
1810 if (cl.size() == 2) {
1811 last = cl[1].toInt(&ok, 16);
// NOTE(review): this 'i' shadows the file loop counter 'i' above; legal, but
// easy to misread.
1815 for (int i = first; i <= last; ++i)
1816 scriptAssignment[i] = scriptIndex;
// Computed by createScriptEnumDeclaration() as a multiple of 32 and emitted
// as ScriptSentinel; createScriptTableDeclaration() asserts it is > 0 and uses
// it as the base value for blocks that are stored code point by code point.
1822 static int scriptSentinel = 0;
// Builds the text of the generated 'enum Script' declaration and of the
// ScriptSentinel enum, and sets the file-level scriptSentinel as a side
// effect. Scripts listed in specialScripts are emitted as distinct
// enumerators first, followed by Inherited and ScriptCount; every other
// script is emitted afterwards as an alias for Common.
1824 QByteArray createScriptEnumDeclaration()
1826 static const char *specialScripts[] = {
1857 const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
1859 // generate script enum
1860 QByteArray declaration;
1862 declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n";
1863 declaration += " enum Script {\n Common";
1865 int uniqueScripts = 1; // Common
1867 // output the ones with special processing first
// index 0 ("Common") has already been written above, so start at 1
1868 for (int i = 1; i < scriptNames.size(); ++i) {
1869 QByteArray scriptName = scriptNames.at(i);
1870 // does the script require special processing?
1871 bool special = false;
1872 for (int s = 0; s < specialScriptsCount; ++s) {
1873 if (scriptName == specialScripts[s]) {
1879 scriptHash[i] = 0; // alias for 'Common'
// "Inherited" is not written here; it is appended once after this loop
1886 if (scriptName != "Inherited") {
1887 declaration += ",\n ";
1888 declaration += scriptName;
1891 declaration += ",\n Inherited";
1892 declaration += ",\n ScriptCount = Inherited";
1894 // output the ones that are an alias for 'Common'
1895 for (int i = 1; i < scriptNames.size(); ++i) {
1896 if (scriptHash.value(i) != 0)
1898 declaration += ",\n ";
1899 declaration += scriptNames.at(i);
1900 declaration += " = Common";
1903 declaration += "\n };\n";
// NOTE(review): (n + 16) / 32 * 32 rounds to the nearest multiple of 32, so
// the result can be smaller than uniqueScripts (e.g. 40 gives 32) and is 0
// for fewer than 16 scripts - confirm the sentinel always stays above every
// script value.
1905 scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
1906 declaration += " enum { ScriptSentinel = ";
1907 declaration += QByteArray::number(scriptSentinel);
1908 declaration += " };\n\n";
// Builds the text of the generated uc_scripts table and of the script()
// lookup function. The table starts with one entry per block of 128 code
// points (512 blocks): either a script name, or a number starting at
// scriptSentinel that identifies a block stored code point by code point
// further down in the same table. Requires createScriptEnumDeclaration() to
// have run first (scriptSentinel > 0). Appends to extraBlockList.
1912 QByteArray createScriptTableDeclaration()
1914 Q_ASSERT(scriptSentinel > 0);
1916 QByteArray declaration;
1918 const int unicodeBlockCount = 512; // number of unicode blocks
1919 const int unicodeBlockSize = 128; // size of each block
1920 declaration = "enum { UnicodeBlockCount = ";
1921 declaration += QByteArray::number(unicodeBlockCount);
1922 declaration += " }; // number of unicode blocks\n";
1923 declaration += "enum { UnicodeBlockSize = ";
1924 declaration += QByteArray::number(unicodeBlockSize);
1925 declaration += " }; // size of each block\n\n";
1928 declaration += "static const unsigned char uc_scripts[] = {\n";
1929 for (int i = 0; i < unicodeBlockCount; ++i) {
// first code point of block i; for i in [0, 512) this equals i << 7
1930 int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
// collect the script index of every code point of the block (0 = Common
// when the code point has no assignment)
1931 int blockAssignment[unicodeBlockSize];
1932 for (int x = 0; x < unicodeBlockSize; ++x) {
1933 int codePoint = (i << 7) | x;
1934 blockAssignment[x] = scriptAssignment.value(codePoint, 0);
// compare the scriptHash value of each code point with that of the first one
1936 bool allTheSame = true;
1937 const int originalScript = blockAssignment[0];
1938 const int script = scriptHash.value(originalScript);
1939 for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
1940 const int s = scriptHash.value(blockAssignment[x]);
// uniform block: the table entry is the script name itself
1947 declaration += scriptNames.value(originalScript);
1948 declaration += ", /* U+";
1949 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1951 declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1952 declaration += " */\n";
// mixed block: the table entry is scriptSentinel plus the position of the
// block in extraBlockList; 'offset' is where its per-code-point data starts
// in uc_scripts.
// NOTE(review): 'value' goes into an unsigned char table; nothing here checks
// that it stays below 256.
1954 const int value = extraBlockList.size() + scriptSentinel;
1955 const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1958 declaration += QByteArray::number(value);
1959 declaration += ", /* U+";
1960 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1962 declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1963 declaration += " at offset ";
1964 declaration += QByteArray::number(offset);
1965 declaration += " */\n";
// keep the raw per-code-point assignments so they can be written out below
1967 ExtraBlock extraBlock;
1968 extraBlock.block = block;
1969 extraBlock.vector.resize(unicodeBlockSize);
1970 for (int x = 0; x < unicodeBlockSize; ++x)
1971 extraBlock.vector[x] = blockAssignment[x];
1973 extraBlockList.append(extraBlock);
// second part of the table: the per-code-point data of every mixed block
1977 for (int i = 0; i < extraBlockList.size(); ++i) {
1978 const int value = i + scriptSentinel;
1979 const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1980 const ExtraBlock &extraBlock = extraBlockList.at(i);
1981 const int block = extraBlock.block;
1983 declaration += "\n\n /* U+";
1984 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1986 declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1987 declaration += " at offset ";
1988 declaration += QByteArray::number(offset);
1989 declaration += " */\n ";
1991 for (int x = 0; x < extraBlock.vector.size(); ++x) {
1992 const int o = extraBlock.vector.at(x);
1994 declaration += scriptNames.value(o);
// true for every entry except the very last one of the last block
1995 if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
// start a new output line after every eighth entry
1997 if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
1998 declaration += "\n ";
2002 if (declaration.endsWith(' '))
2003 declaration.chop(1);
2005 declaration += "\n};\n\n";
// text of the generated lookup function: code points above 0xffff are handled
// separately, everything else is resolved through uc_scripts
2008 "Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4)\n"
2010 " if (ucs4 > 0xffff)\n"
2012 " int script = uc_scripts[ucs4 >> 7];\n"
2013 " if (script < ScriptSentinel)\n"
2015 " script = (((script - ScriptSentinel) * UnicodeBlockSize) + UnicodeBlockCount);\n"
2016 " script = uc_scripts[script + (ucs4 & 0x7f)];\n"
2020 qDebug("createScriptTableDeclaration:");
2021 qDebug(" memory usage: %d bytes", unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));
// Debug helper: prints the collected data of every code point in [from, to]
// (both inclusive) through qDebug(), plus a second line with the
// decomposition type, length and first element when the code point has a
// decomposition.
2027 static void dump(int from, int to)
2029 for (int i = from; i <= to; ++i) {
2030 UnicodeData d = unicodeData.value(i, UnicodeData(i));
2031 qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
2032 i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
2033 if (d.decompositionType != QChar::NoDecomposition) {
2034 qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
2035 d.decomposition[0]);
// One block of the property trie built by createPropertyInfo(): the property
// indices of the block's code points plus the block's 'index', which starts
// out as -1.
2042 struct PropertyBlock {
2043 PropertyBlock() { index = -1; }
// propertyIndex of each code point of the block, in code point order
2045 QList<int> properties;
// Equality looks at the contents only and ignores 'index', which lets
// createPropertyInfo() find an already stored block with the same contents.
// NOTE(review): the operator is not const-qualified.
2046 bool operator==(const PropertyBlock &other)
2047 { return properties == other.properties; }
// Builds the generated source text for the property lookup: the two-stage
// table uc_property_trie, the GET_PROP_INDEX / GET_PROP_INDEX_UCS2 macros,
// the uc_properties array (one row per entry of uniqueProperties) and the
// qGetProp() / properties() / lineBreakClass() accessors.
//
// Code points below BMP_END are grouped into blocks of BMP_BLOCKSIZE, the
// remaining ones up to SMP_END into blocks of SMP_BLOCKSIZE. Blocks with
// identical contents are stored once; blockMap records, for each block
// position, the 'index' of the stored block.
2050 static QByteArray createPropertyInfo()
2052 qDebug("createPropertyInfo:");
2054 // we reserve one bit more than in the assert below for the sign
2055 Q_ASSERT(maxMirroredDiff < (1<<12));
2056 Q_ASSERT(maxLowerCaseDiff < (1<<14));
2057 Q_ASSERT(maxUpperCaseDiff < (1<<14));
2058 Q_ASSERT(maxTitleCaseDiff < (1<<14));
2059 Q_ASSERT(maxCaseFoldDiff < (1<<14));
// block sizes and the matching shifts (32 == 1 << 5, 256 == 1 << 8)
2061 const int BMP_BLOCKSIZE = 32;
2062 const int BMP_SHIFT = 5;
2063 const int BMP_END = 0x11000;
2064 const int SMP_END = 0x110000;
2065 const int SMP_BLOCKSIZE = 256;
2066 const int SMP_SHIFT = 8;
2068 QList<PropertyBlock> blocks;
2069 QList<int> blockMap;
// first stage: blocks of BMP_BLOCKSIZE code points below BMP_END
2073 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2075 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2076 int uc = block*BMP_BLOCKSIZE + i;
2077 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2078 b.properties.append(d.propertyIndex);
// reuse a stored block with the same contents; a block seen for the first
// time takes the next position and advances 'used' by the block size
2080 int index = blocks.indexOf(b);
2082 index = blocks.size();
2084 used += BMP_BLOCKSIZE;
2087 blockMap.append(blocks.at(index).index);
// number of unique blocks produced by the first stage
2090 int bmp_blocks = blocks.size();
2091 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
// second stage: blocks of SMP_BLOCKSIZE code points from BMP_END to SMP_END
2093 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2095 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2096 int uc = block*SMP_BLOCKSIZE + i;
2097 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2098 b.properties.append(d.propertyIndex);
2100 int index = blocks.indexOf(b);
2102 index = blocks.size();
2104 used += SMP_BLOCKSIZE;
2107 blockMap.append(blocks.at(index).index);
// size statistics; the factor 2 is the size of one unsigned short table entry
2110 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2111 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2112 int bmp_mem = bmp_block_data + bmp_trie;
// NOTE(review): this prints blocks.size(), which at this point also counts
// the blocks added by the second stage; bmp_blocks looks like the intended
// value - confirm.
2113 qDebug(" %d unique blocks in BMP.", blocks.size());
2114 qDebug(" block data uses: %d bytes", bmp_block_data);
2115 qDebug(" trie data uses : %d bytes", bmp_trie);
2117 int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2118 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2119 int smp_mem = smp_block_data + smp_trie;
2120 qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2121 qDebug(" block data uses: %d bytes", smp_block_data);
2122 qDebug(" trie data uses : %d bytes", smp_trie);
2124 qDebug("\n properties uses : %d bytes", uniqueProperties.size() * SizeOfPropertiesStruct);
2125 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + uniqueProperties.size() * SizeOfPropertiesStruct);
2128 out += "static const unsigned short uc_property_trie[] = {\n";
2130 // first write the map
// The map and the block data share one array, with the map at the front;
// that is why blockMap.size() is added to every block offset written below.
2131 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2132 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2134 if (out.endsWith(' '))
2136 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2140 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2143 if (out.endsWith(' '))
2145 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2146 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2148 if (out.endsWith(' '))
2150 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2154 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2157 if (out.endsWith(' '))
// then write the contents of every unique block
2161 for (int i = 0; i < blocks.size(); ++i) {
2162 if (out.endsWith(' '))
2165 const PropertyBlock &b = blocks.at(i);
2166 for (int j = 0; j < b.properties.size(); ++j) {
2168 if (out.endsWith(' '))
2172 out += QByteArray::number(b.properties.at(j));
2176 if (out.endsWith(' '))
// text of the generated lookup macros: the first lookup selects the block,
// the second one the entry inside the block
2180 "#define GET_PROP_INDEX(ucs4) \\\n"
2181 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2182 " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2183 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2184 " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2185 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2186 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
2187 "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
2188 " (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
2189 "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
2191 "static const Properties uc_properties[] = {";
2193 // keep in sync with the property declaration
// one initializer row per unique property set; the quoted comments name the
// struct member each value is written for
2194 for (int i = 0; i < uniqueProperties.size(); ++i) {
2195 PropertyFlags p = uniqueProperties.at(i);
2197 // " ushort category : 8;\n"
2198 out += QByteArray::number( p.category );
2200 // " ushort line_break_class : 8;\n"
2201 out += QByteArray::number( p.line_break_class );
2203 // " ushort direction : 8;\n"
2204 out += QByteArray::number( p.direction );
2206 // " ushort combiningClass :8;\n"
2207 out += QByteArray::number( p.combiningClass );
2209 // " ushort joining : 2;\n"
2210 out += QByteArray::number( p.joining );
2212 // " signed short digitValue : 6;\n /* 5 needed */"
2213 out += QByteArray::number( p.digitValue );
2215 // " ushort unicodeVersion : 4;\n"
2216 out += QByteArray::number( p.age );
2218 // " ushort lowerCaseSpecial : 1;\n"
2219 // " ushort upperCaseSpecial : 1;\n"
2220 // " ushort titleCaseSpecial : 1;\n"
2221 // " ushort caseFoldSpecial : 1;\n"
2222 out += QByteArray::number( p.lowerCaseSpecial );
2224 out += QByteArray::number( p.upperCaseSpecial );
2226 out += QByteArray::number( p.titleCaseSpecial );
2228 out += QByteArray::number( p.caseFoldSpecial );
2230 // " signed short mirrorDiff : 16;\n"
2231 // " signed short lowerCaseDiff : 16;\n"
2232 // " signed short upperCaseDiff : 16;\n"
2233 // " signed short titleCaseDiff : 16;\n"
2234 // " signed short caseFoldDiff : 16;\n"
2235 out += QByteArray::number( p.mirrorDiff );
2237 out += QByteArray::number( p.lowerCaseDiff );
2239 out += QByteArray::number( p.upperCaseDiff );
2241 out += QByteArray::number( p.titleCaseDiff );
2243 out += QByteArray::number( p.caseFoldDiff );
2245 out += QByteArray::number( p.graphemeBreak );
2247 out += QByteArray::number( p.wordBreak );
2249 out += QByteArray::number( p.sentenceBreak );
// text of the generated accessor functions
2256 out += "static inline const Properties *qGetProp(uint ucs4)\n"
2258 " const int index = GET_PROP_INDEX(ucs4);\n"
2259 " return uc_properties + index;\n"
2262 "static inline const Properties *qGetProp(ushort ucs2)\n"
2264 " const int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2265 " return uc_properties + index;\n"
2268 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4)\n"
2270 " return qGetProp(ucs4);\n"
2273 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2)\n"
2275 " return qGetProp(ucs2);\n"
2278 out += "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4)\n"
2280 " return (LineBreakClass)qGetProp(ucs4)->line_break_class;\n"
// Builds the text of the generated specialCaseMap array from the
// specialCaseMap list collected while reading the data files. The generated
// array starts with a 0x0 placeholder; the list is then written record by
// record as hex values.
2286 static QByteArray createSpecialCaseMap()
2288 qDebug("createSpecialCaseMap:");
2291 out += "static const ushort specialCaseMap[] = {\n"
2292 " 0x0, // placeholder";
2294 while (i < specialCaseMap.size()) {
// the element at i is the record length n; the loop below writes it together
// with the n elements that follow it (n + 1 values in total)
2296 int n = specialCaseMap.at(i);
2297 for (int j = 0; j <= n; ++j) {
2298 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
// 2 bytes per entry (the generated array is of type ushort)
2306 qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2);
// One block of a two-stage lookup table: for each code point of the block a
// position into the associated data table, plus the block's 'index', which
// starts out as -1. Used by createCompositionInfo() and createLigatureInfo().
2312 struct DecompositionBlock {
2313 DecompositionBlock() { index = -1; }
// data table position per code point; 0xffff is stored for code points that
// have no entry
2315 QList<int> decompositionPositions;
// Equality looks at the positions only and ignores 'index'.
// NOTE(review): the operator is not const-qualified.
2316 bool operator ==(const DecompositionBlock &other)
2317 { return decompositionPositions == other.decompositionPositions; }
// Builds the generated source text for decomposition lookup: the two-stage
// table uc_decomposition_trie, the GET_DECOMPOSITION_INDEX macro and the
// uc_decomposition_map data table.
//
// Code points below BMP_END are grouped into blocks of BMP_BLOCKSIZE, the
// remaining ones up to SMP_END into blocks of SMP_BLOCKSIZE. For every code
// point with a decomposition a record is appended to 'decompositions': a
// header word followed by the decomposition as UTF-16 units. The block stores
// the position of that record, or 0xffff when there is no decomposition.
2320 static QByteArray createCompositionInfo()
2322 qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);
// block sizes and the matching shifts (16 == 1 << 4, 256 == 1 << 8)
2324 const int BMP_BLOCKSIZE = 16;
2325 const int BMP_SHIFT = 4;
2326 const int BMP_END = 0x3400; // start of Han
2327 const int SMP_END = 0x30000;
2328 const int SMP_BLOCKSIZE = 256;
2329 const int SMP_SHIFT = 8;
// the table must reach past the highest code point recorded in
// highestComposedCharacter
2331 if (SMP_END <= highestComposedCharacter)
2332 qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);
2334 QList<DecompositionBlock> blocks;
2335 QList<int> blockMap;
2336 QList<unsigned short> decompositions;
// first stage: blocks of BMP_BLOCKSIZE code points below BMP_END
2341 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2342 DecompositionBlock b;
2343 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2344 int uc = block*BMP_BLOCKSIZE + i;
2345 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2346 if (!d.decomposition.isEmpty()) {
2347 int utf16Length = 0;
// reserve the header word; it is filled in once the length is known
2348 decompositions.append(0);
2349 for (int j = 0; j < d.decomposition.size(); ++j) {
2350 int code = d.decomposition.at(j);
2351 if (QChar::requiresSurrogates(code)) {
2352 // save as surrogate pair
2353 decompositions.append(QChar::highSurrogate(code));
2354 decompositions.append(QChar::lowSurrogate(code));
2357 decompositions.append(code);
// header word: the decomposition type plus the UTF-16 length shifted left by 8
2361 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2362 b.decompositionPositions.append(tableIndex);
// advance past the header word and the units just written
2363 tableIndex += utf16Length + 1;
// 0xffff marks a code point without decomposition
2365 b.decompositionPositions.append(0xffff);
// reuse a stored block with the same contents; a block seen for the first
// time takes the next position and advances 'used' by the block size
2368 int index = blocks.indexOf(b);
2370 index = blocks.size();
2372 used += BMP_BLOCKSIZE;
2375 blockMap.append(blocks.at(index).index);
// number of unique blocks produced by the first stage
2378 int bmp_blocks = blocks.size();
2379 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
// second stage: blocks of SMP_BLOCKSIZE code points from BMP_END to SMP_END
2381 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2382 DecompositionBlock b;
2383 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2384 int uc = block*SMP_BLOCKSIZE + i;
2385 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2386 if (!d.decomposition.isEmpty()) {
2387 int utf16Length = 0;
// reserve the header word; it is filled in once the length is known
2388 decompositions.append(0);
2389 for (int j = 0; j < d.decomposition.size(); ++j) {
2390 int code = d.decomposition.at(j);
2391 if (QChar::requiresSurrogates(code)) {
2392 // save as surrogate pair
2393 decompositions.append(QChar::highSurrogate(code));
2394 decompositions.append(QChar::lowSurrogate(code));
2397 decompositions.append(code);
2401 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2402 b.decompositionPositions.append(tableIndex);
2403 tableIndex += utf16Length + 1;
2405 b.decompositionPositions.append(0xffff);
2408 int index = blocks.indexOf(b);
2410 index = blocks.size();
2412 used += SMP_BLOCKSIZE;
2415 blockMap.append(blocks.at(index).index);
2418 // if the condition below doesn't hold anymore we need to modify our decomposition code
// (0xffff is the value stored for "no decomposition", so real positions must
// stay below it)
2419 Q_ASSERT(tableIndex < 0xffff);
// size statistics; the factor 2 is the size of one unsigned short table entry
2421 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2422 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2423 int bmp_mem = bmp_block_data + bmp_trie;
// NOTE(review): this prints blocks.size(), which at this point also counts
// the blocks added by the second stage; bmp_blocks looks like the intended
// value - confirm.
2424 qDebug(" %d unique blocks in BMP.", blocks.size());
2425 qDebug(" block data uses: %d bytes", bmp_block_data);
2426 qDebug(" trie data uses : %d bytes", bmp_trie);
2427 qDebug(" memory usage: %d bytes", bmp_mem);
2429 int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2430 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2431 int smp_mem = smp_block_data + smp_trie;
2432 qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2433 qDebug(" block data uses: %d bytes", smp_block_data);
2434 qDebug(" trie data uses : %d bytes", smp_trie);
2436 qDebug("\n decomposition table uses : %d bytes", decompositions.size()*2);
2437 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
2441 out += "static const unsigned short uc_decomposition_trie[] = {\n";
2443 // first write the map
// The map and the block data share one array, with the map at the front;
// that is why blockMap.size() is added to every block offset written below.
2444 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2445 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2447 if (out.endsWith(' '))
2449 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2453 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2456 if (out.endsWith(' '))
2458 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2459 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2461 if (out.endsWith(' '))
2463 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2467 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2470 if (out.endsWith(' '))
// then write the contents of every unique block
2474 for (int i = 0; i < blocks.size(); ++i) {
2475 if (out.endsWith(' '))
2478 const DecompositionBlock &b = blocks.at(i);
2479 for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2481 if (out.endsWith(' '))
2485 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2489 if (out.endsWith(' '))
// text of the generated lookup macro: the first lookup selects the block,
// the second one the entry inside the block
2493 "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2494 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2495 " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2496 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2497 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2498 " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2499 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2500 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2503 "static const unsigned short uc_decomposition_map[] = {";
// finally write the decomposition records themselves
2505 for (int i = 0; i < decompositions.size(); ++i) {
2507 if (out.endsWith(' '))
2511 out += "0x" + QByteArray::number(decompositions.at(i), 16);
2514 if (out.endsWith(' '))
2521 static QByteArray createLigatureInfo()
2523 qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
2525 foreach (const QList<Ligature> &l, ligatureHashes) {
2526 for (int j = 0; j < l.size(); ++j) {
2527 // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2528 Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2529 QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
2533 const int BMP_BLOCKSIZE = 32;
2534 const int BMP_SHIFT = 5;
2535 const int BMP_END = 0x3100;
2536 const int SMP_END = 0x12000;
2537 const int SMP_BLOCKSIZE = 256;
2538 const int SMP_SHIFT = 8;
2540 if (SMP_END <= highestLigature)
2541 qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
2543 QList<DecompositionBlock> blocks;
2544 QList<int> blockMap;
2545 QList<unsigned short> ligatures;
2550 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2551 DecompositionBlock b;
2552 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2553 int uc = block*BMP_BLOCKSIZE + i;
2554 QList<Ligature> l = ligatureHashes.value(uc);
2556 Q_ASSERT(!QChar::requiresSurrogates(uc));
2557 qSort(l); // needed for bsearch in ligatureHelper code
2559 ligatures.append(l.size());
2560 for (int j = 0; j < l.size(); ++j) {
2561 ligatures.append(l.at(j).u1);
2562 ligatures.append(l.at(j).ligature);
2564 b.decompositionPositions.append(tableIndex);
2565 tableIndex += 2*l.size() + 1;
2567 b.decompositionPositions.append(0xffff);
2570 int index = blocks.indexOf(b);
2572 index = blocks.size();
2574 used += BMP_BLOCKSIZE;
2577 blockMap.append(blocks.at(index).index);
2580 int bmp_blocks = blocks.size();
2581 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2583 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2584 DecompositionBlock b;
2585 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2586 int uc = block*SMP_BLOCKSIZE + i;
2587 QList<Ligature> l = ligatureHashes.value(uc);
2589 Q_ASSERT(QChar::requiresSurrogates(uc));
2590 qSort(l); // needed for bsearch in ligatureHelper code
2592 ligatures.append(l.size());
2593 for (int j = 0; j < l.size(); ++j) {
2594 ligatures.append(QChar::highSurrogate(l.at(j).u1));
2595 ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2596 ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2597 ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2599 b.decompositionPositions.append(tableIndex);
2600 tableIndex += 4*l.size() + 1;
2602 b.decompositionPositions.append(0xffff);
2605 int index = blocks.indexOf(b);
2607 index = blocks.size();
2609 used += SMP_BLOCKSIZE;
2612 blockMap.append(blocks.at(index).index);
2615 // if the condition below doesn't hold anymore we need to modify our composition code
2616 Q_ASSERT(tableIndex < 0xffff);
2618 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2619 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2620 int bmp_mem = bmp_block_data + bmp_trie;
2621 qDebug(" %d unique blocks in BMP.", blocks.size());
2622 qDebug(" block data uses: %d bytes", bmp_block_data);
2623 qDebug(" trie data uses : %d bytes", bmp_trie);
2624 qDebug(" memory usage: %d bytes", bmp_mem);
2626 int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
2627 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2628 int smp_mem = smp_block_data + smp_trie;
2629 qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks);
2630 qDebug(" block data uses: %d bytes", smp_block_data);
2631 qDebug(" trie data uses : %d bytes", smp_trie);
2633 qDebug("\n ligature data uses : %d bytes", ligatures.size()*2);
2634 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + ligatures.size() * 2);
2638 out += "static const unsigned short uc_ligature_trie[] = {\n";
2640 // first write the map
2641 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2642 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2644 if (out.endsWith(' '))
2646 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2650 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2653 if (out.endsWith(' '))
2655 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2656 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2658 if (out.endsWith(' '))
2660 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2664 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2667 if (out.endsWith(' '))
2671 for (int i = 0; i < blocks.size(); ++i) {
2672 if (out.endsWith(' '))
2675 const DecompositionBlock &b = blocks.at(i);
2676 for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2678 if (out.endsWith(' '))
2682 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2686 if (out.endsWith(' '))
2690 "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2691 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2692 " ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2693 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2694 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2695 " ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2696 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2697 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2700 "static const unsigned short uc_ligature_map[] = {";
2702 for (int i = 0; i < ligatures.size(); ++i) {
2704 if (out.endsWith(' '))
2708 out += "0x" + QByteArray::number(ligatures.at(i), 16);
2711 if (out.endsWith(' '))
// Builds generated-source text that declares a CasingInfo struct with three
// bit-fields: codePoint (16 bits), flags (8 bits) and offset (8 bits).
// The signature returns a QByteArray.
// NOTE(review): the declaration of out, the end of the string literal and the
// return statement are not visible in this excerpt -- confirm against the full file.
2718 QByteArray createCasingInfo()
// The struct text is written as adjacent string literals, which the compiler
// concatenates into a single literal before it is appended to out.
2722 out += "struct CasingInfo {\n"
2723 " uint codePoint : 16;\n"
2724 " uint flags : 8;\n"
2725 " uint offset : 8;\n"
// Entry point of the table generator. Going by the visible statements it:
//   1. calls the init*/read* helpers and computeUniqueProperties(),
//   2. builds each generated-source fragment as a QByteArray,
//   3. writes fragments to qunicodetables.cpp and qunicodetables_p.h under
//      ../../src/corelib/tools/ (paths are relative to the working directory),
//   4. prints the max*Diff values and decomposition/ligature statistics via qDebug.
// Both command-line parameters are unnamed and therefore unused.
// NOTE(review): this excerpt omits some lines of the function, so the complete
// call sequence and the function end cannot be confirmed from here.
2732 int main(int, char **)
2736 initDecompositionMap();
2739 initGraphemeBreak();
2741 initSentenceBreak();
2745 readBidiMirroring();
2746 readArabicShaping();
2748 readDerivedNormalizationProps();
2749 readSpecialCasing();
2753 readGraphemeBreak();
2755 readSentenceBreak();
2758 computeUniqueProperties();
// Build the generated fragments before any output file is opened.
2759 QByteArray properties = createPropertyInfo();
2760 QByteArray specialCases = createSpecialCaseMap();
2761 QByteArray compositions = createCompositionInfo();
2762 QByteArray ligatures = createLigatureInfo();
2763 QByteArray normalizationCorrections = createNormalizationCorrections();
2764 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
2765 QByteArray scriptTableDeclaration = createScriptTableDeclaration();
// License banner text as one concatenated string literal.
// NOTE(review): the variable that receives this literal is declared on a line
// not visible in this excerpt.
2768 "/****************************************************************************\n"
2770 "** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).\n"
2771 "** Contact: http://www.qt-project.org/\n"
2773 "** This file is part of the QtCore module of the Qt Toolkit.\n"
2775 "** $QT_BEGIN_LICENSE:LGPL$\n"
2776 "** GNU Lesser General Public License Usage\n"
2777 "** This file may be used under the terms of the GNU Lesser General Public\n"
2778 "** License version 2.1 as published by the Free Software Foundation and\n"
2779 "** appearing in the file LICENSE.LGPL included in the packaging of this\n"
2780 "** file. Please review the following information to ensure the GNU Lesser\n"
2781 "** General Public License version 2.1 requirements will be met:\n"
2782 "** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2784 "** In addition, as a special exception, Nokia gives you certain additional\n"
2785 "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2786 "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2788 "** GNU General Public License Usage\n"
2789 "** Alternatively, this file may be used under the terms of the GNU General\n"
2790 "** Public License version 3.0 as published by the Free Software Foundation\n"
2791 "** and appearing in the file LICENSE.GPL included in the packaging of this\n"
2792 "** file. Please review the following information to ensure the GNU General\n"
2793 "** Public License version 3.0 requirements will be met:\n"
2794 "** http://www.gnu.org/copyleft/gpl.html.\n"
2797 "** Alternatively, this file may be used in accordance with the terms and\n"
2798 "** conditions contained in a signed written agreement between you and Nokia.\n"
2805 "** $QT_END_LICENSE$\n"
2807 "****************************************************************************/\n\n";
// Banner naming the Unicode data version; DATA_VERSION_S is a string macro
// spliced between two literals.
// NOTE(review): there is no whitespace between the literals and the macro name.
// From C++11 on, an identifier directly after a string literal is lexed as a
// user-defined-literal suffix, so compilers in C++11 or later mode may reject
// this; putting spaces around the macro avoids it. Confirm the language level
// this tool is built with.
2810 "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
// Comment block warning that the generated file is not part of the Qt API.
2812 QByteArray warning =
2814 "// W A R N I N G\n"
2815 "// -------------\n"
2817 "// This file is not part of the Qt API. It exists for the convenience\n"
2818 "// of internal files. This header file may change from version to version\n"
2819 "// without notice, or even be removed.\n"
// First output file: the generated table definitions.
2824 QFile f("../../src/corelib/tools/qunicodetables.cpp");
// NOTE(review): the result of open() is discarded here, so a failure to open
// (for example when run from a different working directory) is not detected
// at this statement. Confirm whether a check exists on a line not shown.
2825 f.open(QFile::WriteOnly|QFile::Truncate);
2828 f.write("#include \"qunicodetables_p.h\"\n\n");
2829 f.write("QT_BEGIN_NAMESPACE\n\n");
2830 f.write("namespace QUnicodeTables {\n\n");
2831 f.write(properties);
2833 f.write(specialCases);
2835 f.write(compositions);
2838 f.write(normalizationCorrections);
2840 f.write(scriptTableDeclaration);
2841 f.write("} // namespace QUnicodeTables\n\n");
2842 f.write("using namespace QUnicodeTables;\n\n");
2843 f.write("QT_END_NAMESPACE\n");
// Second output file: the private header. The same QFile object is retargeted.
// NOTE(review): a close() between the two files is not visible in this
// excerpt -- confirm it exists; the open() result is again discarded.
2846 f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2847 f.open(QFile::WriteOnly | QFile::Truncate);
// Include guard and namespace opening of the generated header.
2851 f.write("#ifndef QUNICODETABLES_P_H\n"
2852 "#define QUNICODETABLES_P_H\n\n"
2853 "#include <QtCore/qchar.h>\n\n"
2854 "QT_BEGIN_NAMESPACE\n\n");
// NOTE(review): same literal-next-to-macro spelling as the version banner
// earlier in this function; the same C++11 literal-suffix concern applies.
2855 f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
2856 f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n");
2857 f.write("namespace QUnicodeTables {\n\n");
2858 f.write(property_string);
2860 f.write(scriptEnumDeclaration);
2862 f.write(grapheme_break_string);
2864 f.write(word_break_string);
2866 f.write(sentence_break_string);
2868 f.write(line_break_class_string);
2871 f.write("} // namespace QUnicodeTables\n\n"
2872 "QT_END_NAMESPACE\n\n"
2873 "#endif // QUNICODETABLES_P_H\n");
// Print the max*Diff values in hexadecimal.
2876 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff;
2877 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2878 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2879 qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2880 qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff;
// Commented-out dump() calls for selected code point ranges.
2883 // dump(0x620, 0x640);
2884 // dump(0x10000, 0x10020);
2885 // dump(0x10800, 0x10820);
// For each length 1..19, report how many times decompositionLength records it,
// accumulating the total mapped length and the number of entries.
2887 qDebug("decompositionLength used:");
2888 int totalcompositions = 0;
// NOTE(review): sum is used below but its declaration is not visible in this excerpt.
2890 for (int i = 1; i < 20; ++i) {
2891 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
2892 totalcompositions += i*decompositionLength.value(i, 0);
2893 sum += decompositionLength.value(i, 0);
// The average is computed in float; if sum is 0 the quotient is not a finite number.
2895 qDebug(" len decomposition map %d, average length %f, num composed chars %d",
2896 totalcompositions, (float)totalcompositions/(float)sum, sum);
2897 qDebug("highest composed character %x", highestComposedCharacter);
2898 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
// NOTE(review): ligatures is declared earlier in this function as a QByteArray,
// whose at(i) yields a char and has no data() member. As written this would
// not compile, so presumably this section is compiled out or was written
// against a different ligatures variable -- confirm against the full file.
2900 qBubbleSort(ligatures);
2901 for (int i = 0; i < ligatures.size(); ++i)
2902 qDebug("%s", ligatures.at(i).data());
// Commented-out report of combining-class usage counts.
2904 // qDebug("combiningClass usage:");
2905 // int numClasses = 0;
2906 // for (int i = 0; i < 255; ++i) {
2907 // int num = combiningClassUsage.value(i, 0);
2910 // qDebug(" combiningClass %d used %d times", i, num);
2913 // qDebug("total of %d combining classes used", numClasses);